def check_maximum_node_pos():
    # Function for investigating, in step 4, which condition yields the best value
    resolution = 100
    node_pos, input_nodes, input_vectors,\
        output_nodes, output_vectors, frozen_nodes,\
        edges_indices, edges_thickness, frozen_nodes = easy_dev()
    env = BarFemGym(node_pos, input_nodes, input_vectors,
                    output_nodes, output_vectors, frozen_nodes,
                    edges_indices, edges_thickness, frozen_nodes)
    rewards = np.zeros((resolution, resolution))
    max_reward = 0
    max_x = None
    max_y = None
    for ix, x in tqdm(enumerate(np.linspace(0, 1, resolution))):
        for iy, y in enumerate(np.linspace(1, 0, resolution)):
            env.reset()
            action = {}
            action['which_node'] = np.array([0, 4])
            action['end'] = 0
            action['edge_thickness'] = np.array([1])
            action['new_node'] = np.array([[x, y]])
            next_nodes_pos, _, done, _ = env.step(action)
            action = {}
            action['which_node'] = np.array([2, 4])
            action['end'] = 0
            action['edge_thickness'] = np.array([1])
            action['new_node'] = np.array([[0, 1]])
            next_nodes_pos, _, done, _ = env.step(action)
            action = {}
            action['which_node'] = np.array([3, 4])
            action['end'] = 0
            action['edge_thickness'] = np.array([1])
            action['new_node'] = np.array([[0, 1]])
            next_nodes_pos, _, done, _ = env.step(action)
            reward = env.calculate_simulation()
            if max_reward < reward:
                max_reward = reward
                max_x = x
                max_y = y
            rewards[iy, ix] = reward

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    im = plt.imshow(rewards, extent=(0, 1, 0, 1))
    plt.colorbar(im)
    ax.set_xlabel(r"x", fontsize=20)
    ax.set_ylabel(r"y", fontsize=20)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)
    plt.savefig("distribution.png")
    print(max_x, max_y)
    print(max_reward)
def confirm_max_status():
    """Find the state that gives the maximum value."""
    max_reward = 0
    x = 1000
    for i in np.arange(0.1, 1.001, 0.001):
        node_pos, input_nodes, input_vectors,\
            output_nodes, output_vectors, frozen_nodes,\
            edges_indices, edges_thickness, frozen_nodes = easy_dev()
        env = BarFemGym(node_pos, input_nodes, input_vectors,
                        output_nodes, output_vectors, frozen_nodes,
                        edges_indices, edges_thickness, frozen_nodes)
        env.reset()
        action = {}
        action['which_node'] = np.array([0, 3])
        action['end'] = 0
        action['edge_thickness'] = np.array([i])
        action['new_node'] = np.array([[0, 2]])
        next_nodes_pos, _, done, _ = env.step(action)
        reward = env.calculate_simulation()
        if max_reward < reward:
            max_reward = reward
            x = i
    print("Maximum:", x, max_reward)
def actor_gcn_critic_gcn(max_episodes=5000, test_name="test", log_file=False, save_pth=False):
    """Run Actor-Critic. Both the Actor and the Critic are GCNs.
    The Actor selects node 1 and node 2, i.e. it can choose only a single edge.

    max_episodes: number of training episodes
    test_name: name of the directory the results are saved to
    log_file: if True, losses and other information are logged to progress.txt."""
    history = {}
    history['epoch'] = []
    history['result_efficiency'] = []
    history['mean_efficiency'] = []  # stores the efficiency obtained when a takes the value a_mean
    history['a'] = []
    history['a_mean'] = []
    history['a_sigma'] = []
    history['advantage'] = []
    history['critic_value'] = []

    log_dir = "confirm/step3/a_gcn_c_gcn_results/{}".format(test_name)
    assert not os.path.exists(log_dir), "already folder exists"
    if log_file:
        log_file = log_dir
    else:
        log_file = None
    os.makedirs(log_dir, exist_ok=True)

    node_pos, input_nodes, input_vectors,\
        output_nodes, output_vectors, frozen_nodes,\
        edges_indices, edges_thickness, frozen_nodes = easy_dev()
    env = BarFemGym(node_pos, input_nodes, input_vectors,
                    output_nodes, output_vectors, frozen_nodes,
                    edges_indices, edges_thickness, frozen_nodes)
    env.reset()

    max_steps = 1
    lr_actor = 1e-4
    lr_critic = 1e-3
    weight_decay = 1e-2
    gamma = 0.99
    device = torch.device('cpu')

    actorNet = Select_node1_model(2, 1, 400, 400).to(device).double()
    actorNet2 = Select_node2_model(400 + 2, 400).to(device).double()
    criticNet = CriticNetwork_GCN(2, 1, 400, 400).to(device).double()
    edgethickNet = Edgethick_Actor(400).to(device).double()
    optimizer_actor = optim.Adam(actorNet.parameters(), lr=lr_actor)
    optimizer_actor2 = optim.Adam(actorNet2.parameters(), lr=lr_actor)
    optimizer_edgethick = optim.Adam(edgethickNet.parameters(), lr=lr_actor)
    optimizer_critic = optim.Adam(criticNet.parameters(), lr=lr_critic,
                                  weight_decay=weight_decay)

    for episode in tqdm(range(max_episodes)):
        if log_file:
            with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
                print('\nepoch:', episode, file=f)

        env.reset()
        nodes_pos, edges_indices, edges_thickness, node_adj = env.extract_node_edge_info()
        for step in range(max_steps):
            action = select_action_gcn_critic_gcn(env, actorNet, actorNet2,
                                                  criticNet, edgethickNet,
                                                  device, log_dir=log_file)
            next_nodes_pos, _, done, _ = env.step(action)
            reward = env.calculate_simulation()
            criticNet.rewards.append(reward)

        loss = finish_episode(criticNet, actorNet, actorNet2, edgethickNet,
                              optimizer_critic, optimizer_actor, optimizer_actor2,
                              optimizer_edgethick, gamma, log_dir=log_file)

        history['epoch'].append(episode + 1)
        history['result_efficiency'].append(reward)

        if episode % 100 == 0:
            if save_pth:
                save_model(criticNet, edgethickNet,
                           os.path.join(log_dir, "pth"), save_name=str(episode))

    env.close()
    plot_efficiency_history(history, os.path.join(log_dir, 'learning_effi_curve.png'))

    return history
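# Usage sketch for the training entry point above (illustrative; the argument
# values and the directory name are assumptions, not part of the original file):
#     history = actor_gcn_critic_gcn(max_episodes=5000, test_name="trial01",
#                                    log_file=True, save_pth=True)
#     # results, checkpoints and progress.txt are then written under
#     # confirm/step3/a_gcn_c_gcn_results/trial01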
def main():
    # running_reward = 0
    prior_efficiency = 0
    continuous_trigger = 0
    best_efficiency = -1000
    best_epoch = 0

    # build a connected initial graph for the first episode
    while (1):
        new_node_pos, new_input_nodes, new_input_vectors, new_output_nodes, new_output_vectors, new_frozen_nodes, new_edges_indices, new_edges_thickness = make_continuous_init_graph(
            origin_nodes_positions, origin_edges_indices, origin_input_nodes,
            origin_input_vectors, origin_output_nodes, origin_output_vectors,
            origin_frozen_nodes, EDGE_THICKNESS)
        env = BarFemGym(new_node_pos, new_input_nodes, new_input_vectors,
                        new_output_nodes, new_output_vectors, new_frozen_nodes,
                        new_edges_indices, new_edges_thickness)
        env.reset()
        if env.confirm_graph_is_connected():
            break
    nodes_pos, _, _, _ = env.extract_node_edge_info()
    first_node_num = nodes_pos.shape[0]

    # run the training episodes
    for epoch in tqdm(range(train_num)):
        # for epoch in count(1):

        # reset environment and episode reward
        while (1):
            new_node_pos, new_input_nodes, new_input_vectors, new_output_nodes, new_output_vectors, new_frozen_nodes, new_edges_indices, new_edges_thickness = make_continuous_init_graph(
                origin_nodes_positions, origin_edges_indices, origin_input_nodes,
                origin_input_vectors, origin_output_nodes, origin_output_vectors,
                origin_frozen_nodes, EDGE_THICKNESS)
            env = BarFemGym(new_node_pos, new_input_nodes, new_input_vectors,
                            new_output_nodes, new_output_vectors, new_frozen_nodes,
                            new_edges_indices, new_edges_thickness)
            env.reset()
            if env.confirm_graph_is_connected():
                break
        state = env.reset()
        ep_reward = 0
        continuous_trigger = 0

        # for each episode, only run max_action steps so that we don't
        # infinite loop while learning
        for t in range(max_action):
            # select action from policy
            action = select_action(first_node_num)
            nodes_pos, edges_indices, edges_thickness, adj = env.extract_node_edge_info()

            # take the action
            state, _, done, info = env.step(action)

            if (t == (max_action - 1)) and (done is not True):
                # the episode did not finish within max_action steps
                reward = -final_penalty
            elif env.confirm_graph_is_connected():
                efficiency = env.calculate_simulation()
                if continuous_trigger == 1:
                    reward = efficiency - prior_efficiency
                else:
                    reward = efficiency + continuous_reward
                    continuous_trigger = 1
                prior_efficiency = efficiency
            elif continuous_trigger == 1:
                reward = -penalty
            else:
                reward = 0

            GCN.rewards.append(reward)
            ep_reward += reward
            if done:
                steps = t
                break

        # update cumulative reward
        # running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # perform backprop
        loss = finish_episode()

        # compute the final efficiency
        if env.confirm_graph_is_connected():
            result_efficiency = env.calculate_simulation()
        else:
            result_efficiency = -1
        if best_efficiency < result_efficiency:
            best_epoch = epoch
            best_efficiency = result_efficiency
            save_model(save_name="Good")

        # env.render(os.path.join(
        #     log_dir, 'render_image/{}.png'.format(epoch+1)))

        history['epoch'].append(epoch + 1)
        history['loss'].append(loss)
        history['ep_reward'].append(ep_reward)
        history['result_efficiency'].append(result_efficiency)
        history['steps'].append(steps + 1)

        # save the training history
        with open(os.path.join(log_dir, 'history.pkl'), 'wb') as f:
            pickle.dump(history, f)

        with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
            f.writelines('epoch %d, loss: %.4f ep_reward: %.4f result_efficiency: %.4f\n' %
                         (epoch + 1, loss, ep_reward, result_efficiency))

        with open(os.path.join(log_dir, "represent_value.txt"), mode='w') as f:
            f.writelines('epoch %d, best_efficiency: %.4f\n' %
                         (best_epoch + 1, best_efficiency))

    save_model(save_name="Last")

    plot_loss_history(history, os.path.join(log_dir, 'learning_loss_curve.png'))
    plot_reward_history(history, os.path.join(log_dir, 'learning_reward_curve.png'))
    plot_efficiency_history(history, os.path.join(log_dir, 'learning_effi_curve.png'))
    plot_steps_history(history, os.path.join(log_dir, 'learning_steps_curve.png'))
def select_action_gcn_critic_gcn(env, criticNet, edgethickNet, device,
                                 log_dir=None, history=None):
    nodes_pos, edges_indices, edges_thickness, node_adj = env.extract_node_edge_info()
    node, edge, node_adj, edge_adj, D_v, D_e, T = make_torch_type_for_GCN(
        nodes_pos, edges_indices, edges_thickness, node_adj)
    state_value = criticNet(node, edge, node_adj, edge_adj, D_v, D_e, T)
    node1 = 0
    node2 = 3
    edge_thickness = edgethickNet(node, edge, node_adj, edge_adj, D_v, D_e, T,
                                  node1, node2)
    edge_thickness_tdist = tdist.Normal(edge_thickness[0][0].item(),
                                        edge_thickness[0][1].item())
    edge_thickness_action = edge_thickness_tdist.sample()
    edge_thickness_action = torch.clamp(edge_thickness_action, min=0, max=1)

    action = {}
    action['which_node'] = np.array([node1, node2])
    action['end'] = 0
    action['edge_thickness'] = np.array([edge_thickness_action.item()])
    action['new_node'] = np.array([[0, 2]])

    # save to action buffer
    criticNet.saved_actions.append(Saved_Action(action, state_value))
    edgethickNet.saved_actions.append(
        Saved_mean_std_Action(edge_thickness[0][0], edge_thickness[0][1]))

    if log_dir is not None:  # quantities logged to inspect the loss
        with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
            print('edge_thick_mean:', edge_thickness[0][0].item(), file=f)
            print('edge_thick_std:', edge_thickness[0][1].item(), file=f)
            print('edge_thickness:', edge_thickness_action.item(), file=f)

    if history is not None:
        node_pos, input_nodes, input_vectors,\
            output_nodes, output_vectors, frozen_nodes,\
            edges_indices, edges_thickness, frozen_nodes = easy_dev()
        calc_effi_env = BarFemGym(node_pos, input_nodes, input_vectors,
                                  output_nodes, output_vectors, frozen_nodes,
                                  edges_indices, edges_thickness, frozen_nodes)
        calc_effi_env.reset()
        mean_action = {}
        mean_action['which_node'] = np.array([node1, node2])
        mean_action['end'] = 0
        mean_action['edge_thickness'] = np.array([edge_thickness[0][0].item()])
        mean_action['new_node'] = np.array([[0, 2]])
        next_nodes_pos, _, done, _ = calc_effi_env.step(mean_action)
        mean_efficiency = calc_effi_env.calculate_simulation(mode='force')

        # record the values in the history
        history['mean_efficiency'].append(mean_efficiency)
        history['a'].append(edge_thickness_action.item())
        history['a_mean'].append(edge_thickness[0][0].item())
        history['a_sigma'].append(edge_thickness[0][1].item())
        history['critic_value'].append(state_value.item())

    return action
def select_action_gcn_critic_gcn(env, node1Net, node2Net, criticNet,
                                 edgethickNet, device, log_dir=None, history=None):
    nodes_pos, edges_indices, edges_thickness, node_adj = env.extract_node_edge_info()
    node_num = nodes_pos.shape[0]
    node, edge, node_adj, edge_adj, D_v, D_e, T = make_torch_type_for_GCN(
        nodes_pos, edges_indices, edges_thickness, node_adj)
    state_value = criticNet(node, edge, node_adj, edge_adj, D_v, D_e, T)

    # choose node 1
    emb_node, node1_prob = node1Net(node, edge, node_adj, edge_adj, D_v, D_e, T)
    node1_categ = Categorical(node1_prob)
    node1 = node1_categ.sample()

    # build the node tensor with node 1 removed
    non_node1_node = torch.cat([node[:, 0:node1, :], node[:, node1 + 1:, :]], 1)

    # extract the embedding of node 1
    H1 = emb_node[0][node1]
    H1_cat = H1.repeat(node_num - 1, 1)
    H1_cat = H1_cat.unsqueeze(0)

    # concatenate the remaining node features with the node 1 embedding
    emb_graph_cat = torch.cat([non_node1_node, H1_cat], 2)

    # choose node 2
    emb_edge, node2_prob = node2Net(emb_graph_cat)
    node2_categ = Categorical(node2_prob)
    node2_temp = node2_categ.sample()
    if node2_temp >= node1:
        node2 = node2_temp + 1  # compensate for the removed node 1
    else:
        node2 = node2_temp

    edge_thickness = edgethickNet(emb_edge)
    edge_thickness_tdist = tdist.Normal(edge_thickness[0][0].item(),
                                        edge_thickness[0][1].item())
    edge_thickness_action = edge_thickness_tdist.sample()
    edge_thickness_action = torch.clamp(edge_thickness_action, min=0, max=1)

    action = {}
    action['which_node'] = np.array([node1.item(), node2.item()])
    action['end'] = 0
    action['edge_thickness'] = np.array([edge_thickness_action.item()])
    action['new_node'] = np.array([[0, 2]])

    # save to action buffer
    criticNet.saved_actions.append(Saved_Action(action, state_value))
    node1Net.saved_actions.append(Saved_prob_Action(node1_categ.log_prob(node1)))
    node2Net.saved_actions.append(Saved_prob_Action(node2_categ.log_prob(node2_temp)))
    edgethickNet.saved_actions.append(
        Saved_mean_std_Action(edge_thickness[0][0], edge_thickness[0][1]))

    if log_dir is not None:  # quantities logged to inspect the loss
        with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
            print('node1_prob:', node1_prob, file=f)
            print('node1:', node1.item(), file=f)
            print('node2_prob:', node2_prob, file=f)
            print('node2:', node2.item(), file=f)
            print('edge_thick_mean:', edge_thickness[0][0].item(), file=f)
            print('edge_thick_std:', edge_thickness[0][1].item(), file=f)
            print('edge_thickness:', edge_thickness_action.item(), file=f)

    if history is not None:
        node_pos, input_nodes, input_vectors,\
            output_nodes, output_vectors, frozen_nodes,\
            edges_indices, edges_thickness, frozen_nodes = easy_dev()
        calc_effi_env = BarFemGym(node_pos, input_nodes, input_vectors,
                                  output_nodes, output_vectors, frozen_nodes,
                                  edges_indices, edges_thickness, frozen_nodes)
        calc_effi_env.reset()
        mean_action = {}
        mean_action['which_node'] = np.array([node1.item(), node2.item()])
        mean_action['end'] = 0
        mean_action['edge_thickness'] = np.array([edge_thickness[0][0].item()])
        mean_action['new_node'] = np.array([[0, 2]])
        next_nodes_pos, _, done, _ = calc_effi_env.step(mean_action)
        mean_efficiency = calc_effi_env.calculate_simulation()

        # record the values in the history
        history['mean_efficiency'].append(mean_efficiency)
        history['a'].append(edge_thickness_action.item())
        history['a_mean'].append(edge_thickness[0][0].item())
        history['a_sigma'].append(edge_thickness[0][1].item())
        history['critic_value'].append(state_value.item())

    return action
def load_actor_gcn_critic_gcn(load_dir, load_epoch, max_episodes=5000,
                              test_name="test", history=None, log_file=False):
    """Load pth files saved during Actor-Critic training and resume training from them.

    Args:
        load_dir ([type]): path to the directory containing the pth files to load.
        load_epoch ([type]): epoch from which training is resumed.
        max_episodes (int, optional): number of training episodes. Defaults to 5000.
        test_name (str, optional): name of the directory the results are saved to. Defaults to "test".
        history ([type], optional): previously saved history. If given, the loaded results are also reflected in the plots. Defaults to None.
        log_file (bool, optional): if True, losses and other information are logged to progress.txt. Defaults to False.
    """
    if history is None:
        history = {}
        history['epoch'] = []
        history['result_efficiency'] = []
        history['mean_efficiency'] = []  # stores the efficiency obtained when a takes the value a_mean
        history['a'] = []
        history['a_mean'] = []
        history['a_sigma'] = []
        history['advantage'] = []
        history['critic_value'] = []
    else:
        for key in history.keys():
            history[key] = history[key][:load_epoch]

    log_dir = "confirm/step5_entropy/a_gcn_c_gcn_results/{}".format(test_name)
    assert not os.path.exists(log_dir), "already folder exists"
    if log_file:
        log_file = log_dir
    else:
        log_file = None
    os.makedirs(log_dir, exist_ok=True)

    node_pos, input_nodes, input_vectors,\
        output_nodes, output_vectors, frozen_nodes,\
        edges_indices, edges_thickness, frozen_nodes = easy_dev()
    env = BarFemGym(node_pos, input_nodes, input_vectors,
                    output_nodes, output_vectors, frozen_nodes,
                    edges_indices, edges_thickness, frozen_nodes)
    env.reset()

    max_steps = 1
    lr_actor = 1e-4
    lr_critic = 1e-3
    weight_decay = 1e-2
    gamma = 0.99
    device = torch.device('cpu')

    criticNet = CriticNetwork_GCN(2, 1, 400, 400).to(device).double()
    edgethickNet = Edgethick_Actor(2, 1, 400, 400).to(device).double()
    criticNet.load_state_dict(
        torch.load(os.path.join(load_dir, "pth/{}_criticNet.pth".format(load_epoch))))
    edgethickNet.load_state_dict(
        torch.load(os.path.join(load_dir, "pth/{}_edgethickNet.pth".format(load_epoch))))
    optimizer_edgethick = optim.SGD(edgethickNet.parameters(), lr=lr_actor)
    optimizer_critic = optim.Adam(criticNet.parameters(), lr=lr_critic,
                                  weight_decay=weight_decay)

    for episode in tqdm(range(load_epoch, max_episodes)):
        if log_file:
            with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
                print('\nepoch:', episode, file=f)

        env.reset()
        nodes_pos, edges_indices, edges_thickness, node_adj = env.extract_node_edge_info()
        for step in range(max_steps):
            action = select_action_gcn_critic_gcn(env, criticNet, edgethickNet,
                                                  device, log_dir=log_file,
                                                  history=history)
            next_nodes_pos, _, done, _ = env.step(action)
            reward = env.calculate_simulation(mode='force')
            criticNet.rewards.append(reward)

        loss = finish_episode(criticNet, edgethickNet, optimizer_critic,
                              optimizer_edgethick, gamma, log_dir=log_file,
                              history=history)

        history['epoch'].append(episode + 1)
        history['result_efficiency'].append(reward)

    env.close()
    plot_efficiency_history(history, os.path.join(log_dir, 'learning_effi_curve.png'))

    return history
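# Usage sketch for resuming training (illustrative; the run name "trial01" and
# the resume epoch are assumptions, not part of the original file):
#     with open("confirm/step5_entropy/a_gcn_c_gcn_results/trial01/history.pkl", "rb") as f:
#         old_history = pickle.load(f)
#     history = load_actor_gcn_critic_gcn(
#         "confirm/step5_entropy/a_gcn_c_gcn_results/trial01", 1000,
#         max_episodes=5000, test_name="trial01_resume",
#         history=old_history, log_file=True)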
def actor_gcn_critic_gcn(max_episodes=5000, test_name="test", log_file=False, save_pth=False):
    """Run Actor-Critic. Both the Actor and the Critic are GCNs.
    The Actor can only choose the thickness of a single edge.

    max_episodes: number of training episodes
    test_name: name of the directory the results are saved to
    log_file: if True, losses and other information are logged to progress.txt."""
    history = {}
    history['epoch'] = []
    history['result_efficiency'] = []
    history['x'] = []
    history['x_mean'] = []
    history['x_sigma'] = []
    history['y'] = []
    history['y_mean'] = []
    history['y_sigma'] = []
    history['advantage'] = []
    history['critic_value'] = []

    log_dir = "confirm/step5_entropy/a_gcn_c_gcn_results/{}".format(test_name)
    assert not os.path.exists(log_dir), "already folder exists"
    if log_file:
        log_file = log_dir
    else:
        log_file = None
    os.makedirs(log_dir, exist_ok=True)

    node_pos, input_nodes, input_vectors,\
        output_nodes, output_vectors, frozen_nodes,\
        edges_indices, edges_thickness, frozen_nodes = easy_dev()
    env = BarFemGym(node_pos, input_nodes, input_vectors,
                    output_nodes, output_vectors, frozen_nodes,
                    edges_indices, edges_thickness, frozen_nodes)
    env.reset()

    lr_actor = 1e-4
    lr_critic = 1e-3
    weight_decay = 1e-2
    gamma = 0.99
    device = torch.device('cpu')

    criticNet = CriticNetwork_GCN(2, 1, 400, 400).to(device).double()
    x_y_Net = X_Y_Actor(2, 1, 400, 400).to(device).double()
    node1Net = Select_node1_model(2, 1, 400, 400).to(device).double()
    node2Net = Select_node2_model(400 + 2, 400).to(device).double()  # the 400 in 400 + 2 corresponds to the input3 part of Select_node1_model
    optimizer_node1 = optim.Adam(node1Net.parameters(), lr=lr_actor)
    optimizer_node2 = optim.Adam(node2Net.parameters(), lr=lr_actor)
    optimizer_xy = optim.Adam(x_y_Net.parameters(), lr=lr_actor)
    optimizer_critic = optim.Adam(criticNet.parameters(), lr=lr_critic,
                                  weight_decay=weight_decay)

    for episode in tqdm(range(max_episodes)):
        if log_file:
            with open(os.path.join(log_dir, "progress.txt"), mode='a') as f:
                print('\nepoch:', episode, file=f)

        env = BarFemGym(node_pos, input_nodes, input_vectors,
                        output_nodes, output_vectors, frozen_nodes,
                        edges_indices, edges_thickness, frozen_nodes)
        env.reset()
        nodes_pos, edges_indices, edges_thickness, node_adj = env.extract_node_edge_info()
        action = select_action_gcn_critic_gcn(env, criticNet, node1Net, node2Net,
                                              x_y_Net, device, log_dir=log_file,
                                              history=history)
        next_nodes_pos, _, done, _ = env.step(action)
        if 4 in action['which_node']:
            env.input_nodes = [2, 4]
            env.input_vectors = np.array([[1, 0], [0, 1]])
        if 2 in action['which_node'] and 4 in action['which_node']:
            # TODO train the policy not to select [2, 4]
            reward = np.array([0])
        else:
            reward = env.calculate_simulation()
        criticNet.rewards.append(reward)

        loss = finish_episode(criticNet, x_y_Net, node1Net, node2Net,
                              optimizer_critic, optimizer_xy, optimizer_node1,
                              optimizer_node2, gamma, log_dir=log_file,
                              history=history)

        history['epoch'].append(episode + 1)
        history['result_efficiency'].append(reward)
        plot_efficiency_history(history, os.path.join(log_dir, 'learning_effi_curve.png'))

        if episode % 100 == 0:
            if save_pth:
                save_model(criticNet, x_y_Net, os.path.join(log_dir, "pth"),
                           save_name=str(episode))

    env.close()

    with open(os.path.join(log_dir, 'history.pkl'), 'wb') as f:
        pickle.dump(history, f)

    return history
    origin_input_vectors, origin_output_nodes, origin_output_vectors,
    origin_frozen_nodes)
env = BarFemGym(new_node_pos, new_input_nodes, new_input_vectors,
                new_output_nodes, new_output_vectors, new_frozen_nodes,
                new_edges_indices, new_edges_thickness)

# loop over one episode
state = env.reset()
total_time = 0
total_calc_time = 0
for i in range(500):
    # sample a random action
    action = env.random_action()

    # execute one step
    state, reward, done, info = env.step(action)

    if env.confirm_graph_is_connected():
        reward = 0
        start = time.time()
        efficiency = env.calculate_simulation()
        elapsed_time = time.time() - start
        print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
        reward = efficiency
        total_time += elapsed_time
        total_calc_time += 1
    else:
        reward = -1

print("average time per simulation:", total_time / total_calc_time)
class Worker(mp.Process):
    def __init__(self, global_criticNet, global_x_y_Net, global_node1Net,
                 global_node2Net, Critic_opt, x_y_opt, Node1_opt, Node2_opt,
                 global_ep, global_ep_r, res_queue, name, gamma=0.99,
                 total_episodes=5000):
        super(Worker, self).__init__()
        self.name = 'w%i' % name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.Critic_opt, self.x_y_opt, self.Node1_opt, self.Node2_opt = Critic_opt, x_y_opt, Node1_opt, Node2_opt
        self.global_criticNet, self.global_x_y_Net,\
            self.global_node1Net, self.global_node2Net = global_criticNet, global_x_y_Net, global_node1Net, global_node2Net
        device = torch.device('cpu')
        self.local_criticNet = CriticNetwork_GCN(2, 1, 400, 400).double().to(device)
        self.local_x_y_Net = X_Y_Actor(2, 1, 400, 400).double().to(device)
        self.local_node1Net = Select_node1_model(2, 1, 400, 400).double().to(device)
        self.local_node2Net = Select_node2_model(400 + 2, 400).double().to(device)
        node_pos, input_nodes, input_vectors,\
            output_nodes, output_vectors, frozen_nodes,\
            edges_indices, edges_thickness, frozen_nodes = easy_dev()
        self.env = BarFemGym(node_pos, input_nodes, input_vectors,
                             output_nodes, output_vectors, frozen_nodes,
                             edges_indices, edges_thickness, frozen_nodes)
        self.env.reset()
        self.gamma = gamma  # reward discount factor
        self.total_episodes = total_episodes  # total number of epochs over all processes

    def finish_episode(self, log_dir=None, history=None):
        R = 0
        GCN_saved_actions = self.local_criticNet.saved_actions
        x_y_saved_actions = self.local_x_y_Net.saved_actions
        node1Net_saved_actions = self.local_node1Net.saved_actions
        node2Net_saved_actions = self.local_node2Net.saved_actions
        policy_losses = []  # list to save actor (policy) loss
        value_losses = []  # list to save critic (value) loss
        returns = []  # list to save the true values

        # calculate the true value using rewards returned from the environment
        for r in self.local_criticNet.rewards[::-1]:
            # calculate the discounted value
            R = r + self.gamma * R
            returns.insert(0, R)

        returns = torch.tensor(returns)

        x_y_opt_trigger = False  # trigger that enables the x_y_opt update only when advantage > 0
        for (action, value), (x_y_mean, x_y_std, x_dist, y_dist), (node1_prob, node1_dist), (node2_prob, node2_dist), R in zip(
                GCN_saved_actions, x_y_saved_actions, node1Net_saved_actions,
                node2Net_saved_actions, returns):
            advantage = R - value.item()

            # calculate actor (policy) loss
            if action["end"]:
                print("okasii")
            else:
                log_probs = torch.cat([node1_prob, node2_prob])
                policy_loss = -torch.mean(log_probs) * advantage
                policy_losses.append(policy_loss)
                if advantage > 0:
                    x_y_mean_loss = F.l1_loss(
                        torch.from_numpy(action["new_node"][0]).double(),
                        x_y_mean.double())
                    x_y_var_loss = F.l1_loss(
                        torch.from_numpy(
                            np.abs(action["new_node"][0] -
                                   x_y_mean.to('cpu').detach().numpy().copy())),
                        x_y_std.double())
                    policy_losses.append((x_y_mean_loss + x_y_var_loss) * advantage)
                    x_y_opt_trigger = True  # enable the x_y_opt update
                else:
                    x_y_mean_loss = torch.zeros(1)
                    x_y_var_loss = torch.zeros(1)

            # calculate critic (value) loss using L1 loss
            value_losses.append(F.l1_loss(value.double(), torch.tensor([[R]]).double()))

        # reset gradients
        self.Critic_opt.zero_grad()
        self.Node1_opt.zero_grad()
        self.Node2_opt.zero_grad()
        if x_y_opt_trigger:
            self.x_y_opt.zero_grad()

        # sum up all the values of policy_losses and value_losses
        if len(policy_losses) == 0:
            loss = torch.stack(value_losses).sum()
        else:
            loss = torch.stack(policy_losses).sum() + \
                torch.stack(value_losses).sum()

        # perform backprop and push local gradients to the global networks
        loss.backward()
        for lp, gp in zip(self.local_criticNet.parameters(),
                          self.global_criticNet.parameters()):
            gp._grad = lp.grad
        for lp, gp in zip(self.local_node1Net.parameters(),
                          self.global_node1Net.parameters()):
            gp._grad = lp.grad
        for lp, gp in zip(self.local_node2Net.parameters(),
                          self.global_node2Net.parameters()):
            gp._grad = lp.grad
        if x_y_opt_trigger:
            for lp, gp in zip(self.local_x_y_Net.parameters(),
                              self.global_x_y_Net.parameters()):
                gp._grad = lp.grad
        self.Critic_opt.step()
        self.Node1_opt.step()
        self.Node2_opt.step()
        if x_y_opt_trigger:
            self.x_y_opt.step()

        # pull global parameters
        self.local_criticNet.load_state_dict(self.global_criticNet.state_dict())
        self.local_x_y_Net.load_state_dict(self.global_x_y_Net.state_dict())
        self.local_node1Net.load_state_dict(self.global_node1Net.state_dict())
        self.local_node2Net.load_state_dict(self.global_node2Net.state_dict())

        # reset rewards and action buffer
        del self.local_criticNet.rewards[:]
        del self.local_criticNet.saved_actions[:]
        del self.local_x_y_Net.saved_actions[:]
        del self.local_node1Net.saved_actions[:]
        del self.local_node2Net.saved_actions[:]

        if history is not None:
            history['advantage'].append(advantage.item())

        return loss.item()

    def run(self, max_episodes=5000, test_name="test", log_file=False,
            save_pth=False, history=None, device=torch.device('cpu')):
        while self.g_ep.value < self.total_episodes:
            # recreate the environment here because the input nodes are reassigned below
            node_pos, input_nodes, input_vectors,\
                output_nodes, output_vectors, frozen_nodes,\
                edges_indices, edges_thickness, frozen_nodes = easy_dev()
            self.env = BarFemGym(node_pos, input_nodes, input_vectors,
                                 output_nodes, output_vectors, frozen_nodes,
                                 edges_indices, edges_thickness, frozen_nodes)
            self.env.reset()
            for episode in range(max_episodes):
                action = select_action_gcn_critic_gcn(self.env, self.local_criticNet,
                                                      self.local_node1Net,
                                                      self.local_node2Net,
                                                      self.local_x_y_Net, device,
                                                      log_dir=log_file,
                                                      history=history)
                next_nodes_pos, _, done, _ = self.env.step(action)
                if 4 in action['which_node']:
                    self.env.input_nodes = [2, 4]
                    self.env.input_vectors = np.array([[1, 0], [0, 1]])
                if 2 in action['which_node'] and 4 in action['which_node']:
                    # TODO train the policy not to select [2, 4]
                    reward = np.array([0])
                else:
                    reward = self.env.calculate_simulation()
                self.local_criticNet.rewards.append(reward)
                done = True  # only one step is taken in this setting
                if done:
                    # update global and assign to local net
                    record(self.g_ep, self.g_ep_r, reward, self.res_queue, self.name)
                    # sync: push this process's gradients to the global nets and
                    # pull the updated weights back into the local nets
                    self.finish_episode()
                    break
        self.res_queue.put(None)
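# Wiring sketch for the Worker class above (illustrative only; the optimizer
# choice, worker count, and the use of share_memory() are assumptions, not
# taken from the original file):
#
#     global_criticNet = CriticNetwork_GCN(2, 1, 400, 400).double()
#     global_x_y_Net = X_Y_Actor(2, 1, 400, 400).double()
#     global_node1Net = Select_node1_model(2, 1, 400, 400).double()
#     global_node2Net = Select_node2_model(400 + 2, 400).double()
#     for net in (global_criticNet, global_x_y_Net, global_node1Net, global_node2Net):
#         net.share_memory()  # share the global weights across worker processes
#
#     Critic_opt = optim.Adam(global_criticNet.parameters(), lr=1e-3)
#     x_y_opt = optim.Adam(global_x_y_Net.parameters(), lr=1e-4)
#     Node1_opt = optim.Adam(global_node1Net.parameters(), lr=1e-4)
#     Node2_opt = optim.Adam(global_node2Net.parameters(), lr=1e-4)
#
#     global_ep, global_ep_r = mp.Value('i', 0), mp.Value('d', 0.0)
#     res_queue = mp.Queue()
#     workers = [Worker(global_criticNet, global_x_y_Net, global_node1Net,
#                       global_node2Net, Critic_opt, x_y_opt, Node1_opt, Node2_opt,
#                       global_ep, global_ep_r, res_queue, i)
#                for i in range(mp.cpu_count())]
#     for w in workers:
#         w.start()
#     results = []
#     while True:
#         r = res_queue.get()
#         if r is None:
#             break
#         results.append(r)
#     for w in workers:
#         w.join()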