def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
        Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom',
        edge_label='bond_type', connected=False):
    """Refine median-graph candidates iteratively, keeping all tied "best" graphs.

    Every candidate in ``Gn_candidate`` with the smallest distance to
    ``Gn_median`` is used as a starting point. In each iteration the node
    labels (or node attributes) and the edges of the current graphs are
    re-estimated from the node maps to the median set; a node may also be
    deleted. Whenever several node labels tie, one new graph is generated per
    tied label.

    Parameters
    ----------
    Gn_median : list of NetworkX graph
        Graphs whose median is searched for.
    Gn_candidate : list of NetworkX graph
        Initial candidates for the median graph.
    c_ei, c_er, c_es : number
        Constants used in the edge update thresholds (presumably the edit
        costs of edge insertion, removal and substitution -- TODO confirm).
    node_label : string
        Node attribute used as (symbolic) label.
    edge_label : string
        Edge attribute used as label.
    connected : boolean
        If True, disconnected result graphs are discarded, unless that would
        leave no graph at all.

    Return
    ------
    G_min_list : list of NetworkX graph
        The generated graphs with the smallest distance.
    dis_min : number
        That smallest distance.
    """
    node_ir = np.inf  # corresponding to the node remove and insertion.
    label_r = 'thanksdanny'  # the label for node remove. # @todo: make this label unrepeatable.
    ds_attrs = get_dataset_attributes(
        Gn_median + Gn_candidate,
        attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
        edge_label=edge_label)
    ite_max = 50
    epsilon = 0.001
    n_median = len(Gn_median)

    def indices_equal(values, target):
        """Return the positions of ``values`` that are equal to ``target``."""
        return np.argwhere(np.asarray(values) == target).flatten().tolist()

    def generate_graph(G, pi_p_forward, label_set):
        """Generate all "best" graphs derived from G in one iteration."""
        G_new_list = [G.copy()]  # all "best" graphs generated in this iteration.

        # update vertex labels.
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for ndi, nd in enumerate(G.nodes()):
                # image of this node in every median graph (node_ir = removed).
                images = [pi_p_forward[idx][ndi] for idx in range(n_median)]
                label_list = list(label_set)
                h_i0_list = [
                    sum(1 for g, pi_i in zip(Gn_median, images)
                        if pi_i != node_ir
                        and g.nodes[pi_i][node_label] == label)
                    for label in label_list]
                # case when the node is to be removed.
                h_i0_list.append(sum(1 for pi_i in images if pi_i == node_ir))
                label_list.append(label_r)
                # get the best labels.
                idx_max = indices_equal(h_i0_list, np.max(h_i0_list))
                nlabel_best = [label_list[idx] for idx in idx_max]
                # generate "best" graphs with regard to "best" node labels.
                G_new_list_nd = []
                # @todo: seems it can be simplified.
                for g in G_new_list:
                    for nl in nlabel_best:
                        g_tmp = g.copy()
                        if nl == label_r:
                            g_tmp.remove_node(nd)
                        else:
                            g_tmp.nodes[nd][node_label] = nl
                        G_new_list_nd.append(g_tmp)
                G_new_list = G_new_list_nd[:]
        else:  # labels are non-symbolic
            for ndi, nd in enumerate(G.nodes()):
                Si_norm = 0
                phi_i_bar = np.array(
                    [0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if g.has_node(pi_i):  # @todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array(
                            [float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new_list[0].nodes[nd]['attributes'] = phi_i_bar

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            # The node maps are indexed by node position (as in the node
            # update above), so translate node ids to positions first.
            nd_pos = {nd: i for i, nd in enumerate(G.nodes())}
            label_list = list(get_edge_labels(Gn_median, edge_label))
            for nd1, nd2 in G.edges():
                nd1i, nd2i = nd_pos[nd1], nd_pos[nd2]
                h_ij0_list = []
                for label in label_list:
                    h_ij0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][nd1i]
                        pi_j = pi_p_forward[idx][nd2i]
                        if (g.has_node(pi_i) and g.has_node(pi_j)
                                and g.has_edge(pi_i, pi_j)
                                and g.edges[pi_i, pi_j][edge_label] == label):
                            h_ij0 += 1
                    h_ij0_list.append(h_ij0)
                # choose one of the best randomly.
                idx_max = indices_equal(h_ij0_list, np.max(h_ij0_list))
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                # check whether a_ij is 0 or 1.
                sij_norm = 0
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][nd1i]
                    pi_j = pi_p_forward[idx][nd2i]
                    if (g.has_node(pi_i) and g.has_node(pi_j)
                            and g.has_edge(pi_i, pi_j)):
                        sij_norm += 1
                keep_edge = h_ij0_max > (
                    n_median * c_er / c_es
                    + sij_norm * (1 - (c_er + c_ei) / c_es))
                # Apply the decision to every generated graph. The node
                # check prevents add_edge() from silently re-creating a node
                # that was deleted during the node update.
                for g_tmp in G_new_list:
                    if keep_edge:
                        if g_tmp.has_node(nd1) and g_tmp.has_node(nd2):
                            if not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                            g_tmp.edges[nd1, nd2][edge_label] = best_label
                    elif g_tmp.has_edge(nd1, nd2):
                        g_tmp.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            # @todo: works only for undirected graphs.
            nd_list = list(G.nodes())
            n_nodes = nx.number_of_nodes(G)
            threshold = n_median * c_er / (c_er + c_ei)
            for g_tmp in G_new_list:
                for nd1i in range(n_nodes):
                    nd1 = nd_list[nd1i]
                    for nd2i in range(nd1i + 1, n_nodes):
                        nd2 = nd_list[nd2i]
                        sij_norm = 0
                        for idx, g in enumerate(Gn_median):
                            pi_i = pi_p_forward[idx][nd1i]
                            pi_j = pi_p_forward[idx][nd2i]
                            if (g.has_node(pi_i) and g.has_node(pi_j)
                                    and g.has_edge(pi_i, pi_j)):
                                sij_norm += 1
                        if sij_norm > threshold:
                            # @todo: should we consider if nd1 and nd2 in g_tmp?
                            # or just add the edge anyway?
                            if (g_tmp.has_node(nd1) and g_tmp.has_node(nd2)
                                    and not g_tmp.has_edge(nd1, nd2)):
                                g_tmp.add_edge(nd1, nd2)
                        elif sij_norm < threshold:
                            if g_tmp.has_edge(nd1, nd2):
                                g_tmp.remove_edge(nd1, nd2)
                        # do not change anything when equal.

        # @todo: should we update all graphs generated or just the best ones?
        dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
        # @todo: should we remove the identical and connectivity check?
        # Don't know which is faster.
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_new_list, idx_list = remove_duplicates(G_new_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
            dis_list = [dis_list[idx] for idx in idx_list]

        return G_new_list, pi_forward_list, dis_list

    def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
        """Keep only the graphs (and node maps) with the smallest distance."""
        idx_min_list = indices_equal(dis_all, np.min(dis_all))
        dis_min = dis_all[idx_min_list[0]]
        pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
        G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
        return G_min_list, pi_forward_min_list, dis_min

    def iteration_proc(G, pi_p_forward, cur_sod):
        """Iterate generate_graph from one start graph until the SOD converges."""
        G_list = [G]
        pi_forward_list = [pi_p_forward]
        # Distances matching G_list; initialised so that the selection below
        # also works when the loop body is never entered (e.g. cur_sod == 0).
        dis_list = [cur_sod]
        old_sod = cur_sod * 2
        sod_list = [cur_sod]
        # iterations.
        itr = 0
        while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
            print('itr is', itr)
            G_new_list = []
            pi_forward_new_list = []
            dis_new_list = []
            for idx, G in enumerate(G_list):
                label_set = get_node_labels(Gn_median + [G], node_label)
                G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
                    G, pi_forward_list[idx], label_set)
                G_new_list += G_tmp_list
                pi_forward_new_list += pi_forward_tmp_list
                dis_new_list += dis_tmp_list
            G_list = G_new_list[:]
            pi_forward_list = pi_forward_new_list[:]
            dis_list = dis_new_list[:]

            old_sod = cur_sod
            cur_sod = np.min(dis_list)
            sod_list.append(cur_sod)

            itr += 1

        # @todo: do we return all graphs or the best ones?
        # get the best ones of the generated graphs.
        G_list, pi_forward_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)

        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_list, idx_list = remove_duplicates(G_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

        print('\nsods:', sod_list, '\n')

        return G_list, pi_forward_list, dis_min

    def remove_duplicates(Gn):
        """Remove duplicate graphs from list; also return the kept indices."""
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            if not any(graph_isIdentical(g_new, g) for g_new in Gn_new):
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    def remove_disconnected(Gn):
        """Remove disconnected graphs from list; also return the kept indices."""
        idx_list = [idx for idx, g in enumerate(Gn) if nx.is_connected(g)]
        return [Gn[idx] for idx in idx_list], idx_list

    # phase 1: initialize.
    # compute set-median.
    dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median)
    # find all smallest distances.
    idx_min_list = indices_equal(dis_list, np.min(dis_list))
    dis_min = dis_list[idx_min_list[0]]

    # phase 2: iteration.
    G_list = []
    dis_list = []
    pi_forward_list = []
    for idx_min in idx_min_list:
        G = Gn_candidate[idx_min].copy()
        # list of edit operations.
        pi_p_forward = pi_forward_all[idx_min]
        Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(
            G, pi_p_forward, dis_min)
        G_list += Gi_list
        # One distance per returned graph, so that dis_list stays aligned with
        # G_list / pi_forward_list for the index-based filtering below.
        dis_list += [dis_i_min] * len(Gi_list)
        pi_forward_list += pi_i_forward_list

    if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
        G_list, idx_list = remove_duplicates(G_list)
        dis_list = [dis_list[idx] for idx in idx_list]
        pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
    if connected:
        G_list_con, idx_list = remove_disconnected(G_list)
        # if there is no connected graphs at all, then remain the disconnected ones.
        if len(G_list_con) > 0:  # @todo: ??????????????????????????
            G_list = G_list_con
            dis_list = [dis_list[idx] for idx in idx_list]
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

    # get the best median graphs
    G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
        G_list, pi_forward_list, dis_list)

    return G_min_list, dis_min
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    n : integer
        Longest length of walks. Only meaningful for the 'brute' method, which
        is not implemented by this function at the moment, so it is unused.
    weight : integer
        Weight coefficient of different lengths of walks, which represents
        beta in 'exp' method and gamma in 'geo'.
    compute_method : string
        Method used to compute walk kernel (case-insensitive). Required.
        'exp' : exponential serial method applied on the direct product graph.
        'geo' : geometric serial method applied on the direct product graph.
    n_jobs : int
        Number of jobs for parallelization, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to print progress information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between
        2 graphs.
    run_time : float
        Time in seconds spent building the matrix.
    idx : list of int
        Positions (in the input) of the graphs that were kept; graphs with
        exactly one node are removed before the computation.

    Raises
    ------
    ValueError
        If ``compute_method`` is not 'exp' or 'geo'.

    Notes
    -----
    When the dataset is unlabeled, placeholder labels are written onto the
    given graphs themselves (the input graphs are not copied first).
    """
    # Validate up front: the default (None) has no .lower(), and an unknown
    # name would otherwise only fail later with an unbound local variable.
    wrappers = {'exp': wrapper_cw_exp, 'geo': wrapper_cw_geo} \
        if isinstance(compute_method, str) \
        and compute_method.lower() in ('exp', 'geo') else None
    if wrappers is None:
        raise ValueError(
            'compute method name incorrect. Available methods: "exp" and '
            '"geo"; got %r.' % (compute_method,))
    compute_method = compute_method.lower()

    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they have only 1 node.\n' %
              (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        # The placeholder is stored under 'atom', so the kernel must read
        # that attribute (same convention as marginalizedkernel).
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    # direct product graph method - exponential ('exp') or geometric ('geo')
    do_partial = partial(wrappers[compute_method], node_label, edge_label,
                         weight)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
        connected=True, ite_max=10):
    """Estimate a median graph of Gn by iteratively updating the set-median.

    The graph of ``Gn`` with the smallest sum of distances to all graphs of
    ``Gn`` is taken as the start. Then, for a fixed number of iterations, its
    node labels (or node attributes) and its edges are re-estimated from the
    node maps returned by ``GED`` between the current graph and every graph
    of ``Gn``.

    Parameters
    ----------
    Gn : list of NetworkX graph
        Graphs whose median is estimated. Their nodes are relabeled to
        consecutive integers on a converted copy.
    c_ei, c_er, c_es : number
        Constants used in the edge update thresholds (presumably the edit
        costs of edge insertion, removal and substitution -- TODO confirm).
    node_label : string
        Node attribute used as (symbolic) label.
    edge_label : string
        Edge attribute used as label.
    connected : boolean
        Currently unused.
    ite_max : int
        Number of update iterations. @todo: the convergence condition?

    Return
    ------
    G : NetworkX graph
        The graph obtained after the last iteration.

    Raises
    ------
    ValueError
        If ``Gn`` is empty.
    """
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    if not Gn:
        # Without any graph there is no set-median to start from.
        raise ValueError('Gn must contain at least one graph.')

    # phase 1: initialize.
    # compute set-median.
    dis_min = np.inf
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for G_p_prime in Gn:
            dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.
    pi_p = pi_all[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(
        Gn, attr_names=['edge_labeled', 'node_attr_dim'],
        edge_label=edge_label)
    n_graphs = len(Gn)
    symbolic = not ds_attrs['node_attr_dim']
    # The label sets only depend on Gn, so look them up once.
    node_labels = list(get_node_labels(Gn, node_label)) if symbolic else None
    edge_labels = (list(get_edge_labels(Gn, edge_label))
                   if ds_attrs['edge_labeled'] else None)

    def mapped_edge_count(nd1, nd2):
        """Count the graphs of Gn in which the edge (nd1, nd2) has an image."""
        count = 0
        for idx, g in enumerate(Gn):
            pi_i = pi_p[idx][nd1]
            pi_j = pi_p[idx][nd2]
            if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                count += 1
        return count

    for itr in range(ite_max):
        G_new = G.copy()
        # update vertex labels.
        if symbolic:  # labels are symbolic
            for nd in G.nodes():
                h_i0_list = []
                for label in node_labels:
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                # choose one of the best randomly.
                idx_max = np.argwhere(
                    np.asarray(h_i0_list) == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = node_labels[idx_max[idx_rdm]]
        else:  # labels are non-symbolic
            for nd in G.nodes():
                Si_norm = 0
                phi_i_bar = np.array(
                    [0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i):
                        Si_norm += 1
                        phi_i_bar += np.array(
                            [float(itm) for itm in g.nodes[pi_i]['attributes']])
                # If no graph maps this node there is nothing to average;
                # keep the current attributes instead of storing 0/0 = NaN.
                if Si_norm > 0:
                    G_new.nodes[nd]['attributes'] = phi_i_bar / Si_norm

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2 in G.edges():
                h_ij0_list = []
                for label in edge_labels:
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        if (g.has_node(pi_i) and g.has_node(pi_j)
                                and g.has_edge(pi_i, pi_j)
                                and g.edges[pi_i, pi_j][edge_label] == label):
                            h_ij0 += 1
                    h_ij0_list.append(h_ij0)
                # choose one of the best randomly.
                idx_max = np.argwhere(
                    np.asarray(h_ij0_list) == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = edge_labels[idx_max[idx_rdm]]

                # check whether a_ij is 0 or 1.
                sij_norm = mapped_edge_count(nd1, nd2)
                if h_ij0_max > (n_graphs * c_er / c_es
                                + sij_norm * (1 - (c_er + c_ei) / c_es)):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                elif G_new.has_edge(nd1, nd2):
                    G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            threshold = n_graphs * c_er / (c_er + c_ei)
            for nd1, nd2 in G.edges():
                if mapped_edge_count(nd1, nd2) > threshold:
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                elif G_new.has_edge(nd1, nd2):
                    G_new.remove_edge(nd1, nd2)

        G = G_new.copy()

        # update pi_p
        pi_p = []
        for G_p in Gn:
            dist_tmp, pi_tmp, _ = GED(G, G_p)
            pi_p.append(pi_tmp)

    return G
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of WL kernel (case-insensitive).
        'subtree', 'sp' and 'edge' select the corresponding helper; any other
        value is forwarded to the user-defined-kernel helper. Only the
        default 'subtree' kernel is stated to be supported for now.
    parallel : None
        Which parallelization method is applied to compute the kernel. It is
        forwarded to the subtree helper only.
    n_jobs : int
        Number of jobs for parallelization. It is forwarded to the subtree
        helper only.
    verbose : boolean
        Whether to print the summary line (also forwarded to the subtree
        helper).

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between 2 graphs.
    run_time : float
        Time in seconds spent building the matrix.

    Notes
    -----
    This function now supports WL subtree kernel only. The input graphs are
    copied, so placeholder labels are never written onto the caller's graphs.
    """
    # pre-process
    base_kernel = base_kernel.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        # The placeholder is stored under 'atom', so the kernel must read
        # that attribute (same convention as marginalizedkernel).
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')

    start_time = time.time()

    if base_kernel == 'subtree':
        # for WL subtree kernel
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)
    elif base_kernel == 'sp':
        # for WL shortest path kernel
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    elif base_kernel == 'edge':
        # for WL edge kernel
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    else:
        # for user defined base kernel
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        # len(Gn) is the matrix size for both call forms; len(args[0]) would
        # be the node count of G1 when two graphs are passed.
        print(
            "\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
            % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Time of iterations to calculate R_inf.
    remove_totters : boolean
        Whether to remove totters. The default value is False.
    n_jobs : int
        Number of jobs for parallelization. None uses all available cores.
    verbose : boolean
        Whether to print progress information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    run_time : float
        Time in seconds spent building the matrix.

    Notes
    -----
    The input graphs are copied, so placeholder labels are never written onto
    the caller's graphs.
    """
    import multiprocessing

    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    # Copy the graphs themselves (not only the list): labels may be added
    # below and must not leak into the caller's graphs.
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # n_jobs=None means "all cores" for Pool, but the chunk size needs an
        # actual number.
        n_workers = multiprocessing.cpu_count() if n_jobs is None else n_jobs
        if len(Gn) < 100 * n_workers:
            chunksize = int(len(Gn) / n_workers) + 1
        else:
            chunksize = 100
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        # The context manager releases the workers even if a task fails.
        with Pool(n_jobs) as pool:
            results = pool.imap_unordered(untotter_partial,
                                          range(0, len(Gn)), chunksize)
            if verbose:
                results = tqdm(results, desc='removing tottering',
                               file=sys.stdout)
            for i, g in results:
                Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label, p_quit,
                         n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
def randomwalkkernel(
        *args,
        # params for all method.
        compute_method=None,
        weight=1,
        p=None,
        q=None,
        edge_weight=None,
        # params for conjugate and fp method.
        node_kernels=None,
        edge_kernels=None,
        node_label='atom',
        edge_label='bond_type',
        # params for spectral method.
        sub_kernel=None,
        n_jobs=None,
        verbose=True):
    """Calculate random walk graph kernels.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    compute_method : string
        Method used to compute the random walk kernel (case-insensitive).
        Required. Available methods are 'sylvester', 'conjugate', 'fp',
        'spectral' and 'kron'.
    weight : number
        Weight forwarded to the selected method.
    p, q : None
        Forwarded to the selected method.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. It is only
        used if the first graph carries it with float or integer values;
        otherwise all weights are set to 1.
    node_kernels, edge_kernels : dict
        Kernel functions for nodes / edges; used by 'conjugate' and 'fp'.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    sub_kernel : string
        Forwarded to the 'spectral' method.
    n_jobs : int
        Number of jobs for parallelization.
    verbose : boolean
        Whether to print progress information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the random walk kernel
        between 2 graphs.
    run_time : float
        Time in seconds spent building the matrix.
    idx : list of int
        Positions (in the input) of the graphs that were kept; graphs
        without edges are removed before the computation.

    Raises
    ------
    ValueError
        If ``compute_method`` is not one of the available methods.
    """
    # Validate first: the default (None) has no .lower(), and there is no
    # point in pre-processing the graphs for an unknown method.
    methods = ('sylvester', 'conjugate', 'fp', 'spectral', 'kron')
    if not isinstance(compute_method, str) \
            or compute_method.lower() not in methods:
        raise ValueError(
            'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
        )
    compute_method = compute_method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No graph at all, or no edge of the first graph has the
            # attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                eweight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=[
            'node_labeled', 'node_attr_dim', 'edge_labeled', 'edge_attr_dim',
            'is_directed'
        ],
        node_label=node_label,
        edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their structures,
    # so the weight matrix between such a graph and itself might be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs,
                                      verbose=verbose)

    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
                                      node_kernels, edge_kernels, node_label,
                                      edge_label, eweight, n_jobs,
                                      verbose=verbose)

    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
                               edge_kernels, node_label, edge_label, eweight,
                               n_jobs, verbose=verbose)

    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn(
                'All labels are ignored. Only works for undirected graphs.')
        Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
                                          eweight, n_jobs, verbose=verbose)

    else:  # compute_method == 'kron'
        # The matrix has to exist before it can be filled pair by pair.
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. It is only
        used if the first graph carries it with float or integer values;
        otherwise all weights are set to 1.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns an
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization. None uses all available cores.
    verbose : boolean
        Whether to print progress information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2
        graphs.
    run_time : float
        Time in seconds spent building the matrix.
    idx : list of int
        Positions (in the input) of the graphs that were kept; graphs
        without edges are removed before the computation.
    """
    import multiprocessing

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No graph at all, or no edge of the first graph has the
            # attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    # n_jobs=None means "all cores" for Pool, but the chunk size needs an
    # actual number.
    n_workers = multiprocessing.cpu_count() if n_jobs is None else n_jobs
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100
    getsp_partial = partial(wrapper_getSPGraph, weight)
    # Materialise the tasks first: Gn is overwritten while results arrive.
    itr = list(zip(Gn, range(0, len(Gn))))
    # The context manager releases the workers even if a task fails.
    with Pool(n_jobs) as pool:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        if verbose:
            iterator = tqdm(iterator, desc='getting sp graphs',
                            file=sys.stdout)
        for i, g in iterator:
            Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def randomwalkkernel(*args,
                     # params for all method.
                     compute_method=None,
                     weight=1,
                     p=None,
                     q=None,
                     edge_weight=None,
                     # params for conjugate and fp method.
                     node_kernels=None,
                     edge_kernels=None,
                     node_label='atom',
                     edge_label='bond_type',
                     # params for spectral method.
                     sub_kernel=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate random walk graph kernels.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    compute_method : string
        Method used to compute kernel (case-insensitive). The following
        choices are available:
        'sylvester' - Sylvester equation method.
        'conjugate' - conjugate gradient method.
        'fp' - fixed-point iterations.
        'spectral' - spectral decomposition.
        'kron' - pairwise computation with _randomwalkkernel_kron.
    weight : float
        A constant weight set for random walks of length h.
    p : None
        Initial probability distribution on the unlabeled direct product graph
        of two graphs. It is set to be uniform over all vertices in the direct
        product graph.
    q : None
        Stopping probability distribution on the unlabeled direct product graph
        of two graphs. It is set to be uniform over all vertices in the direct
        product graph.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. It is only used
        when the value found on the first graph is a float or an integer;
        otherwise all weights are set to 1.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled. This argument
        is designated to conjugate gradient method and fixed-point iterations.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two edges. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled. This argument
        is designated to conjugate gradient method and fixed-point iterations.
    node_label : string
        Node attribute used as label. The default node label is atom. This
        argument is designated to conjugate gradient method and fixed-point
        iterations.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type. This
        argument is designated to conjugate gradient method and fixed-point
        iterations.
    sub_kernel : string
        Method used to compute walk kernel. The following choices are
        available:
        'exp' : method based on exponential series.
        'geo' : method based on geometric series.
    n_jobs : int
        Number of jobs for parallelization.
    verbose : bool
        Whether to print progress information and warnings.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the random walk kernel between
        2 graphs. Graphs without edges are removed beforehand, so the matrix
        only covers the remaining graphs.
    run_time : float
        Time in seconds spent building the kernel matrix.
    idx : list of int
        Positions, in the input list, of the graphs that were kept.

    Raises
    ------
    Exception
        If compute_method is missing or is not one of the available methods.
    """
    available_methods = ('sylvester', 'conjugate', 'fp', 'spectral', 'kron')
    # Validate up front so that a missing method name (the default is None)
    # fails with the same explicit error as a misspelled one, instead of an
    # AttributeError raised by ``None.lower()``.
    if (not isinstance(compute_method, str)
            or compute_method.lower() not in available_methods):
        raise Exception(
            'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
        )
    compute_method = compute_method.lower()

    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    # work on copies so the caller's graphs are never touched.
    Gn = [g.copy() for g in Gn]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # only the first edge of the first graph is inspected.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # no edge of the first graph carries this attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                eweight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their structures,
    # so the weight matrix between such a graph and itself might be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [item[0] for item in kept]
    Gn = [item[1] for item in kept]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs,
                                      verbose=verbose)

    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels,
                                      edge_kernels, node_label, edge_label,
                                      eweight, n_jobs, verbose=verbose)

    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
                               edge_kernels, node_label, edge_label, eweight,
                               n_jobs, verbose=verbose)

    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn(
                'All labels are ignored. Only works for undirected graphs.')
        Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
                                          eweight, n_jobs, verbose=verbose)

    else:  # compute_method == 'kron' (validated above).
        # The matrix has to exist before it is filled pair by pair; it was
        # previously used here without ever being created.
        Kmatrix = np.zeros((len(Gn), len(Gn)))
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
                                                       node_label, edge_label)
                Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  verbose=True):
    """Calculate treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    sub_kernel : function
        The sub-kernel between 2 real number vectors. Each vector counts the
        numbers of isomorphic treelets in a graph.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    parallel : string/None
        Which parallelization method is applied to compute the kernel. The
        following choices are available:
        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.
        None: no parallelization is applied.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when one of the
        parallelization method is applied.
    verbose : bool
        Whether to show progress bars and print the run time.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between 2
        graphs.
    run_time : float
        Time in seconds spent building the kernel matrix.

    Raises
    ------
    Exception
        If parallel is neither 'imap_unordered' nor None.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    # work on copies, since placeholder labels may be written below.
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        # Give the unlabeled side a constant placeholder label. It is stored
        # under the attribute names the caller asked for, because those are
        # the names handed to the canonical-key computation below.
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', node_label)
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', edge_label)

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before calculating kernels to
        # save time, but this may cost a lot of memory for large dataset.
        from multiprocessing import cpu_count
        # n_jobs=None means "all cores" for Pool; the chunk size arithmetic
        # needs an actual number, so resolve it here.
        n_workers = cpu_count() if n_jobs is None else n_jobs
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_workers:
            chunksize = int(len(Gn) / n_workers) + 1
        else:
            chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys',
                            file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        # results arrive in arbitrary order; each carries its graph index.
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            # publish the canonical keys as a global in each worker process.
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=n_jobs,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before calculating kernels to
        # save time, but this may cost a lot of memory for large dataset.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(
                get_canonkeys(g, node_label, edge_label, labeled,
                              ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='calculating kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][j]  # @todo: no directed graph considered?

    else:
        raise Exception('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. It is only used
        when the value found on the first graph is a float or an integer;
        otherwise all weights are set to 1.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two edges. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled.
    compute_method : string
        'trie' selects the trie-based shortest path and kernel wrappers; any
        other value selects the naive ones.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool use all cores.
    verbose : bool
        Whether to show progress bars and print messages.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    run_time : float
        Time in seconds spent building the kernel matrix.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # only the first edge of the first graph is inspected.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # no edge of the first graph carries this attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    start_time = time.time()

    use_trie = compute_method == 'trie'

    # get shortest paths of each graph in Gn
    from multiprocessing import cpu_count
    # n_jobs=None means "all cores" for Pool; the chunk size arithmetic needs
    # an actual number, so resolve it here.
    n_workers = cpu_count() if n_jobs is None else n_jobs
    splist = [None] * len(Gn)
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100
    # get shortest path graphs of Gn
    getsp_partial = partial(
        wrapper_getSP_trie if use_trie else wrapper_getSP_naive, weight,
        ds_attrs['is_directed'])
    if verbose:
        iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                        desc='getting shortest paths',
                        file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
    # results arrive in arbitrary order; each carries its graph index.
    for i, sp in iterator:
        splist[i] = sp
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(spl_toshare, gs_toshare):
        # publish the shortest paths and graphs as globals in each worker.
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    do_partial = partial(wrapper_ssp_do_trie if use_trie else wrapper_ssp_do,
                         ds_attrs, node_label, edge_label, node_kernels,
                         edge_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(splist, Gn),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     n_jobs=None,
                     verbose=True):
    """Calculate path graph kernels up to depth/height h between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    depth : integer
        Depth of search. Longest length of paths.
    k_func : string/None
        A kernel function applied using different notions of fingerprint
        similarity, defining the type of feature map and normalization method
        applied for the graph kernel. The following choices are available:
        'MinMax': use the MiniMax kernel and counting feature map.
        'tanimoto': use the Tanimoto kernel and binary feature map.
        None: no sub-kernel is used, the kernel is computed directly.
    compute_method : string
        Computation method to store paths and compute the graph kernel. The
        following choices are available:
        'trie': store paths as tries.
        'naive': store paths to lists.
        Only honoured when k_func is not None.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool use all cores.
    verbose : bool
        Whether to show progress bars and print the run time.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h between
        2 graphs.
    run_time : float
        Time in seconds spent building the kernel matrix.
    """
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    # work on copies, since placeholder labels may be written below.
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)
    has_sub_kernel = k_func is not None
    use_trie = has_sub_kernel and compute_method == 'trie'

    if has_sub_kernel:
        # Give unlabeled graphs a constant placeholder label. It is stored
        # under the attribute names the caller asked for, because those are
        # the names handed to the path search below.
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', node_label)
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', edge_label)

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # get all paths of all graphs before calculating kernels to save time,
    # but this may cost a lot of memory for large datasets.
    from multiprocessing import cpu_count
    # n_jobs=None means "all cores" for Pool; the chunk size arithmetic needs
    # an actual number, so resolve it here.
    n_workers = cpu_count() if n_jobs is None else n_jobs
    pool = Pool(n_jobs)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100
    all_paths = [[] for _ in range(len(Gn))]
    if use_trie:
        getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                ds_attrs, node_label, edge_label)
    elif has_sub_kernel:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, True)
    else:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, False)
    if verbose:
        iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                        desc='getting paths',
                        file=sys.stdout)
    else:
        iterator = pool.imap_unordered(getps_partial, itr, chunksize)
    # results arrive in arbitrary order; each carries its graph index.
    for i, ps in iterator:
        all_paths[i] = ps
    pool.close()
    pool.join()

    if use_trie:

        def init_worker(trie_toshare):
            # publish the tries as a global in each worker process.
            global G_trie
            G_trie = trie_toshare

        do_partial = partial(wrapper_uhpath_do_trie, k_func)
    elif has_sub_kernel:

        def init_worker(plist_toshare):
            # publish the path lists as a global in each worker process.
            global G_plist
            G_plist = plist_toshare

        do_partial = partial(wrapper_uhpath_do_naive, k_func)
    else:

        def init_worker(plist_toshare):
            # publish the path lists as a global in each worker process.
            global G_plist
            G_plist = plist_toshare

        # NOTE(review): `edge_kernels` is neither a parameter nor a local of
        # this function; unless a module-level name exists this line raises
        # NameError whenever k_func is None. Confirm the arguments expected by
        # wrapper_uhpath_do_kernelless and fix the call accordingly.
        do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                             edge_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(all_paths, ),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))

    return Kmatrix, run_time
def iam_upgraded( Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50, epsilon=0.001, node_label='atom', edge_label='bond_type', connected=False, removeNodes=True, allBestInit=False, allBestNodes=False, allBestEdges=False, allBestOutput=False, params_ged={ 'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP', 'edit_cost_constant': [], 'stabilizer': None, 'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1' }): """See my name, then you know what I do. """ # Gn_median = Gn_median[0:10] # Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] node_ir = np.inf # corresponding to the node remove and insertion. label_r = 'thanksdanny' # the label for node remove. # @todo: make this label unrepeatable. ds_attrs = get_dataset_attributes( Gn_median + Gn_candidate, attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'], edge_label=edge_label) node_label_set = get_node_labels(Gn_median, node_label) edge_label_set = get_edge_labels(Gn_median, edge_label) def generate_graph(G, pi_p_forward): G_new_list = [G.copy() ] # all "best" graphs generated in this iteration. # nx.draw_networkx(G) # import matplotlib.pyplot as plt # plt.show() # print(pi_p_forward) # update vertex labels. # pre-compute h_i0 for each label. # for label in get_node_labels(Gn, node_label): # print(label) # for nd in G.nodes(data=True): # pass if not ds_attrs['node_attr_dim']: # labels are symbolic for ndi, (nd, _) in enumerate(G.nodes(data=True)): h_i0_list = [] label_list = [] for label in node_label_set: h_i0 = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][ndi] if pi_i != node_ir and g.nodes[pi_i][ node_label] == label: h_i0 += 1 h_i0_list.append(h_i0) label_list.append(label) # case when the node is to be removed. if removeNodes: h_i0_remove = 0 # @todo: maybe this can be added to the node_label_set above. 
for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][ndi] if pi_i == node_ir: h_i0_remove += 1 h_i0_list.append(h_i0_remove) label_list.append(label_r) # get the best labels. idx_max = np.argwhere( h_i0_list == np.max(h_i0_list)).flatten().tolist() if allBestNodes: # choose all best graphs. nlabel_best = [label_list[idx] for idx in idx_max] # generate "best" graphs with regard to "best" node labels. G_new_list_nd = [] for g in G_new_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. for nl in nlabel_best: g_tmp = g.copy() if nl == label_r: g_tmp.remove_node(nd) else: g_tmp.nodes[nd][node_label] = nl G_new_list_nd.append(g_tmp) # nx.draw_networkx(g_tmp) # import matplotlib.pyplot as plt # plt.show() # print(g_tmp.nodes(data=True)) # print(g_tmp.edges(data=True)) G_new_list = [ggg.copy() for ggg in G_new_list_nd] else: # choose one of the best randomly. idx_rdm = random.randint(0, len(idx_max) - 1) best_label = label_list[idx_max[idx_rdm]] h_i0_max = h_i0_list[idx_max[idx_rdm]] g_new = G_new_list[0] if best_label == label_r: g_new.remove_node(nd) else: g_new.nodes[nd][node_label] = best_label G_new_list = [g_new] else: # labels are non-symbolic for ndi, (nd, _) in enumerate(G.nodes(data=True)): Si_norm = 0 phi_i_bar = np.array( [0.0 for _ in range(ds_attrs['node_attr_dim'])]) for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][ndi] if g.has_node( pi_i ): #@todo: what if no g has node? phi_i_bar = 0? Si_norm += 1 phi_i_bar += np.array([ float(itm) for itm in g.nodes[pi_i]['attributes'] ]) phi_i_bar /= Si_norm G_new_list[0].nodes[nd]['attributes'] = phi_i_bar # for g in G_new_list: # import matplotlib.pyplot as plt # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) # update edge labels and adjacency matrix. 
if ds_attrs['edge_labeled']: G_new_list_edge = [] for g_new in G_new_list: nd_list = [n for n in g_new.nodes()] g_tmp_list = [g_new.copy()] for nd1i in range(nx.number_of_nodes(g_new)): nd1 = nd_list[ nd1i] # @todo: not just edges, but all pairs of nodes for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)): nd2 = nd_list[nd2i] # for nd1, nd2, _ in g_new.edges(data=True): h_ij0_list = [] label_list = [] for label in edge_label_set: h_ij0 = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][nd1i] pi_j = pi_p_forward[idx][nd2i] h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j) and g.edges[pi_i, pi_j][edge_label] == label) h_ij0 += h_ij0_p h_ij0_list.append(h_ij0) label_list.append(label) # get the best labels. idx_max = np.argwhere(h_ij0_list == np.max( h_ij0_list)).flatten().tolist() if allBestEdges: # choose all best graphs. elabel_best = [label_list[idx] for idx in idx_max] h_ij0_max = [h_ij0_list[idx] for idx in idx_max] # generate "best" graphs with regard to "best" node labels. G_new_list_ed = [] for g_tmp in g_tmp_list: # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now. for idxl, el in enumerate(elabel_best): g_tmp_copy = g_tmp.copy() # check whether a_ij is 0 or 1. sij_norm = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][nd1i] pi_j = pi_p_forward[idx][nd2i] if g.has_node(pi_i) and g.has_node(pi_j) and \ g.has_edge(pi_i, pi_j): sij_norm += 1 if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \ sij_norm * (1 - (c_er + c_ei) / c_es): if not g_tmp_copy.has_edge(nd1, nd2): g_tmp_copy.add_edge(nd1, nd2) g_tmp_copy.edges[nd1, nd2][ edge_label] = elabel_best[idxl] else: if g_tmp_copy.has_edge(nd1, nd2): g_tmp_copy.remove_edge(nd1, nd2) G_new_list_ed.append(g_tmp_copy) g_tmp_list = [ggg.copy() for ggg in G_new_list_ed] else: # choose one of the best randomly. 
idx_rdm = random.randint(0, len(idx_max) - 1) best_label = label_list[idx_max[idx_rdm]] h_ij0_max = h_ij0_list[idx_max[idx_rdm]] # check whether a_ij is 0 or 1. sij_norm = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][nd1i] pi_j = pi_p_forward[idx][nd2i] if g.has_node(pi_i) and g.has_node( pi_j) and g.has_edge(pi_i, pi_j): sij_norm += 1 if h_ij0_max > len( Gn_median) * c_er / c_es + sij_norm * ( 1 - (c_er + c_ei) / c_es): if not g_new.has_edge(nd1, nd2): g_new.add_edge(nd1, nd2) g_new.edges[nd1, nd2][edge_label] = best_label else: # elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): if g_new.has_edge(nd1, nd2): g_new.remove_edge(nd1, nd2) g_tmp_list = [g_new] G_new_list_edge += g_tmp_list G_new_list = [ggg.copy() for ggg in G_new_list_edge] else: # if edges are unlabeled # @todo: is this even right? G or g_tmp? check if the new one is right # @todo: works only for undirected graphs. for g_tmp in G_new_list: nd_list = [n for n in g_tmp.nodes()] for nd1i in range(nx.number_of_nodes(g_tmp)): nd1 = nd_list[nd1i] for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)): nd2 = nd_list[nd2i] sij_norm = 0 for idx, g in enumerate(Gn_median): pi_i = pi_p_forward[idx][nd1i] pi_j = pi_p_forward[idx][nd2i] if g.has_node(pi_i) and g.has_node( pi_j) and g.has_edge(pi_i, pi_j): sij_norm += 1 if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): # @todo: should we consider if nd1 and nd2 in g_tmp? # or just add the edge anyway? if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ and not g_tmp.has_edge(nd1, nd2): g_tmp.add_edge(nd1, nd2) else: # @todo: which to use? # elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): if g_tmp.has_edge(nd1, nd2): g_tmp.remove_edge(nd1, nd2) # do not change anything when equal. 
# for i, g in enumerate(G_new_list): # import matplotlib.pyplot as plt # nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True) ## plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG") # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) # # find the best graph generated in this iteration and update pi_p. # @todo: should we update all graphs generated or just the best ones? dis_list, pi_forward_list = ged_median(G_new_list, Gn_median, params_ged=params_ged) # @todo: should we remove the identical and connectivity check? # Don't know which is faster. if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: G_new_list, idx_list = remove_duplicates(G_new_list) pi_forward_list = [pi_forward_list[idx] for idx in idx_list] dis_list = [dis_list[idx] for idx in idx_list] # if connected == True: # G_new_list, idx_list = remove_disconnected(G_new_list) # pi_forward_list = [pi_forward_list[idx] for idx in idx_list] # idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() # dis_min = dis_list[idx_min_tmp_list[0]] # pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] # G_new_list = [G_new_list[idx] for idx in idx_min_list] # for g in G_new_list: # import matplotlib.pyplot as plt # nx.draw_networkx(g) # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) return G_new_list, pi_forward_list, dis_list def best_median_graphs(Gn_candidate, pi_all_forward, dis_all): idx_min_list = np.argwhere( dis_all == np.min(dis_all)).flatten().tolist() dis_min = dis_all[idx_min_list[0]] pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] G_min_list = [Gn_candidate[idx] for idx in idx_min_list] return G_min_list, pi_forward_min_list, dis_min def iteration_proc(G, pi_p_forward, cur_sod): G_list = [G] pi_forward_list = [pi_p_forward] old_sod = cur_sod * 2 sod_list = [cur_sod] dis_list = [cur_sod] # iterations. itr = 0 # @todo: what if difference == 0? 
# while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or # np.abs(old_sod - cur_sod) == 0): while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon: # while itr < ite_max: # for itr in range(0, 5): # the convergence condition? print('itr_iam is', itr) G_new_list = [] pi_forward_new_list = [] dis_new_list = [] for idx, g in enumerate(G_list): # label_set = get_node_labels(Gn_median + [g], node_label) G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph( g, pi_forward_list[idx]) G_new_list += G_tmp_list pi_forward_new_list += pi_forward_tmp_list dis_new_list += dis_tmp_list # @todo: need to remove duplicates here? G_list = [ggg.copy() for ggg in G_new_list] pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list] dis_list = dis_new_list[:] old_sod = cur_sod cur_sod = np.min(dis_list) sod_list.append(cur_sod) itr += 1 # @todo: do we return all graphs or the best ones? # get the best ones of the generated graphs. G_list, pi_forward_list, dis_min = best_median_graphs( G_list, pi_forward_list, dis_list) if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: G_list, idx_list = remove_duplicates(G_list) pi_forward_list = [pi_forward_list[idx] for idx in idx_list] # dis_list = [dis_list[idx] for idx in idx_list] # import matplotlib.pyplot as plt # for g in G_list: # nx.draw_networkx(g) # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) print('\nsods:', sod_list, '\n') return G_list, pi_forward_list, dis_min, sod_list def remove_duplicates(Gn): """Remove duplicate graphs from list. """ Gn_new = [] idx_list = [] for idx, g in enumerate(Gn): dupl = False for g_new in Gn_new: if graph_isIdentical(g_new, g): dupl = True break if not dupl: Gn_new.append(g) idx_list.append(idx) return Gn_new, idx_list def remove_disconnected(Gn): """Remove disconnected graphs from list. 
""" Gn_new = [] idx_list = [] for idx, g in enumerate(Gn): if nx.is_connected(g): Gn_new.append(g) idx_list.append(idx) return Gn_new, idx_list ########################################################################### # phase 1: initilize. # compute set-median. dis_min = np.inf dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median, params_ged=params_ged, parallel=True) print('finish computing GEDs.') # find all smallest distances. if allBestInit: # try all best init graphs. idx_min_list = range(len(dis_list)) dis_min = dis_list else: idx_min_list = np.argwhere( dis_list == np.min(dis_list)).flatten().tolist() dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list) idx_min_rdm = random.randint(0, len(idx_min_list) - 1) idx_min_list = [idx_min_list[idx_min_rdm]] sod_set_median = np.min(dis_min) # phase 2: iteration. G_list = [] dis_list = [] pi_forward_list = [] G_set_median_list = [] # sod_list = [] for idx_tmp, idx_min in enumerate(idx_min_list): # print('idx_min is', idx_min) G = Gn_candidate[idx_min].copy() G_set_median_list.append(G.copy()) # list of edit operations. pi_p_forward = pi_forward_all[idx_min] # pi_p_backward = pi_all_backward[idx_min] Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc( G, pi_p_forward, dis_min[idx_tmp]) G_list += Gi_list dis_list += [dis_i_min] * len(Gi_list) pi_forward_list += pi_i_forward_list if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0: G_list, idx_list = remove_duplicates(G_list) dis_list = [dis_list[idx] for idx in idx_list] pi_forward_list = [pi_forward_list[idx] for idx in idx_list] if connected == True: G_list_con, idx_list = remove_disconnected(G_list) # if there is no connected graphs at all, then remain the disconnected ones. if len(G_list_con) > 0: # @todo: ?????????????????????????? 
G_list = G_list_con dis_list = [dis_list[idx] for idx in idx_list] pi_forward_list = [pi_forward_list[idx] for idx in idx_list] # import matplotlib.pyplot as plt # for g in G_list: # nx.draw_networkx(g) # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) # get the best median graphs G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs( G_list, pi_forward_list, dis_list) # for g in G_gen_median_list: # nx.draw_networkx(g) # plt.show() # print(g.nodes(data=True)) # print(g.edges(data=True)) if not allBestOutput: # randomly choose one graph. idx_rdm = random.randint(0, len(G_gen_median_list) - 1) G_gen_median_list = [G_gen_median_list[idx_rdm]] return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. When None,
        missing, or not numeric, all edges are weighted 1.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each the two nodes. Each label is in form of
        2-D dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of worker processes. When None, the number of CPUs reported by
        ``os.cpu_count()`` is used (1 if it cannot be determined).

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2
        graphs. Graphs without edges are excluded from it.
    run_time : float
        Wall-clock seconds spent building the shortest-path graphs and the
        kernel matrix.
    idx : list of int
        Positions, in the input list, of the graphs kept in Kmatrix.
    """
    import os

    # The chunk size computed below divides by n_jobs, so the "use all cores"
    # default has to be resolved to a concrete number first.
    if n_jobs is None:
        n_jobs = os.cpu_count() or 1

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # Probe a single edge of the first graph to learn the weight type.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No edge of the first graph carries this attribute.
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)
    # NOTE(review): the detected attribute dimension is overwritten with 0
    # here; presumably this disables non-symbolic node attributes on purpose
    # -- confirm against wrapper_sp_do.
    ds_attrs['node_attr_dim'] = 0

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    if len(Gn) < 100 * n_jobs:
        chunksize = int(len(Gn) / n_jobs) + 1
    else:
        chunksize = 100
    # The context manager releases the workers even if one of them raises.
    with Pool(n_jobs) as pool:
        # Results arrive in arbitrary order; each carries its own index.
        for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                         desc='getting sp graphs',
                         total=len(Gn),
                         file=sys.stdout):
            Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Pool initializer: publish the graph list as a module-level global
        # in each worker (presumably read by wrapper_sp_do).
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs)

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. When None,
        missing, or not numeric, all edges are weighted 1.
    node_kernels : dict
        Kernel functions for nodes; passed through to wrapper_sp_do.
    n_jobs : int
        Number of worker processes handed to multiprocessing.Pool
        (None lets Pool pick its default).
    chunksize : int
        Chunk size handed to Pool.imap_unordered in both parallel stages.

    Return
    ------
    Kmatrix : Numpy matrix
        Symmetric kernel matrix over the graphs that contain edges.
    run_time : float
        Wall-clock seconds spent building the shortest-path graphs and the
        kernel matrix.
    idx : list of int
        Positions, in the input list, of the graphs kept in Kmatrix.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # Probe a single edge of the first graph to learn the weight type.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No edge of the first graph carries this attribute.
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    # The context manager releases the workers even if one of them raises.
    with Pool(n_jobs) as pool:
        # Results arrive in arbitrary order; each carries its own index.
        for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                         desc='getting sp graphs',
                         total=len(Gn),
                         file=sys.stdout):
            Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Pool initializer: publish the graph list as a module-level global
        # in each worker (presumably read by wrapper_sp_do).
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    # Every unordered pair (i, j) with i <= j; the matrix is filled
    # symmetrically from the results.
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    n_pairs = len(Gn) * (len(Gn) + 1) // 2
    with Pool(processes=n_jobs, initializer=init_worker,
              initargs=(Gn, )) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr,
                                                     chunksize),
                                 desc='calculating kernels',
                                 total=n_pairs,
                                 file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
'extra_params': { 'am_sp_al_nl_el': [1, 1, 2, 0, -1] } }, { 'name': 'NCI-HIV', 'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf', 'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt', }, # # not working below # {'name': 'PTC_FM', 'dataset': '../../datasets/PTC/Train/FM.ds',}, # {'name': 'PTC_FR', 'dataset': '../../datasets/PTC/Train/FR.ds',}, # {'name': 'PTC_MM', 'dataset': '../../datasets/PTC/Train/MM.ds',}, # {'name': 'PTC_MR', 'dataset': '../../datasets/PTC/Train/MR.ds',}, ] for ds in dslist: dataset, y = loadDataset( ds['dataset'], filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None), extra_params=(ds['extra_params'] if 'extra_params' in ds else None)) attrs = get_dataset_attributes(dataset, target=y, node_label='atom', edge_label='bond_type') print() print(ds['name'] + ':') for atr in attrs: print(atr, ':', attrs[atr]) print()
def pathkernel(*args, node_label='atom', edge_label='bond_type'):
    """Calculate mean average path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
        When its values are numeric it is also used as the edge weight for
        the shortest-path computation.

    Return
    ------
    Kmatrix : Numpy matrix
        Symmetric kernel matrix, each element of which is the path kernel
        between 2 graphs.
    run_time : float
        Wall-clock seconds spent computing shortest paths and kernels.
    """
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    n_graphs = len(Gn)
    Kmatrix = np.zeros((n_graphs, n_graphs))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)

    # Use the edge label as a weight only when it is numeric; probe one edge
    # of the first graph to find out.
    try:
        some_weight = list(
            nx.get_edge_attributes(Gn[0], edge_label).values())[0]
    except IndexError:
        # Empty graph list, or no edge of the first graph has this attribute.
        weight = None
    else:
        weight = edge_label if isinstance(some_weight, (float, int)) else None

    start_time = time.time()

    splist = [
        get_shortest_paths(g, weight)
        for g in tqdm(Gn, desc='getting shortest paths', file=sys.stdout)
    ]

    # Pick the pairwise kernel matching the labelling of the dataset, together
    # with the trailing label arguments that variant expects.
    if ds_attrs['node_labeled']:
        if ds_attrs['edge_labeled']:
            kernel_func, label_args = _pathkernel_do_l, (node_label,
                                                         edge_label)
        else:
            kernel_func, label_args = _pathkernel_do_nl, (node_label, )
    else:
        if ds_attrs['edge_labeled']:
            kernel_func, label_args = _pathkernel_do_el, (edge_label, )
        else:
            kernel_func, label_args = _pathkernel_do_unl, ()

    # n * (n + 1) is always even, so the pair count is an exact integer.
    n_pairs = (n_graphs + 1) * n_graphs // 2
    with tqdm(total=n_pairs, desc='calculating kernels',
              file=sys.stdout) as pbar:
        for i in range(n_graphs):
            # Only the upper triangle is computed; the value is mirrored.
            for j in range(i, n_graphs):
                Kmatrix[i][j] = kernel_func(Gn[i], Gn[j], splist[i],
                                            splist[j], *label_args)
                Kmatrix[j][i] = Kmatrix[i][j]
                pbar.update(1)

    run_time = time.time() - start_time
    print(
        "\n --- mean average path kernel matrix of size %d built in %s seconds ---"
        % (n_graphs, run_time))

    return Kmatrix, run_time