def _compute_gm_imap_unordered(self):
    self._all_graphs_have_edges(self._graphs)
    # get shortest path graph of each graph.
    pool = Pool(self._n_jobs)
    get_sp_graphs_fun = self._wrapper_get_sp_graphs
    itr = zip(self._graphs, range(0, len(self._graphs)))
    if len(self._graphs) < 100 * self._n_jobs:
        chunksize = int(len(self._graphs) / self._n_jobs) + 1
    else:
        chunksize = 100
    iterator = get_iters(pool.imap_unordered(get_sp_graphs_fun, itr, chunksize),
                         desc='getting sp graphs', file=sys.stdout,
                         length=len(self._graphs), verbose=(self._verbose >= 2))
    for i, g in iterator:
        self._graphs[i] = g
    pool.close()
    pool.join()

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    def init_worker(gs_toshare):
        global G_gs
        G_gs = gs_toshare

    do_fun = self._wrapper_sp_do
    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

    return gram_matrix
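# Why the methods in this module hand data to `parallel_gm` through an
# `init_worker` plus a module-level global: the shared, read-only inputs (here
# the shortest path graphs) are sent to each worker process once via the Pool
# initializer, instead of being pickled into every (i, j) task. A minimal
# self-contained sketch of that pattern (illustration only; the real
# parallel_gm may differ; call the example under `if __name__ == '__main__':`
# on platforms that spawn worker processes):

from multiprocessing import Pool as _SketchPool

_SHARED = None  # set once per worker process by the initializer


def _init_pool_sketch(data):
    global _SHARED
    _SHARED = data


def _task_sketch(i):
    # workers read the shared list through the global instead of receiving a
    # copy with each task.
    return i, len(_SHARED)


def _example_init_worker_pattern():
    with _SketchPool(2, initializer=_init_pool_sketch,
                     initargs=(['g0', 'g1', 'g2'],)) as pool:
        return sorted(pool.imap_unordered(_task_sketch, range(3)))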
def _compute_gm_imap_unordered(self):
    self._check_edge_weight(self._graphs, self._verbose)
    self._check_graphs(self._graphs)
    if self._verbose >= 2:
        import warnings
        warnings.warn('All labels are ignored.')

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    if self._q is None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # that A_wave_list actually contains the transposes of the adjacency
        # matrices.
        iterator = get_iters(self._graphs, desc='compute adjacency matrices',
                             file=sys.stdout, verbose=(self._verbose >= 2))
        # pass the weight by keyword: the second positional argument of
        # nx.adjacency_matrix is `nodelist`, not `weight`.
        A_wave_list = [nx.adjacency_matrix(G, weight=self._edge_weight).todense().transpose()
                       for G in iterator]  # @todo: parallel?

        if self._p is None:  # p is uniform distribution as default.
            def init_worker(A_wave_list_toshare):
                global G_A_wave_list
                G_A_wave_list = A_wave_list_toshare

            do_fun = self._wrapper_kernel_do
            parallel_gm(do_fun, gram_matrix, self._graphs,
                        init_worker=init_worker, glbv=(A_wave_list,),
                        n_jobs=self._n_jobs, verbose=self._verbose)
        else:  # @todo
            pass
    else:  # @todo
        pass

    return gram_matrix
def _compute_gm_imap_unordered(self):
    self.__add_dummy_labels(self._graphs)

    if self.__remove_totters:
        pool = Pool(self._n_jobs)
        itr = range(0, len(self._graphs))
        if len(self._graphs) < 100 * self._n_jobs:
            chunksize = int(len(self._graphs) / self._n_jobs) + 1
        else:
            chunksize = 100
        remove_fun = self._wrapper_untotter
        if self._verbose >= 2:
            iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
                            desc='removing tottering', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(remove_fun, itr, chunksize)
        for i, g in iterator:
            self._graphs[i] = g
        pool.close()
        pool.join()

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_fun = self._wrapper_kernel_do
    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

    return gram_matrix
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs,
                          chunksize, verbose):
    """Compute kernel matrix using the base kernel.
    """
    if parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare

        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs,
                    chunksize=chunksize, verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
def __compute_gram_matrix(self, gram_matrix, all_num_of_each_label, Gn):
    """Compute Gram matrix using the base kernel.
    """
    if self._parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare

        do_partial = partial(self._wrapper_compute_subtree_kernel, gram_matrix)
        parallel_gm(do_partial, gram_matrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=self._n_jobs,
                    verbose=self._verbose)
    elif self._parallel is None:
        for i in range(len(gram_matrix)):
            for j in range(i, len(gram_matrix)):
                gram_matrix[i][j] = self.__compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    gram_matrix[i][j])
                gram_matrix[j][i] = gram_matrix[i][j]
def _compute_gm_imap_unordered(self):
    self._add_dummy_labels(self._graphs)

    # get all paths of all graphs before computing kernels to save time,
    # but this may cost a lot of memory for large datasets.
    pool = Pool(self._n_jobs)
    itr = zip(self._graphs, range(0, len(self._graphs)))
    if len(self._graphs) < 100 * self._n_jobs:
        chunksize = int(len(self._graphs) / self._n_jobs) + 1
    else:
        chunksize = 100
    all_paths = [[] for _ in range(len(self._graphs))]
    if self._compute_method == 'trie' and self._k_func is not None:
        get_ps_fun = self._wrapper_find_all_path_as_trie
    elif self._compute_method != 'trie' and self._k_func is not None:
        get_ps_fun = partial(self._wrapper_find_all_paths_until_length, True)
    else:
        get_ps_fun = partial(self._wrapper_find_all_paths_until_length, False)
    iterator = get_iters(pool.imap_unordered(get_ps_fun, itr, chunksize),
                         desc='getting paths', file=sys.stdout,
                         length=len(self._graphs), verbose=(self._verbose >= 2))
    for i, ps in iterator:
        all_paths[i] = ps
    pool.close()
    pool.join()

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    if self._compute_method == 'trie' and self._k_func is not None:
        def init_worker(trie_toshare):
            global G_trie
            G_trie = trie_toshare

        do_fun = self._wrapper_kernel_do_trie
    elif self._compute_method != 'trie' and self._k_func is not None:
        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        do_fun = self._wrapper_kernel_do_naive
    else:
        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        do_fun = self._wrapper_kernel_do_kernelless  # @todo: what is this?
    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(all_paths,), n_jobs=self._n_jobs, verbose=self._verbose)

    return gram_matrix
def _compute_gm_imap_unordered(self):
    # get shortest paths of each graph in the graphs.
    splist = [None] * len(self._graphs)
    pool = Pool(self._n_jobs)
    itr = zip(self._graphs, range(0, len(self._graphs)))
    if len(self._graphs) < 100 * self._n_jobs:
        chunksize = int(len(self._graphs) / self._n_jobs) + 1
    else:
        chunksize = 100
    # get shortest path graphs of self._graphs.
    if self.__compute_method == 'trie':
        get_sps_fun = self._wrapper_get_sps_trie
    else:
        get_sps_fun = self._wrapper_get_sps_naive
    if self._verbose >= 2:
        iterator = tqdm(pool.imap_unordered(get_sps_fun, itr, chunksize),
                        desc='getting shortest paths', file=sys.stdout)
    else:
        iterator = pool.imap_unordered(get_sps_fun, itr, chunksize)
    for i, sp in iterator:
        splist[i] = sp
    pool.close()
    pool.join()

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    def init_worker(spl_toshare, gs_toshare):
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    if self.__compute_method == 'trie':
        do_fun = self.__wrapper_ssp_do_trie
    else:
        do_fun = self._wrapper_ssp_do_naive
    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(splist, self._graphs), n_jobs=self._n_jobs,
                verbose=self._verbose)

    return gram_matrix
def _compute_gm_imap_unordered(self):
    self._check_edge_weight(self._graphs, self._verbose)
    self._check_graphs(self._graphs)
    if self._verbose >= 2:
        import warnings
        warnings.warn('All labels are ignored. Only works for undirected graphs.')

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    if self._q is None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        iterator = get_iters(self._graphs, desc='spectral decompose',
                             file=sys.stdout, verbose=(self._verbose >= 2))
        for G in iterator:
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # that A actually is the transpose of the adjacency matrix. The
            # weight is passed by keyword, as the second positional argument of
            # nx.adjacency_matrix is `nodelist`.
            A = nx.adjacency_matrix(G, weight=self._edge_weight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)  # @todo: parallel?

        if self._p is None:  # p is uniform distribution as default.
            q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G))
                        for G in self._graphs]  # @todo: parallel?

            def init_worker(q_T_list_toshare, P_list_toshare, D_list_toshare):
                global G_q_T_list, G_P_list, G_D_list
                G_q_T_list = q_T_list_toshare
                G_P_list = P_list_toshare
                G_D_list = D_list_toshare

            do_fun = self._wrapper_kernel_do
            parallel_gm(do_fun, gram_matrix, self._graphs,
                        init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list),
                        n_jobs=self._n_jobs, verbose=self._verbose)
        else:  # @todo
            pass
    else:  # @todo
        pass

    return gram_matrix
def _compute_gm_imap_unordered(self):
    self.__add_dummy_labels(self._graphs)

    # get all canonical keys of all graphs before calculating kernels to save
    # time, but this may cost a lot of memory for large dataset.
    pool = Pool(self._n_jobs)
    itr = zip(self._graphs, range(0, len(self._graphs)))
    if len(self._graphs) < 100 * self._n_jobs:
        chunksize = int(len(self._graphs) / self._n_jobs) + 1
    else:
        chunksize = 100
    canonkeys = [[] for _ in range(len(self._graphs))]
    get_fun = self._wrapper_get_canonkeys
    if self._verbose >= 2:
        iterator = tqdm(pool.imap_unordered(get_fun, itr, chunksize),
                        desc='getting canonkeys', file=sys.stdout)
    else:
        iterator = pool.imap_unordered(get_fun, itr, chunksize)
    for i, ck in iterator:
        canonkeys[i] = ck
    pool.close()
    pool.join()

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    def init_worker(canonkeys_toshare):
        global G_canonkeys
        G_canonkeys = canonkeys_toshare

    do_fun = self._wrapper_kernel_do
    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                glbv=(canonkeys,), n_jobs=self._n_jobs, verbose=self._verbose)

    return gram_matrix
def _compute_gm_imap_unordered(self):
    self._check_edge_weight(self._graphs, self._verbose)
    self._check_graphs(self._graphs)

    # Compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    # @todo: parallel this.
    # Reindex nodes using consecutive integers for the convenience of kernel
    # computation.
    iterator = get_iters(self._graphs, desc='Reindex vertices', file=sys.stdout,
                         verbose=(self._verbose >= 2))
    self._graphs = [nx.convert_node_labels_to_integers(
        g, first_label=0, label_attribute='label_orignal') for g in iterator]

    if self._p is None and self._q is None:  # p and q are uniform distributions as default.
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_fun = self._wrapper_kernel_do
        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                    glbv=(self._graphs,), n_jobs=self._n_jobs,
                    verbose=self._verbose)
    else:  # @todo
        pass

    return gram_matrix
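# Illustration only (not called anywhere): what the reindexing step above does.
# nx.convert_node_labels_to_integers keeps each original node id under the
# given label_attribute ('label_orignal', as spelled in this module).

def _example_reindex_vertices():
    import networkx as nx
    g = nx.Graph()
    g.add_edge('a', 'b')
    g2 = nx.convert_node_labels_to_integers(g, first_label=0,
                                            label_attribute='label_orignal')
    assert set(g2.nodes) == {0, 1}
    assert g2.nodes[0]['label_orignal'] in ('a', 'b')
    return g2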
def _compute_gm_imap_unordered(self):
    self._add_dummy_node_labels(self._graphs)

    if self._base_kernel == 'subtree':
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        # for i in range(len(self._graphs)):
        #     for j in range(i, len(self._graphs)):
        #         gram_matrix[i][j] = self.pairwise_kernel(self._graphs[i], self._graphs[j])
        #         gram_matrix[j][i] = gram_matrix[i][j]

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_fun = self._wrapper_pairwise
        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                    glbv=(self._graphs,), n_jobs=self._n_jobs,
                    verbose=self._verbose)
        return gram_matrix
    else:
        if self._verbose >= 2:
            import warnings
            warnings.warn('This base kernel is not parallelized. The serial computation is used instead.')
        return self._compute_gm_series()
def _compute_gm_imap_unordered(self):
    self._check_graphs(self._graphs)
    self._add_dummy_labels(self._graphs)

    if not self._ds_infos['directed']:  # convert
        self._graphs = [G.to_directed() for G in self._graphs]

    # compute Gram matrix.
    gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

    # def init_worker(gn_toshare):
    #     global G_gn
    #     G_gn = gn_toshare

    # direct product graph method - exponential
    if self._compute_method == 'exp':
        do_fun = self._wrapper_kernel_do_exp
    # direct product graph method - geometric
    elif self._compute_method == 'geo':
        do_fun = self._wrapper_kernel_do_geo

    parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=_init_worker_gm,
                glbv=(self._graphs,), n_jobs=self._n_jobs,
                verbose=self._verbose)

    return gram_matrix
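# `_init_worker_gm` is referenced above but defined elsewhere at module level
# (a module-level initializer can be pickled on spawn-based platforms, unlike
# a closure). Judging from the commented-out local version it replaced, it
# presumably looks like this sketch:
#
# def _init_worker_gm(gn_toshare):
#     global G_gn
#     G_gn = gn_toshare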
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  chunksize=None,
                  verbose=True):
    """Compute treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.

    sub_kernel : function
        The sub-kernel between 2 real number vectors. Each vector counts the
        numbers of isomorphic treelets in a graph.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    parallel : string/None
        Which parallelization method is applied to compute the kernel. The
        following choices are available:

        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.

        None: no parallelization is applied.

    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when one of the
        parallelization methods is applied.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between
        2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before computing kernels to
        # save time, but this may cost a lot of memory for large dataset.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before computing kernels to
        # save time, but this may cost a lot of memory for large dataset.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(get_canonkeys(g, node_label, edge_label, labeled,
                                           ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='computing kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][j]  # @todo: no directed graph considered?
    else:
        raise Exception('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print("\n --- treelet kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
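# Usage sketch (illustration only; run under `if __name__ == '__main__':` when
# parallel='imap_unordered', since worker processes re-import the module). The
# Gaussian sub-kernel below is a hypothetical stand-in for whatever sub-kernel
# the caller prefers; it compares two treelet-count vectors.

def _example_treeletkernel_usage():
    import networkx as nx
    import numpy as np

    def gaussian_sub_kernel(x, y, gamma=1.0):
        x, y = np.asarray(x, float), np.asarray(y, float)
        return np.exp(-gamma * np.sum((x - y) ** 2))

    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time = treeletkernel(Gn, sub_kernel=gaussian_sub_kernel,
                                      parallel=None, verbose=False)
    return Kmatrix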
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or integer. '
                          'Set all weights to 1.\n' % edge_weight)
        except:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the edge '
                      'attributes. Set all weights to 1.\n' % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n'
                  % (len_gn - len(Gn)))

    start_time = time.time()

    if parallel == 'imap_unordered':
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            # # use default chunksize as pool.map when iterable is less than 100
            # chunksize, extra = divmod(len(Gn), n_jobs * 4)
            # if extra:
            #     chunksize += 1
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()
    elif parallel is None:
        pass

    # # ---- direct running, normally use single CPU core. ----
    # for i in tqdm(range(len(Gn)), desc='getting sp graphs', file=sys.stdout):
    #     i, Gn[i] = wrapper_getSPGraph(weight, (Gn[i], i))

    # # ---- use pool.map to parallel ----
    # result_sp = pool.map(getsp_partial, range(0, len(Gn)))
    # for i in result_sp:
    #     Gn[i[0]] = i[1]
    # or
    # getsp_partial = partial(wrap_getSPGraph, Gn, weight)
    # for i, g in tqdm(
    #         pool.map(getsp_partial, range(0, len(Gn))),
    #         desc='getting sp graphs',
    #         file=sys.stdout):
    #     Gn[i] = g

    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    # # ---- use pool.map to parallel. ----
    # # result_perf = pool.map(do_partial, itr)
    # do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use joblib.Parallel to parallel and track progress. ----
    # result_perf = Parallel(
    #     n_jobs=n_jobs, verbose=10)(
    #         delayed(do_partial)(ij)
    #         for ij in combinations_with_replacement(range(0, len(Gn)), 2))
    # result_perf = [
    #     do_partial(ij)
    #     for ij in combinations_with_replacement(range(0, len(Gn)), 2)
    # ]
    # for i in result_perf:
    #     Kmatrix[i[0]][i[1]] = i[2]
    #     Kmatrix[i[1]][i[0]] = i[2]

    # # ---- direct running, normally use single CPU core. ----
    # from itertools import combinations_with_replacement
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
    #     kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time, idx
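# Usage sketch (illustration only; run under `if __name__ == '__main__':`,
# since the Gram-matrix step always goes through multiprocessing here). The
# toy kernels follow the dict layout documented above and are hypothetical
# stand-ins for real node kernels.

def _example_spkernel_usage():
    import networkx as nx
    import numpy as np

    def delta(x, y):  # symbolic labels
        return 1.0 if x == y else 0.0

    def gauss(x, y):  # non-symbolic (vector) labels
        x, y = np.asarray(x, float), np.asarray(y, float)
        return np.exp(-0.5 * np.sum((x - y) ** 2))

    node_kernels = {'symb': delta, 'nsymb': gauss,
                    'mix': lambda sx, sy, ax, ay: delta(sx, sy) * gauss(ax, ay)}
    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time, idx = spkernel(Gn, node_kernels=node_kernels,
                                      n_jobs=2, verbose=False)
    return Kmatrix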
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Compute marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.

    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.

    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.

    p_quit : float
        The termination probability in the random walks generating step.

    n_iteration : integer
        Number of iterations to compute R_inf.

    remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(
                pool.imap_unordered(untotter_partial, range(0, len(Gn)),
                                    chunksize),
                desc='removing tottering',
                file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

        # # ---- direct running, normally use single CPU core. ----
        # Gn = [
        #     untotterTransformation(G, node_label, edge_label)
        #     for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
        # ]

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)

    # # ---- direct running, normally use single CPU core. ----
    # # pbar = tqdm(
    # #     total=(1 + len(Gn)) * len(Gn) / 2,
    # #     desc='Computing kernels',
    # #     file=sys.stdout)
    # for i in range(0, len(Gn)):
    #     for j in range(i, len(Gn)):
    #         # print(i, j)
    #         Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
    #                                                edge_label, p_quit,
    #                                                n_iteration)
    #         Kmatrix[j][i] = Kmatrix[i][j]
    #         # pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
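# Usage sketch (illustration only; run under `if __name__ == '__main__':`,
# since the Gram-matrix step always goes through multiprocessing here).

def _example_marginalizedkernel_usage():
    import networkx as nx
    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time = marginalizedkernel(Gn, p_quit=0.3, n_iteration=10,
                                           remove_totters=False, n_jobs=2,
                                           verbose=False)
    return Kmatrix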
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     parallel='imap_unordered',
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute path graph kernels up to depth/height h between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    depth : integer
        Depth of the search. Longest length of paths.

    k_func : string/None
        A kernel function applied using different notions of fingerprint
        similarity, defining the type of feature map and normalization method
        applied for the graph kernel. The following choices are available:

        'MinMax': use the MinMax kernel and counting feature map.

        'tanimoto': use the Tanimoto kernel and binary feature map.

        None: no sub-kernel is used, the kernel is computed directly.

    compute_method : string
        Computation method to store paths and compute the graph kernel. The
        following choices are available:

        'trie': store paths as tries.

        'naive': store paths to lists.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h
        between 2 graphs.
    """
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # get all paths of all graphs before computing kernels to save time,
        # but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                    ds_attrs, node_label, edge_label, False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()

        # for g in Gn:
        #     if compute_method == 'trie' and k_func is not None:
        #         find_all_path_as_trie(g, depth, ds_attrs, node_label, edge_label)
        #     elif compute_method != 'trie' and k_func is not None:
        #         find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label)
        #     else:
        #         find_all_paths_until_length(g, depth, ds_attrs, node_label, edge_label, False)

        # # size = sys.getsizeof(all_paths)
        # # for item in all_paths:
        # #     size += sys.getsizeof(item)
        # #     for pppps in item:
        # #         size += sys.getsizeof(pppps)
        # # print(size)

        # # ttt = time.time()
        # # # ---- use pool.map to parallel ----
        # # for i, ps in tqdm(
        # #         pool.map(getps_partial, range(0, len(Gn))),
        # #         desc='getting paths', file=sys.stdout):
        # #     all_paths[i] = ps
        # # print(time.time() - ttt)

        if compute_method == 'trie' and k_func is not None:
            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare

            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        else:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare

            # @todo: `edge_kernels` is not defined in this function's
            # signature; this branch cannot run as written.
            do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                                 edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)

    elif parallel is None:
        # from pympler import asizeof
        # ---- direct running, normally use single CPU core. ----
        # print(asizeof.asized(all_paths, detail=1).format())

        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i], depth, ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            # sizeof_allpaths = asizeof.asizeof(all_paths)
            # print(sizeof_allpaths)
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels', file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(Gn[i], depth, ds_attrs,
                                            node_label=node_label,
                                            edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            # sizeof_allpaths = asizeof.asizeof(all_paths)
            # print(sizeof_allpaths)
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels', file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
              % (depth, len(Gn), run_time))

    # print(Kmatrix[0][0:10])
    return Kmatrix, run_time
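# Usage sketch (illustration only). With parallel=None the whole computation
# stays in one process, so no main-guard is needed.

def _example_untilhpathkernel_usage():
    import networkx as nx
    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time = untilhpathkernel(Gn, depth=3, k_func='MinMax',
                                         compute_method='naive',
                                         parallel=None, verbose=False)
    return Kmatrix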
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs in Gn using the Sylvester
    equation method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    lmda : float
        Weight coefficient (lambda) of the walks.

    p, q : None
        Start and stop probability distributions; uniform distributions are
        used when None.

    eweight : string
        Edge attribute used as the edge weight.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q is None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # that A_wave_list actually contains the transposes of the adjacency
        # matrices. The weight is passed by keyword, as the second positional
        # argument of nx.adjacency_matrix is `nodelist`.
        A_wave_list = [
            nx.adjacency_matrix(G, weight=eweight).todense().transpose()
            for G in (tqdm(Gn, desc='compute adjacency matrices',
                           file=sys.stdout) if verbose else Gn)
        ]
        # # normalized adjacency matrices
        # A_wave_list = []
        # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
        #     A_tilde = nx.adjacency_matrix(G, weight=eweight).todense().transpose()
        #     norm = A_tilde.sum(axis=0)
        #     norm[norm == 0] = 1
        #     A_wave_list.append(A_tilde / norm)

        if p is None:  # p is uniform distribution as default.
            def init_worker(Awl_toshare):
                global G_Awl
                G_Awl = Awl_toshare

            do_partial = partial(wrapper_se_do, lmda)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose)

            # pbar = tqdm(
            #     total=(1 + len(Gn)) * len(Gn) / 2,
            #     desc='calculating kernels',
            #     file=sys.stdout)
            # for i in range(0, len(Gn)):
            #     for j in range(i, len(Gn)):
            #         S = lmda * A_wave_list[j]
            #         T_t = A_wave_list[i]
            #         # use uniform distribution if there is no prior knowledge.
            #         nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
            #         p_times_uni = 1 / nb_pd
            #         M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
            #         X = dlyap(S, T_t, M0)
            #         X = np.reshape(X, (-1, 1), order='F')
            #         # use uniform distribution if there is no prior knowledge.
            #         q_times = np.full((1, nb_pd), p_times_uni)
            #         Kmatrix[i][j] = np.dot(q_times, X)
            #         Kmatrix[j][i] = Kmatrix[i][j]
            #         pbar.update(1)

    return Kmatrix
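# Illustration only: the dlyap call in the commented reference code above
# solves a discrete Sylvester equation of the form X = S X T_t + M0 (up to
# transposition conventions of the dlyap implementation). The same small
# instance can be solved in plain numpy with the vec trick,
# vec(S X T_t) = (T_t^T kron S) vec(X) with column-major vec:

def _example_sylvester_step():
    import numpy as np
    rng = np.random.default_rng(0)
    n, m = 3, 4
    S = 0.1 * rng.random((n, n))      # plays the role of lmda * A_wave_j
    T_t = 0.1 * rng.random((m, m))    # plays the role of A_wave_i
    M0 = rng.random((n, m))
    x = np.linalg.solve(np.eye(n * m) - np.kron(T_t.T, S),
                        M0.flatten(order='F'))
    X = x.reshape((n, m), order='F')
    assert np.allclose(S @ X @ T_t + M0, X)
    return X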
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                        node_label, edge_label, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs in Gn using the conjugate
    gradient method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    lmda : float
        Weight coefficient (lambda) of the walks.

    p, q : None
        Start and stop probability distributions; uniform distributions are
        used when None.

    node_label : string
        Node attribute used as label.

    edge_label : string
        Edge attribute used as label.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #         not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
    #     # this is faster from unlabeled graphs. @todo: why?
    #     if q is None:
    #         # don't normalize adjacency matrices if q is a uniform vector. Note
    #         # that A_wave_list actually contains the transposes of the adjacency
    #         # matrices.
    #         A_wave_list = [
    #             nx.adjacency_matrix(G, weight=eweight).todense().transpose() for G in
    #             tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #         ]
    #         if p is None:  # p is uniform distribution as default.
    #             def init_worker(Awl_toshare):
    #                 global G_Awl
    #                 G_Awl = Awl_toshare
    #             do_partial = partial(wrapper_cg_unlabled_do, lmda)
    #             parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
    #                         glbv=(A_wave_list,), n_jobs=n_jobs)
    # else:

    # reindex nodes using consecutive integers for the convenience of kernel
    # calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g, first_label=0,
                                           label_attribute='label_orignal')
        for g in (tqdm(Gn, desc='reindex vertices', file=sys.stdout)
                  if verbose else Gn)
    ]

    if p is None and q is None:  # p and q are uniform distributions as default.
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

        # pbar = tqdm(
        #     total=(1 + len(Gn)) * len(Gn) / 2,
        #     desc='calculating kernels',
        #     file=sys.stdout)
        # for i in range(0, len(Gn)):
        #     for j in range(i, len(Gn)):
        #         result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
        #                                node_label, edge_kernels, edge_label, lmda)
        #         Kmatrix[i][j] = result
        #         Kmatrix[j][i] = Kmatrix[i][j]
        #         pbar.update(1)

    return Kmatrix
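# Illustration only: per pair of graphs, the conjugate-gradient variant solves
# a linear system of the form (I - lmda * W) x = p on the (node/edge-kernel
# weighted) direct product graph. Sketch with a random symmetric W standing in
# for that product-graph weight matrix:

def _example_conjugate_gradient_step():
    import numpy as np
    from scipy.sparse.linalg import cg
    rng = np.random.default_rng(0)
    n, lmda = 6, 0.1
    W = rng.random((n, n))
    W = (W + W.T) / 2                # symmetric stand-in
    p = np.full(n, 1. / n)           # uniform start distribution
    A = np.eye(n) - lmda * W         # positive definite for small lmda
    x, info = cg(A, p)
    assert info == 0
    assert np.allclose(A @ x, p, atol=1e-5)
    return x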
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                 node_label, edge_label, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs in Gn using the fixed-point
    method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    lmda : float
        Weight coefficient (lambda) of the walks.

    p, q : None
        Start and stop probability distributions; uniform distributions are
        used when None.

    node_label : string
        Node attribute used as label.

    edge_label : string
        Edge attribute used as label.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #         not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
    #     # this is faster from unlabeled graphs. @todo: why?
    #     if q is None:
    #         # don't normalize adjacency matrices if q is a uniform vector. Note
    #         # that A_wave_list actually contains the transposes of the adjacency
    #         # matrices.
    #         A_wave_list = [
    #             nx.adjacency_matrix(G, weight=eweight).todense().transpose() for G in
    #             tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #         ]
    #         if p is None:  # p is uniform distribution as default.
    #             pbar = tqdm(
    #                 total=(1 + len(Gn)) * len(Gn) / 2,
    #                 desc='calculating kernels',
    #                 file=sys.stdout)
    #             for i in range(0, len(Gn)):
    #                 for j in range(i, len(Gn)):
    #                     # use uniform distribution if there is no prior knowledge.
    #                     nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
    #                     p_times_uni = 1 / nb_pd
    #                     w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
    #                     p_times = np.full((nb_pd, 1), p_times_uni)
    #                     x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
    #                     # use uniform distribution if there is no prior knowledge.
    #                     q_times = np.full((1, nb_pd), p_times_uni)
    #                     Kmatrix[i][j] = np.dot(q_times, x)
    #                     Kmatrix[j][i] = Kmatrix[i][j]
    #                     pbar.update(1)
    # else:

    # reindex nodes using consecutive integers for the convenience of kernel
    # calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g, first_label=0,
                                           label_attribute='label_orignal')
        for g in (tqdm(Gn, desc='reindex vertices', file=sys.stdout)
                  if verbose else Gn)
    ]

    if p is None and q is None:  # p and q are uniform distributions as default.
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    return Kmatrix
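# Illustration only: the scipy.optimize.fixed_point call in the commented
# reference code above iterates a map to its fixed point; for this kernel the
# map is presumably x -> p + lmda * W x (the same linear system the
# conjugate-gradient variant solves). A bare-hands version of that iteration:

def _example_fixed_point_step():
    import numpy as np
    rng = np.random.default_rng(0)
    n, lmda = 6, 0.1
    W = rng.random((n, n))
    p = np.full(n, 1. / n)
    x = p.copy()
    for _ in range(1000):            # converges since lmda * ||W|| < 1
        x_new = p + lmda * W @ x
        if np.linalg.norm(x_new - x) < 1e-12:
            break
        x = x_new
    assert np.allclose(x_new, p + lmda * W @ x_new)
    return x_new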
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs,
                            verbose=True):
    """Calculate walk graph kernels between unlabeled graphs in Gn using the
    spectral decomposition method. Labels are ignored.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    weight : float
        Weight coefficient of the walks.

    p, q : None
        Start and stop probability distributions; uniform distributions are
        used when None.

    sub_kernel :
        The sub-kernel applied to pairs of eigenvalues.

    eweight : string
        Edge attribute used as the edge weight.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q is None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout)
                  if verbose else Gn):
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # that A actually is the transpose of the adjacency matrix. The
            # weight is passed by keyword, as the second positional argument of
            # nx.adjacency_matrix is `nodelist`.
            A = nx.adjacency_matrix(G, weight=eweight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)
        # P_inv_list = [p.T for p in P_list]  # @todo: also works for directed graphs?

        if p is None:  # p is uniform distribution as default.
            q_T_list = [
                np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G))
                for G in Gn
            ]
            # q_T_list = [q.T for q in q_list]

            def init_worker(q_T_toshare, P_toshare, D_toshare):
                global G_q_T, G_P, G_D
                G_q_T = q_T_toshare
                G_P = P_toshare
                G_D = D_toshare

            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
                        verbose=verbose)

            # pbar = tqdm(
            #     total=(1 + len(Gn)) * len(Gn) / 2,
            #     desc='calculating kernels',
            #     file=sys.stdout)
            # for i in range(0, len(Gn)):
            #     for j in range(i, len(Gn)):
            #         result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
            #                         D_list[i], D_list[j], weight, sub_kernel)
            #         Kmatrix[i][j] = result
            #         Kmatrix[j][i] = Kmatrix[i][j]
            #         pbar.update(1)

    return Kmatrix
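# Illustration only: the point of the precomputation above. Each adjacency
# matrix is eigendecomposed once (A = P diag(ew) P^{-1}) and the pair (P, ew)
# is then reused for every pair of graphs in the Gram matrix, instead of being
# recomputed per pair.

def _example_spectral_precompute():
    import numpy as np
    A = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])     # adjacency matrix of the path graph P3
    ew, ev = np.linalg.eig(A)
    assert np.allclose(ev @ np.diag(ew) @ np.linalg.inv(ev), A)
    return ew, ev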
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     # n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.

    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.

    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.

    weight : float
        Weight coefficient of different lengths of walks, which represents
        beta in the 'exp' method and gamma in the 'geo' method.

    compute_method : string
        Method used to compute the walk kernel. The following choices are
        available:

        'exp': method based on exponential series applied on the direct
        product graph, as shown in reference [1]. The time complexity is
        O(n^6) for graphs with n vertices.

        'geo': method based on geometric series applied on the direct product
        graph, as shown in reference [1]. The time complexity is O(n^6) for
        graphs with n vertices.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between
        2 graphs.
    """
    # n : integer
    #     Longest length of walks. Only useful when applying the 'brute' method.
    #     'brute': brute force, simply search for all walks and compare them.
    compute_method = compute_method.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they have only 1 node.\n'
                  % (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    # direct product graph method - exponential
    if compute_method == 'exp':
        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    # direct product graph method - geometric
    elif compute_method == 'geo':
        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)

    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize,
                verbose=verbose)

    # pool = Pool(n_jobs)
    # itr = zip(combinations_with_replacement(Gn, 2),
    #           combinations_with_replacement(range(0, len(Gn)), 2))
    # len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    # if len_itr < 1000 * n_jobs:
    #     chunksize = int(len_itr / n_jobs) + 1
    # else:
    #     chunksize = 1000
    #
    # # direct product graph method - exponential
    # if compute_method == 'exp':
    #     do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    # # direct product graph method - geometric
    # elif compute_method == 'geo':
    #     do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    #
    # for i, j, kernel in tqdm(
    #         pool.imap_unordered(do_partial, itr, chunksize),
    #         desc='computing kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- direct running, normally use single CPU core. ----
    # # direct product graph method - exponential
    # itr = combinations_with_replacement(range(0, len(Gn)), 2)
    # if compute_method == 'exp':
    #     for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
    #         Kmatrix[i][j] = _commonwalkkernel_exp(Gn[i], Gn[j], node_label,
    #                                               edge_label, weight)
    #         Kmatrix[j][i] = Kmatrix[i][j]
    #
    # # direct product graph method - geometric
    # elif compute_method == 'geo':
    #     for i, j in tqdm(itr, desc='Computing kernels', file=sys.stdout):
    #         Kmatrix[i][j] = _commonwalkkernel_geo(Gn[i], Gn[j], node_label,
    #                                               edge_label, weight)
    #         Kmatrix[j][i] = Kmatrix[i][j]

    # # search all paths use brute force.
    # elif compute_method == 'brute':
    #     n = int(n)
    #     # get all paths of all graphs before computing kernels to save time,
    #     # but this may cost a lot of memory for large dataset.
    #     all_walks = [
    #         find_all_walks_until_length(Gn[i], n, node_label, edge_label)
    #         for i in range(0, len(Gn))
    #     ]
    #
    #     for i in range(0, len(Gn)):
    #         for j in range(i, len(Gn)):
    #             Kmatrix[i][j] = _commonwalkkernel_brute(
    #                 all_walks[i],
    #                 all_walks[j],
    #                 node_label=node_label,
    #                 edge_label=edge_label)
    #             Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time, idx
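# Usage sketch (illustration only; run under `if __name__ == '__main__':`,
# since the Gram-matrix step always goes through multiprocessing here). Note
# the third return value: indices of the graphs kept after single-node graphs
# are removed.

def _example_commonwalkkernel_usage():
    import networkx as nx
    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time, idx = commonwalkkernel(Gn, weight=0.01,
                                              compute_method='geo',
                                              n_jobs=2, verbose=False)
    return Kmatrix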
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       parallel='imap_unordered',
                       # parallel=None,
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.

    node_label : string
        Node attribute used as label. The default node label is atom.

    edge_weight : string
        Edge attribute name corresponding to the edge weight. Applied for the
        computation of the shortest paths.

    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.

    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two nodes. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.

    edge_kernels : dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each of the two edges. Each label is in the
        form of a 2-D array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.

    compute_method : string
        Computation method to store the shortest paths and compute the graph
        kernel. The following choices are available:

        'trie': store paths as tries.

        'naive': store paths to lists.

    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or integer. '
                          'Set all weights to 1.\n' % edge_weight)
        except:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the edge '
                      'attributes. Set all weights to 1.\n' % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    if parallel == 'imap_unordered':
        splist = [None] * len(Gn)
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # get shortest path graphs of Gn
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight,
                                    ds_attrs['is_directed'])
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight,
                                    ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
            # time.sleep(10)
        pool.close()
        pool.join()

    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        splist = []
        if verbose:
            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = Gn
        if compute_method == 'trie':
            for g in iterator:
                splist.append(get_sps_as_trie(g, weight, ds_attrs['is_directed']))
        else:
            for g in iterator:
                splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))

    # ss = 0
    # ss += sys.getsizeof(splist)
    # for spss in splist:
    #     ss += sys.getsizeof(spss)
    #     for spp in spss:
    #         ss += sys.getsizeof(spp)
    # time.sleep(20)

    # # ---- only for the Fast Computation of Shortest Path Kernel (FCSP)
    # sp_ml = [0] * len(Gn)  # shortest path matrices
    # for i in result_sp:
    #     sp_ml[i[0]] = i[1]
    # edge_x_g = [[] for i in range(len(sp_ml))]
    # edge_y_g = [[] for i in range(len(sp_ml))]
    # edge_w_g = [[] for i in range(len(sp_ml))]
    # for idx, item in enumerate(sp_ml):
    #     for i1 in range(len(item)):
    #         for i2 in range(i1 + 1, len(item)):
    #             if item[i1, i2] != np.inf:
    #                 edge_x_g[idx].append(i1)
    #                 edge_y_g[idx].append(i2)
    #                 edge_w_g[idx].append(item[i1, i2])
    # print(len(edge_x_g[0]))
    # print(len(edge_y_g[0]))
    # print(len(edge_w_g[0]))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare

        if compute_method == 'trie':
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
        else:
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)

    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        if verbose:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        if compute_method == 'trie':
            for i, j in iterator:
                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
                                     ds_attrs, node_label, edge_label,
                                     node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
        else:
            for i, j in iterator:
                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i],
                                               splist[j], ds_attrs, node_label,
                                               edge_label, node_kernels,
                                               edge_kernels)
                # if kernel > 1:
                #     print("error here ")
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel

    # # ---- use pool.map to parallel. ----
    # pool = Pool(n_jobs)
    # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
    #                      node_kernels, edge_kernels)
    # itr = zip(combinations_with_replacement(Gn, 2),
    #           combinations_with_replacement(splist, 2),
    #           combinations_with_replacement(range(0, len(Gn)), 2))
    # for i, j, kernel in tqdm(
    #         pool.map(do_partial, itr), desc='calculating kernels',
    #         file=sys.stdout):
    #     Kmatrix[i][j] = kernel
    #     Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    # # ---- use pool.imap_unordered to parallel and track progress. ----
    # do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
    #                      node_kernels, edge_kernels)
    # itr = zip(combinations_with_replacement(Gn, 2),
    #           combinations_with_replacement(splist, 2),
    #           combinations_with_replacement(range(0, len(Gn)), 2))
    # len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
    # if len_itr < 1000 * n_jobs:
    #     chunksize = int(len_itr / n_jobs) + 1
    # else:
    #     chunksize = 1000
    # from contextlib import closing
    # with closing(Pool(n_jobs)) as pool:
    #     for i, j, kernel in tqdm(
    #             pool.imap_unordered(do_partial, itr, 1000),
    #             desc='calculating kernels',
    #             file=sys.stdout):
    #         Kmatrix[i][j] = kernel
    #         Kmatrix[j][i] = kernel
    # pool.close()
    # pool.join()

    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
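# Usage sketch (illustration only). With parallel=None everything runs in one
# process; for unlabeled graphs the node/edge kernel dicts are ignored and may
# stay None.

def _example_structuralspkernel_usage():
    import networkx as nx
    Gn = [nx.path_graph(4), nx.cycle_graph(5)]
    Kmatrix, run_time = structuralspkernel(Gn, compute_method='naive',
                                           parallel=None, verbose=False)
    return Kmatrix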