def parse_input(self, X):
    """Parse input and create features, while initializing and/or calculating sub-kernels.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    base_graph_kernel : object
        Returns base_graph_kernel. Only if called from `fit` or `fit_transform`.

    K : np.array
        Returns the kernel matrix. Only if called from `transform` or
        `fit_transform`.

    """
    # Input validation and parsing
    # NOTE(review): collections.Iterable was removed in Python 3.10+;
    # collections.abc.Iterable is the forward-compatible spelling — confirm
    # against the file's import block before changing.
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        nx, max_core_number, core_numbers, graphs = 0, 0, [], []
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            extra = tuple()
            if isinstance(x, collections.Iterable):
                x, is_iter = list(x), True
            if is_iter:
                # The original guarded with `len(x) >= 0`, which is always
                # true for a list; `is_iter` alone is equivalent.
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                elif len(x) == 1:
                    x = Graph(x[0], {}, {}, graph_format="adjacency")
                elif len(x) == 2:
                    x = Graph(x[0], x[1], {}, graph_format="adjacency")
                elif len(x) >= 3:
                    # Anything past the first three entries is carried along
                    # untouched and forwarded to the base kernel.
                    if len(x) > 3:
                        extra += tuple(x[3:])
                    x = Graph(x[0], x[1], x[2], graph_format="adjacency")
            elif type(x) is Graph:
                x.desired_format("adjacency")
                x = Graph(
                    x.get_adjacency_matrix(),
                    x.get_labels(purpose="adjacency",
                                 label_type="vertex", return_none=True),
                    x.get_labels(purpose="adjacency",
                                 label_type="edge", return_none=True))
            else:
                raise TypeError('each element of X must be either a '
                                'graph object or a list with at least '
                                'a graph like object and node labels '
                                'dict \n')

            # workaround for leaving a sparse representation for x
            x.change_format(self._graph_format)
            c = core_number(x)
            max_core_number = max(max_core_number, max(c.values()))
            core_numbers.append(c)
            graphs.append((x, extra))

            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

    if max_core_number <= self.min_core:
        raise ValueError(
            'The maximum core equals the min_core boundary set in init.')

    # Add the zero iteration element
    if self._method_calling == 2:
        K = np.zeros(shape=(nx, nx))
    elif self._method_calling == 3:
        self._dummy_kernel = dict()
        K = np.zeros(shape=(nx, self._nx))

    # Main: peel cores from the deepest down to (but excluding) min_core,
    # fitting/evaluating one base kernel per core level.
    base_graph_kernel, indexes_list = dict(), dict()
    for i in range(max_core_number, self.min_core, -1):
        subgraphs, indexes = list(), list()
        for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers, graphs)):
            vertices = [k for k, v in iteritems(cn) if v >= i]
            if len(vertices) > 0:
                # Calculate subgraph and store the index of the non-empty vertices
                sg = g.get_subgraph(vertices)
                sub_extra = list()
                indexes.append(idx)
                if len(extra) > 0:
                    vs = np.array(sg.get_vertices(purpose='any'))
                    for e in extra:
                        # This case will only be reached by now if the user
                        # adds the propagation kernel as subkernel with a
                        # custom propagation matrix. This is a workaround!
                        # BUG FIX: was `type(e) is np.array`, which is always
                        # False (np.array is a factory function, not a type),
                        # so the matrix was never restricted to the subgraph.
                        if isinstance(e, np.ndarray) and len(e.shape) == 2:
                            e = e[vs, :][:, vs]
                        sub_extra.append(e)
                    subgraphs.append((sg, ) + tuple(sub_extra))
                else:
                    subgraphs.append(sg)
        indexes = np.array(indexes)
        indexes_list[i] = indexes

        # calculate kernel
        if self._method_calling == 1 and indexes.shape[0] > 0:
            base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
            base_graph_kernel[i].fit(subgraphs)
        elif self._method_calling == 2 and indexes.shape[0] > 0:
            base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
            ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
            # Scatter-add the sub-kernel into the rows/cols of the graphs
            # that actually had a non-empty i-core.
            for j in range(indexes.shape[0]):
                K[indexes[j], indexes] += ft_subgraph_mat[j, :]
        elif self._method_calling == 3:
            if self._max_core_number < i or self._fit_indexes[i].shape[0] == 0:
                if len(indexes) > 0:
                    # add a dummy kernel for calculating the diagonal
                    self._dummy_kernel[i] = self.base_graph_kernel_(
                        **self.params_)
                    self._dummy_kernel[i].fit(subgraphs)
            else:
                if indexes.shape[0] > 0:
                    subgraph_tmat = self.X[i].transform(subgraphs)
                    for j in range(indexes.shape[0]):
                        K[indexes[j], self._fit_indexes[i]] += \
                            subgraph_tmat[j, :]

    if self._method_calling == 1:
        self._nx = nx
        self._max_core_number = max_core_number
        self._fit_indexes = indexes_list
        return base_graph_kernel
    elif self._method_calling == 2:
        self._nx = nx
        self._max_core_number = max_core_number
        self._fit_indexes = indexes_list
        return K, base_graph_kernel
    elif self._method_calling == 3:
        self._t_nx = nx
        self._max_core_number_trans = max_core_number
        self._transform_indexes = indexes_list
        return K
def parse_input(self, X):
    """Parse and create features for graphlet_sampling kernel.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    out : list
        The extracted adjacency matrices for any given input.

    """
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        i = 0
        proc = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            # BUG FIX: the original tested `len(x) in [1, 2, 3]`, which made
            # the empty-element warning below unreachable and sent 1-element
            # inputs to `Graph(x[0], x[1], ...)` → IndexError.  With
            # `[0, 2, 3]`, empty elements are skipped with a warning and
            # 1-element inputs fall through to the documented TypeError
            # (consistent with the other parse_input methods in this file).
            if is_iter and len(x) in [0, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: ' + str(idx))
                    continue
                else:
                    x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 2 ' +
                                'and at most 3 elements\n')
            i += 1
            x.desired_format("adjacency")
            Ax = x.get_adjacency_matrix()
            Lx = x.get_labels(purpose="adjacency")
            # Densify the label dict into a per-vertex list (raises KeyError
            # if any vertex index is unlabeled).
            Lx = [Lx[idx] for idx in range(Ax.shape[0])]
            proc.append((Ax, Lx, Ax.shape[0]))

        out = list()
        for Ax, Lx, s in proc:
            amss = dict()
            labels = set(Lx)
            Lx = np.array(Lx)
            # One masked adjacency matrix per ordered label pair (a, b):
            # keep only edges from a-labeled to b-labeled vertices.
            for t in product(labels, labels):
                selector = np.matmul(np.expand_dims(Lx == t[0], axis=1),
                                     np.expand_dims(Lx == t[1], axis=0))
                amss[t] = Ax * selector
            out.append((amss, s))
        if i == 0:
            raise ValueError('parsed input is empty')
        return out
def parse_input(self, X):
    """Parse input for weisfeiler lehman.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    base_kernel : object
        Returns base_kernel. Only if called from `fit`; `fit_transform`
        additionally receives the summed kernel matrix.

    """
    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # Input validation and parsing
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        nx = 0
        Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                else:
                    if len(x) > 2:
                        extra = tuple()
                        if len(x) > 3:
                            extra = tuple(x[3:])
                        x = Graph(x[0], x[1], x[2],
                                  graph_format=self._graph_format)
                        # Prepend the parsed edge labels so the base kernel
                        # receives them alongside any user extras.
                        extra = (x.get_labels(purpose=self._graph_format,
                                              label_type="edge",
                                              return_none=True), ) + extra
                    else:
                        x = Graph(x[0], x[1], {},
                                  graph_format=self._graph_format)
                        extra = tuple()
            elif type(x) is Graph:
                x.desired_format(self._graph_format)
                el = x.get_labels(purpose=self._graph_format,
                                  label_type="edge", return_none=True)
                if el is None:
                    extra = tuple()
                else:
                    extra = (el, )
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            extras[nx] = extra
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx

    # get all the distinct values of current labels
    WL_labels_inverse = dict()

    # assign a number to each label
    label_count = 0
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        label_count += 1

    # Initalize an inverse dictionary of labels for all iterations
    self._inv_labels = dict()
    self._inv_labels[0] = WL_labels_inverse

    def generate_graphs(label_count, WL_labels_inverse):
        # Yields the relabeled graph list once per WL iteration, mutating
        # L in place so each iteration builds on the previous labels.
        new_graphs = list()
        for j in range(nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels
            # add new labels
            new_graphs.append((Gs_ed[j], new_labels) + extras[j])
        yield new_graphs
        for i in range(1, self._n_iter):
            label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # Find unique labels and sort
                # them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    label_set.add(credential)

            label_list = sorted(list(label_set))
            for dv in label_list:
                WL_labels_inverse[dv] = label_count
                label_count += 1

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
                # relabel
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            self._inv_labels[i] = WL_labels_inverse
            yield new_graphs

    base_kernel = {
        i: self._base_kernel(**self._params)
        for i in range(self._n_iter)
    }
    if self._parallel is None:
        if self._method_calling == 1:
            for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse)):
                base_kernel[i].fit(g)
        elif self._method_calling == 2:
            # BUG FIX: np.sum over a bare generator is deprecated in NumPy
            # and silently ignores `axis` (it falls back to builtin sum);
            # materialize the per-iteration matrices as a list first.
            K = np.sum(
                [base_kernel[i].fit_transform(g) for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse))],
                axis=0)
    else:
        if self._method_calling == 1:
            self._parallel(
                joblib.delayed(efit)(base_kernel[i], g)
                for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse)))
        elif self._method_calling == 2:
            # self._parallel(...) returns a list, so np.sum is safe here.
            K = np.sum(self._parallel(
                joblib.delayed(efit_transform)(base_kernel[i], g)
                for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse))),
                axis=0)

    if self._method_calling == 1:
        return base_kernel
    elif self._method_calling == 2:
        return K, base_kernel
def transform(self, X):
    """Calculate the kernel matrix, between given and fitted dataset.

    Parameters
    ----------
    X : iterable
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that fitting the given graph
        format). If None the kernel matrix is calculated upon fit data.
        The test samples.

    Returns
    -------
    K : numpy array, shape = [n_targets, n_input_graphs]
        corresponding to the kernel matrix, a calculation between
        all pairs of graphs between target an features

    """
    self._method_calling = 3
    # Check is fit had been called
    check_is_fitted(self, ['X', '_nx', '_inv_labels'])

    # Input validation and parsing
    if X is None:
        raise ValueError('transform input cannot be None')
    else:
        if not isinstance(X, collections.Iterable):
            raise ValueError('input must be an iterable\n')
        else:
            nx = 0
            distinct_values = set()
            Gs_ed, L = dict(), dict()
            for (i, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(i))
                        continue
                    elif len(x) in [2, 3]:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is Graph:
                    x.desired_format("dictionary")
                else:
                    raise ValueError('each element of X must have at ' +
                                     'least one and at most 3 elements\n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")

                # Hold all the distinct values
                distinct_values |= set(
                    v for v in itervalues(L[nx])
                    if v not in self._inv_labels[0])
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

    nl = len(self._inv_labels[0])
    # Unseen labels get fresh ids starting right after the fitted ones.
    WL_labels_inverse = {
        dv: idx for (idx, dv) in enumerate(sorted(list(distinct_values)), nl)
    }

    def generate_graphs(WL_labels_inverse, nl):
        # calculate the kernel matrix for the 0 iteration
        new_graphs = list()
        for j in range(nx):
            new_labels = dict()
            for (k, v) in iteritems(L[j]):
                if v in self._inv_labels[0]:
                    new_labels[k] = self._inv_labels[0][v]
                else:
                    new_labels[k] = WL_labels_inverse[v]
            L[j] = new_labels
            # produce the new graphs
            new_graphs.append([Gs_ed[j], new_labels])
        yield new_graphs

        for i in range(1, self._n_iter):
            new_graphs = list()
            L_temp, label_set = dict(), set()
            nl += len(self._inv_labels[i])
            for j in range(nx):
                # Find unique labels and sort them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    if credential not in self._inv_labels[i]:
                        label_set.add(credential)

            # Calculate the new label_set
            WL_labels_inverse = dict()
            if len(label_set) > 0:
                for dv in sorted(list(label_set)):
                    idx = len(WL_labels_inverse) + nl
                    WL_labels_inverse[dv] = idx

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L_temp[j]):
                    if v in self._inv_labels[i]:
                        new_labels[k] = self._inv_labels[i][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels
                # Create the new graphs with the new labels.
                new_graphs.append([Gs_ed[j], new_labels])
            yield new_graphs

    if self._parallel is None:
        # Calculate the kernel matrix without parallelization
        # BUG FIX: np.sum over a bare generator is deprecated in NumPy and
        # silently ignores `axis`; materialize a list first.
        K = np.sum(
            [self.X[i].transform(g) for (i, g) in
             enumerate(generate_graphs(WL_labels_inverse, nl))],
            axis=0)
    else:
        # Calculate the kernel marix with parallelization
        # (self._parallel(...) already returns a list, so np.sum is safe.)
        K = np.sum(self._parallel(
            joblib.delayed(etransform)(self.X[i], g) for (i, g) in
            enumerate(generate_graphs(WL_labels_inverse, nl))),
            axis=0)

    self._is_transformed = True
    if self.normalize:
        X_diag, Y_diag = self.diagonal()
        old_settings = np.seterr(divide='ignore')
        K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
        np.seterr(**old_settings)
    return K
def parse_input(self, X,):
    """Parse input for weisfeiler lehman.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    base_graph_kernel : object
        Returns base_graph_kernel (when called from `fit`); when called
        from `fit_transform`, returns the summed kernel matrix and the
        base kernels.  (A `return_embedding_only` mode exists only in the
        commented-out code below.)

    """
    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # Input validation and parsing
    # NOTE(review): collections.Iterable is removed in Python 3.10+;
    # collections.abc.Iterable is the forward-compatible name.
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        nx = 0
        Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                else:
                    if len(x) > 2:
                        extra = tuple()
                        if len(x) > 3:
                            extra = tuple(x[3:])
                        x = Graph(x[0], x[1], x[2],
                                  graph_format=self._graph_format)
                        # Prepend parsed edge labels to the user extras.
                        extra = (x.get_labels(purpose=self._graph_format,
                                              label_type="edge",
                                              return_none=True), ) + extra
                    else:
                        x = Graph(x[0], x[1], {},
                                  graph_format=self._graph_format)
                        extra = tuple()
            elif type(x) is Graph:
                x.desired_format(self._graph_format)
                el = x.get_labels(purpose=self._graph_format,
                                  label_type="edge", return_none=True)
                if el is None:
                    extra = tuple()
                else:
                    extra = (el, )
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')
            # Cache edge dictionary and node labels per graph index.
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            extras[nx] = extra
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx
    WL_labels_inverse = OrderedDict()

    # assign a number to each label
    label_count = 0
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        label_count += 1

    # Initalize an inverse dictionary of labels for all iterations
    self._inv_labels = OrderedDict()  # Inverse dictionary of labels, in term of the *previous layer*
    # deepcopy: the inner generator later rebinds WL_labels_inverse, so
    # snapshot the zeroth-iteration mapping here.
    self._inv_labels[0] = deepcopy(WL_labels_inverse)
    self.feature_dims.append(
        len(WL_labels_inverse))  # Update the zeroth iteration feature dim

    # self._inv_label_node_attr = OrderedDict()  # Inverse dictionary of labels, in term of the *node attribute*
    # self._label_node_attr = OrderedDict()  # Same as above, but with key and value inverted
    # self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label(WL_labels_inverse, 0)

    # if self.node_weights is not None:
    #     self._feature_weight = OrderedDict()
    #     # Ensure the order is the same
    #     self._feature_weight[0] = self._compute_feature_weight(self.node_weights, 0, WL_labels_inverse)[1]
    # else:
    #     self._feature_weight = None

    def generate_graphs(label_count, WL_labels_inverse):
        # Generator: yields the relabeled graph list once per WL iteration,
        # mutating L in place so each iteration refines the previous labels.
        new_graphs = list()
        for j in range(self._nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels
            # add new labels
            new_graphs.append((Gs_ed[j], new_labels) + extras[j])
        yield new_graphs
        for i in range(1, self._h):
            label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # Find unique labels and sort
                # them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    # WL credential: own label + sorted neighbor labels.
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    label_set.add(credential)

            label_list = sorted(list(label_set))
            for dv in label_list:
                WL_labels_inverse[dv] = label_count
                label_count += 1

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
                # relabel
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            self._inv_labels[i] = WL_labels_inverse

            # Compute the translated inverse node label
            # self._label_node_attr[i], self._inv_label_node_attr[i] = self.translate_label(WL_labels_inverse, i, self._label_node_attr[i - 1])
            # self.feature_dims.append(self.feature_dims[-1] + len(self._label_node_attr[i]))

            # Compute the feature weight of the current layer
            # if self.node_weights is not None:
            #     self._feature_weight[i] = self._compute_feature_weight(self.node_weights, i, self._inv_label_node_attr[i])[1]
            #     assert len(self._feature_weight[i] == len(WL_labels_inverse))
            yield new_graphs

    # Initialise the base graph kernel.
    base_graph_kernel = {}
    K = []
    for (i, g) in enumerate(generate_graphs(label_count, WL_labels_inverse)):
        param = self._params
        # if self._feature_weight is not None:
        #     print(self._feature_weight)
        #     param.update({'mahalanobis_precision': self._feature_weight[i]})
        base_graph_kernel.update({i: self._base_graph_kernel(**param)})
        # if return_embedding_only:
        #     K.append(base_graph_kernel[i].parse_input(
        #         g, label_start_idx=self.feature_dims[i], label_end_idx=self.feature_dims[i + 1]))
        # else:
        if self._method_calling == 1:
            base_graph_kernel[i].fit(g,)
        else:
            K.append(base_graph_kernel[i].fit_transform(g,))

    # if return_embedding_only:
    #     return K
    if self._method_calling == 1:
        return base_graph_kernel
    elif self._method_calling == 2:
        # if self.as_tensor:
        #     K = torch.stack(K, dim=0).sum(dim=0)
        #     return K, base_graph_kernel
        return np.sum(K, axis=0), base_graph_kernel
def parse_input(self, X):
    """Fast ML Graph Kernel.

    See supplementary material :cite:`kondor2016multiscale`, algorithm 1.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    out : list
        A list of tuples with S matrices inverses
        and their 4th-root determinants.

    """
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        ng = 0
        out = list()
        data = dict()
        neighborhoods = dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, collections.Iterable):
                is_iter, x = True, list(x)
            if is_iter and len(x) in [0, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element ' +
                                  'on index: ' + str(idx))
                    continue
                else:
                    x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is Graph:
                x.desired_format(self._graph_format)
            else:
                raise TypeError('each element of X must be either a '
                                'graph or an iterable with at least 1 '
                                'and at most 3 elements\n')
            phi_d = x.get_labels()
            A = x.get_adjacency_matrix()
            try:
                # Node features: one row per vertex, densified from the
                # label dictionary (must be castable to a numpy array).
                phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
            except TypeError:
                raise TypeError('Features must be iterable and castable '
                                'in total to a numpy array.')

            # Regularized graph Laplacian; heta shifts the diagonal so the
            # Laplacian is invertible.
            Lap = laplacian(A).astype(float)
            _increment_diagonal_(Lap, self.heta)
            # data[ng]: 0 -> adjacency, 1 -> feature matrix, 2 -> inv Laplacian
            data[ng] = {0: A, 1: phi, 2: inv(Lap)}
            neighborhoods[ng] = x
            ng += 1

        if ng == 0:
            raise ValueError('parsed input is empty')

        # Define a function for calculating the S's of subgraphs of each iteration
        def calculate_C(k, j, l):
            # Lazily replace the Graph object with its neighborhood index
            # dict on first use (cached for subsequent calls).
            if type(neighborhoods[k]) is Graph:
                neighborhoods[k] = neighborhoods[k].produce_neighborhoods(
                    r=self.L, sort_neighbors=False)

            indexes = neighborhoods[k][l][j]
            L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float)
            _increment_diagonal_(L, self.heta)
            U = data[k][1][indexes, :]
            S = multi_dot((U.T, inv(L), U))
            _increment_diagonal_(S, self.gamma)

            # Returns the inverse of S and the log-determinant of S
            # (sum of log of eigenvalues).
            return (inv(S), np.sum(np.log(np.real(eigvals(S)))))

        if self._method_calling == 1:
            # Fit: build the Nystrom projection per level from ns random
            # reference vertices sampled across all graphs.
            V = [(k, j) for k in range(ng)
                 for j in range(data[k][0].shape[0])]

            ns = min(len(V), self.n_samples)

            self.random_state_.shuffle(V)
            vs = V[:ns]
            phi_k = np.array([data[k][1][j, :] for (k, j) in vs])

            # w the eigen vectors, v the eigenvalues
            K = phi_k.dot(phi_k.T)

            # Calculate eigenvalues
            v, w = eig(K)
            v, w = np.real(v), np.real(w.T)

            # keep only the positive
            vpos = np.argpartition(v, -self.P)[-self.P:]
            vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

            # ksi.shape = (k, Ns) * (Ns, P)
            ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos])
            for j in range(ng):
                # (n_samples, k) * (k, P)
                data[j][1] = data[j][1].dot(ksi)
            self._data_level = {0: ksi}
            for l in range(1, self.L + 1):
                # Take random samples from all the vertices of all graphs
                self.random_state_.shuffle(V)
                vs = V[:ns]

                # Compute the reference subsampled Gram matrix
                K_proj = {
                    k: np.zeros(shape=(data[k][0].shape[0], ns))
                    for k in range(ng)
                }
                K, C = np.zeros(shape=(len(vs), len(vs))), dict()
                for (m, (k, j)) in enumerate(vs):
                    C[m] = calculate_C(k, j, l)
                    K_proj[k][j, m] = K[m, m] = self.pairwise_operation(
                        C[m], C[m])
                    # Only pairs with s < m are needed; the enumeration is
                    # in order so `break` on s == m fills the symmetric part.
                    for (s, (k2, j2)) in enumerate(vs):
                        if s < m:
                            K[s, m] = K[m, s] \
                                = K_proj[k2][j2, m] \
                                = K_proj[k][j, s] \
                                = self.pairwise_operation(C[s], C[m])
                        else:
                            break

                # Compute the kernels of the relations of the reference to everything else
                for (k, j) in V[ns:]:
                    for (m, _) in enumerate(vs):
                        K_proj[k][j, m] = self.pairwise_operation(
                            C[m], calculate_C(k, j, l))

                # w the eigen vectors, v the eigenvalues
                v, w = eig(K)
                v, w = np.real(v), np.real(w.T)

                # keep only the positive
                vpos = np.argpartition(v, -self.P)[-self.P:]
                vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                # Q shape=(k, P)
                Q = w[vpos].T / np.sqrt(v[vpos])
                for j in range(ng):
                    # (n, ns) * (ns, P)
                    data[j][1] = K_proj[j].dot(Q)
                self._data_level[l] = (C, Q)

        elif self._method_calling == 3:
            # Transform: project new graphs through the fitted per-level
            # reference samples C and projections Q.
            ksi = self._data_level[0]
            for j in range(ng):
                # (n, k) * (k, P)
                data[j][1] = data[j][1].dot(ksi)

            for l in range(1, self.L + 1):
                C, Q = self._data_level[l]
                for j in range(ng):
                    K_proj = np.zeros(shape=(data[j][0].shape[0], len(C)))
                    for n in range(data[j][0].shape[0]):
                        for m in range(len(C)):
                            K_proj[n, m] = self.pairwise_operation(
                                C[m], calculate_C(j, n, l))
                    data[j][1] = K_proj.dot(Q)

        # Apply the final calculation of S.
        for k in range(ng):
            S = multi_dot((data[k][1].T, data[k][2], data[k][1]))
            _increment_diagonal_(S, self.gamma)
            out.append((inv(S), np.sum(np.log(np.real(eigvals(S))))))

        return out
def parse_input(self, X):
    """Parse and create features for multiscale_laplacian kernel.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    out : list
        Tuples consisting of the Adjacency matrix, phi, phi_outer
        dictionary of neihborhood indexes and inverse laplacians
        up to level self.L and the inverse Laplacian of A.

    """
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        ng = 0
        out = list()
        start = time.time()
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, collections.Iterable):
                is_iter, x = True, list(x)
            if is_iter and len(x) in [0, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element ' +
                                  'on index: ' + str(idx))
                    continue
                else:
                    x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is Graph:
                # BUG FIX: the original branch was inverted
                # (`elif type(x) is not Graph: x.desired_format(...)`),
                # which raised TypeError for every Graph input and called
                # .desired_format on arbitrary non-graph objects.  Every
                # sibling parse_input in this file uses `is Graph` here.
                x.desired_format(self._graph_format)
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 1 ' +
                                'and at most 3 elements\n')
            ng += 1
            phi_d = x.get_labels()
            A = x.get_adjacency_matrix()
            N = x.produce_neighborhoods(r=self.L, sort_neighbors=False)
            try:
                # Node features: one row per vertex, densified from labels.
                phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
            except TypeError:
                raise TypeError('Features must be iterable and castable ' +
                                'in total to a numpy array.')
            phi_outer = np.dot(phi, phi.T)

            # Regularized graph Laplacian; heta shifts the diagonal so it
            # is invertible.
            Lap = laplacian(A).astype(float)
            _increment_diagonal_(Lap, self.heta)
            L = inv(Lap)

            # Pre-compute neighborhood indices and inverse Laplacians of
            # each r-neighborhood subgraph, for every level up to self.L.
            Q = dict()
            for level in range(1, self.L + 1):
                Q[level] = dict()
                for (key, item) in iteritems(N[level]):
                    Q[level][key] = dict()
                    Q[level][key]["n"] = np.array(item)
                    if len(item) < A.shape[0]:
                        laplac = laplacian(A[item, :][:, item]).astype(float)
                        _increment_diagonal_(laplac, self.heta)
                        laplac = inv(laplac)
                    else:
                        # Neighborhood covers the whole graph: reuse the
                        # full-graph inverse Laplacian.
                        laplac = L
                    Q[level][key]["l"] = laplac

            out.append((A, phi, phi_outer, Q, L))

        if self.verbose:
            print("Preprocessing took:", time.time() - start, "s.")
        if ng == 0:
            raise ValueError('parsed input is empty')

        return out
def parse_input(self, X):
    """Parse input for weisfeiler lehman optimal assignment.

    Parameters
    ----------
    X : iterable
        For the input to pass the test, we must have:
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that correspond to the given
        graph format). A valid input also consists of graph type objects.

    Returns
    -------
    Hs : numpy array, shape = [n_input_graphs, hierarchy_size]
        An array where the rows contain the histograms of the graphs.

    """
    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # Input validation and parsing
    if not isinstance(X, collections.Iterable):
        raise TypeError('input must be an iterable\n')
    else:
        nx = 0
        Gs_ed, L, distinct_values = dict(), dict(), set()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                else:
                    if len(x) > 2:
                        # Edge labels are parsed here for input uniformity
                        # with the other WL variants but `extra` is not used
                        # further by this kernel.
                        extra = tuple()
                        if len(x) > 3:
                            extra = tuple(x[3:])
                        x = Graph(x[0], x[1], x[2],
                                  graph_format=self._graph_format)
                        extra = (x.get_labels(purpose=self._graph_format,
                                              label_type="edge",
                                              return_none=True), ) + extra
                    else:
                        x = Graph(x[0], x[1], {},
                                  graph_format=self._graph_format)
                        extra = tuple()
            elif type(x) is Graph:
                x.desired_format(self._graph_format)
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx

    # Initialize hierarchy: a tree over all WL labels, rooted at 'root',
    # used later for the histogram-intersection representation.
    self._hierarchy = dict()
    self._hierarchy['root'] = dict()
    self._hierarchy['root']['parent'] = None
    self._hierarchy['root']['children'] = list()
    self._hierarchy['root']['w'] = 0
    self._hierarchy['root']['omega'] = 0

    # get all the distinct values of current labels
    WL_labels_inverse = dict()

    # assign a number to each label
    label_count = 0
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        # Zeroth-iteration labels hang directly off the root.
        self._insert_into_hierarchy(label_count, 'root')
        label_count += 1

    # Initalize an inverse dictionary of labels for all iterations
    self._inv_labels = dict()
    self._inv_labels[0] = WL_labels_inverse

    for j in range(nx):
        new_labels = dict()
        for k in L[j].keys():
            new_labels[k] = WL_labels_inverse[L[j][k]]
        L[j] = new_labels

    for i in range(1, self._n_iter):
        new_previous_label_set, WL_labels_inverse, L_temp = \
            set(), dict(), dict()
        for j in range(nx):
            # Find unique labels and sort
            # them for both graphs
            # Keep for each node the temporary
            L_temp[j] = dict()
            for v in Gs_ed[j].keys():
                # WL credential: own label + sorted neighbor labels.
                credential = str(L[j][v]) + "," + \
                    str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                L_temp[j][v] = credential
                # Track the previous label too, so the new label can be
                # attached under it in the hierarchy.
                new_previous_label_set.add((credential, L[j][v]))

        label_list = sorted(list(new_previous_label_set),
                            key=lambda tup: tup[0])
        for dv, previous_label in label_list:
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, previous_label)
            label_count += 1

        # Recalculate labels
        for j in range(nx):
            new_labels = dict()
            for k in L_temp[j].keys():
                new_labels[k] = WL_labels_inverse[L_temp[j][k]]
            L[j] = new_labels
        self._inv_labels[i] = WL_labels_inverse

    # Compute the vector representation of each graph: accumulate omega
    # weights along each final label's path up to the hierarchy root.
    if self.sparse:
        Hs = lil_matrix((nx, len(self._hierarchy)))
    else:
        Hs = np.zeros((nx, len(self._hierarchy)))
    for j in range(nx):
        for k in L[j].keys():
            current_label = L[j][k]
            while self._hierarchy[current_label]['parent'] is not None:
                Hs[j, current_label] += \
                    self._hierarchy[current_label]['omega']
                current_label = self._hierarchy[current_label]['parent']

    return Hs
def transform(self, X):
    """Calculate the kernel matrix, between given and fitted dataset.

    Parameters
    ----------
    X : iterable
        Each element must be an iterable with at most three features and at
        least one. The first that is obligatory is a valid graph structure
        (adjacency matrix or edge_dictionary) while the second is
        node_labels and the third edge_labels (that fitting the given graph
        format). If None the kernel matrix is calculated upon fit data.
        The test samples.

    Returns
    -------
    K : numpy array, shape = [n_targets, n_input_graphs]
        corresponding to the kernel matrix, a calculation between
        all pairs of graphs between target an features

    """
    self._method_calling = 3
    # Check is fit had been called
    check_is_fitted(self, ['X', '_nx', '_hierarchy', '_inv_labels'])

    # Input validation and parsing
    if X is None:
        raise ValueError('transform input cannot be None')
    else:
        if not isinstance(X, collections.Iterable):
            raise ValueError('input must be an iterable\n')
        else:
            nx = 0
            distinct_values = set()
            Gs_ed, L = dict(), dict()
            for (i, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(i))
                        continue
                    elif len(x) in [2, 3]:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is Graph:
                    x.desired_format("dictionary")
                else:
                    raise ValueError('each element of X must have at ' +
                                     'least one and at most 3 elements\n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")

                # Hold all the distinct values
                distinct_values |= set(
                    v for v in itervalues(L[nx])
                    if v not in self._inv_labels[0])
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

    # get all the distinct values of new labels
    WL_labels_inverse = dict()

    # assign a number to each label: new (unseen) labels continue after
    # every label id produced during fit.
    label_count = sum([len(self._inv_labels[i])
                       for i in range(len(self._inv_labels))])
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        self._insert_into_hierarchy(label_count, 'root')
        label_count += 1

    for j in range(nx):
        new_labels = dict()
        for (k, v) in iteritems(L[j]):
            if v in self._inv_labels[0]:
                new_labels[k] = self._inv_labels[0][v]
            else:
                new_labels[k] = WL_labels_inverse[v]
        L[j] = new_labels

    for i in range(1, self._n_iter):
        L_temp, new_previous_label_set = dict(), set()
        for j in range(nx):
            # Find unique labels and sort them for both graphs
            # Keep for each node the temporary
            L_temp[j] = dict()
            for v in Gs_ed[j].keys():
                # WL credential: own label + sorted neighbor labels.
                credential = str(L[j][v]) + "," + \
                    str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                L_temp[j][v] = credential
                if credential not in self._inv_labels[i]:
                    new_previous_label_set.add((credential, L[j][v]))

        # Calculate the new label_set
        WL_labels_inverse = dict()
        if len(new_previous_label_set) > 0:
            for dv, previous_label in sorted(list(new_previous_label_set),
                                             key=lambda tup: tup[0]):
                WL_labels_inverse[dv] = label_count
                self._insert_into_hierarchy(label_count, previous_label)
                label_count += 1

        # Recalculate labels
        for j in range(nx):
            new_labels = dict()
            for (k, v) in iteritems(L_temp[j]):
                if v in self._inv_labels[i]:
                    new_labels[k] = self._inv_labels[i][v]
                else:
                    new_labels[k] = WL_labels_inverse[v]
            L[j] = new_labels

    # Compute the vector representation of each graph: accumulate omega
    # weights along each final label's path up to the hierarchy root.
    if self.sparse:
        Hs = lil_matrix((nx, len(self._hierarchy)))
    else:
        Hs = np.zeros((nx, len(self._hierarchy)))
    for j in range(nx):
        for k in L[j].keys():
            current_label = L[j][k]
            while self._hierarchy[current_label]['parent'] is not None:
                Hs[j, current_label] += \
                    self._hierarchy[current_label]['omega']
                current_label = self._hierarchy[current_label]['parent']
    self.Y = Hs

    # Compute the histogram intersection kernel.  K has shape
    # (nx, self._nx): rows index the new graphs, columns the fitted ones.
    K = np.zeros((nx, self._nx))
    if self.sparse:
        # BUG FIX: the sparse branch originally iterated
        # `for i in range(self._nx): for j in range(i, self._nx)`, which
        # (a) indexed Hs/K rows out of bounds whenever self._nx > nx and
        # (b) skipped the strictly-lower entries of the asymmetric matrix.
        # Use the same (row over new graphs, column over fitted graphs)
        # bounds as the dense branch below.
        for i in range(nx):
            for j in range(self._nx):
                K[i, j] = np.sum(
                    Hs[i, :self.X.shape[1]].minimum(self.X[j, :]))
    else:
        for i in range(nx):
            for j in range(self._nx):
                K[i, j] = np.sum(
                    np.min([Hs[i, :self.X.shape[1]], self.X[j, :]], axis=0))
    self._is_transformed = True
    if self.normalize:
        X_diag, Y_diag = self.diagonal()
        old_settings = np.seterr(divide='ignore')
        K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
        np.seterr(**old_settings)
    return K