def get_coords(axes='gene', rows=None, time_val=None, spatial_idxs=None, ids=None):
    bdnet = nio.getBDTNP()
    gene_matrix = array([v['vals'][:, time_val]
                         for v in bdnet.values()
                         if str(time_val + 1) in v['steps']])
    # Keep the keys matching the same filter (iterate items so each key's own
    # value is checked, rather than a variable leaked from the loop above).
    gene_matrix_keys = [k for k, v in bdnet.items()
                        if str(time_val + 1) in v['steps']]
    if axes == 'gene':
        import scipy.sparse as ssp
        import scipy.sparse.linalg as las
        import scipy.sparse.lil as ll
        adj = ssp.csr_matrix(gene_matrix.T)
        n_c = 3
        U, s, Vh = svd = las.svds(adj, n_c)
        filtered_genes = ll.lil_matrix(U) * ll.lil_matrix(diag(s)) * ll.lil_matrix(Vh)
        xs_gene = U[ids, 0]
        ys_gene = U[ids, 1]
        zs_gene = U[ids, 2]
    elif axes == 'space':
        space_space = array([[[r[idxs] for idxs in sidxs]
                              for sidxs in spatial_idxs]
                             for r in rows])
        space_space = space_space[:, :, time_val]
        xs_gene = space_space[ids, 0]
        ys_gene = space_space[ids, 1]
        zs_gene = space_space[ids, 2]
    return xs_gene, ys_gene, zs_gene
def sparse_matrix_to_hdf(sparse_matrix, name_to_store, hdf_file_path):
    nonzero_indices = np.nonzero(sparse_matrix > 0)
    if len(nonzero_indices[0]) == 0:
        raise Exception("can't store empty sparse matrix!")
    if issparse(sparse_matrix):
        if sparse_matrix.__class__ is lil_matrix:
            nonzero_values = sparse_matrix.tocsr()[nonzero_indices].A1
        else:
            nonzero_values = lil_matrix(sparse_matrix).tocsr()[nonzero_indices].A1
    else:
        nonzero_values = np.array(sparse_matrix[nonzero_indices])
    matrix_dataframe = pd.DataFrame({
        "row_indexes": nonzero_indices[0],
        "col_indexes": nonzero_indices[1],
        "data": nonzero_values
    })
    matrix_shape_series = pd.Series(sparse_matrix.shape)
    matrix_dataframe.to_hdf(hdf_file_path, name_to_store)
    matrix_shape_series.to_hdf(hdf_file_path, "%s_shape" % name_to_store)
    del nonzero_indices, nonzero_values, matrix_dataframe, matrix_shape_series
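# Hedged usage sketch for sparse_matrix_to_hdf above. It assumes the module-level
# imports the function relies on (np, pd, issparse, lil_matrix) and a pandas HDF
# backend (PyTables) being installed; the file name 'example.h5' and the
# reconstruction code are illustrative, not part of the original API.
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, coo_matrix

m = lil_matrix((3, 4))
m[0, 1] = 2.0
m[2, 3] = 5.0
sparse_matrix_to_hdf(m, 'example_matrix', 'example.h5')

# Rebuild the matrix from the stored coordinate triples and shape.
df = pd.read_hdf('example.h5', 'example_matrix')
shape = tuple(pd.read_hdf('example.h5', 'example_matrix_shape'))
restored = coo_matrix((df['data'], (df['row_indexes'], df['col_indexes'])),
                      shape=shape).tolil()
assert np.allclose(m.toarray(), restored.toarray())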
def _sentence_graph_from_ptb_str(ptb_str, num_tokens):
    # We need to have num_tokens provided here, or else we won't know for
    # sure how big the graph should be. (There can be tokens missing from
    # the graph, and even if there aren't it would take more processing
    # than it's worth to find the max node index in the PTB tree.)
    tree = ImmutableParentedTree.fromstring(ptb_str)
    edge_graph = lil_matrix((num_tokens, num_tokens), dtype='float')
    edge_labels = {}
    excluded_edges = []

    def convert_node(parent_index, node):
        # Node index is whatever's after the last underscore.
        node_label = node.label()
        node_index = int(node_label[node_label.rindex('_') + 1:])
        edge_label = node[0]  # 0th child is always edge label
        if edge_label in StanfordParsedSentence.DEPTH_EXCLUDED_EDGE_LABELS:
            excluded_edges.append((parent_index, node_index))
        else:
            edge_graph[parent_index, node_index] = 1.0
            edge_labels[parent_index, node_index] = edge_label
        for child in node[2:]:  # Skip edge label (child 0) & POS (child 1).
            convert_node(node_index, child)

    for root_child in tree:
        convert_node(0, root_child)  # initial parent index is 0 for root
    return edge_graph.tocsr(), edge_labels, excluded_edges
def construct_hierarchy_matrix(hierarchy, node2index):
    N = len(hierarchy)
    hier_mat = lil_matrix(np.eye(N), dtype=bool)
    for child, parent in hierarchy.items():
        if parent is None:
            continue
        hier_mat[node2index[child], node2index[parent]] = 1.
    return csr_matrix(hier_mat)
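# Hedged usage sketch for construct_hierarchy_matrix: a three-node child->parent
# map and its index mapping. The names and values are illustrative assumptions,
# as are the imports (the original module is assumed to import np, lil_matrix,
# and csr_matrix at module level).
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix

hierarchy = {'root': None, 'a': 'root', 'b': 'a'}
node2index = {'root': 0, 'a': 1, 'b': 2}
H = construct_hierarchy_matrix(hierarchy, node2index)
print(H.toarray())  # diagonal self-links plus one entry per child->parent edge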
def tolil(self, copy=False):
    from scipy.sparse.lil import lil_matrix
    lil = lil_matrix(self.shape, dtype=self.dtype)

    self.sum_duplicates()
    ptr, ind, dat = self.indptr, self.indices, self.data
    rows, data = lil.rows, lil.data

    for n in range(self.shape[0]):
        start = ptr[n]
        end = ptr[n + 1]
        rows[n] = ind[start:end].tolist()
        data[n] = dat[start:end].tolist()
    return lil
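# Hedged standalone check of the row/data layout this tolil() produces, using
# scipy's own csr_matrix.tolil() (which follows the same scheme): lil.rows holds
# the column indices of each row, lil.data the matching values. The small matrix
# below is an illustrative assumption.
import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[0, 1, 0], [2, 0, 3]]))
L = A.tolil()
print(L.rows)  # per-row column indices: row 0 -> [1], row 1 -> [0, 2]
print(L.data)  # per-row values:         row 0 -> [1], row 1 -> [2, 3]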
def graph(self):
    """
    Return the k-nearest-neighbour graph with self.k neighbours.

    Optionally the minimum spanning tree is added in, according to
    self.include_mst.
    """
    if getattr(self, '_graph', None) is None:
        D = self.manifold_corrected_distance_matrix.toarray()
        idxs = np.argsort(D)
        r = range(D.shape[0])
        idx = idxs[:, :self.k]
        self._graph = lil_matrix(D.shape)
        for neighbours in idx.T:
            self._graph[r, neighbours] = D[r, neighbours]
        if self.include_mst:
            mst = self.minimal_spanning_tree
            for i, j, v in zip(*find(mst)):
                if self._graph[i, j] == 0:
                    self._graph[i, j] = v
    return self._graph
def filter_sparse(g1, n_c=5, max_edges=-1, last_component=False):
    '''
    Filter a sparse version of the network by PCA.

    g1:             The input network graph.
    n_c:            The number of principal components to compute.
    max_edges:      The maximum number of edges to keep. -1 => keep all.
    last_component: Keep only the final principal component.
    '''
    import scipy.sparse.linalg as las
    import scipy.sparse.lil as ll
    import scipy.sparse as ssp

    adj = ssp.csr_matrix(nx.to_scipy_sparse_matrix(g1))
    nodes = list(g1.nodes())
    # scipy.sparse.linalg provides the truncated sparse SVD as svds
    U, s, Vh = svd = las.svds(adj, n_c)
    U[less(abs(U), .001)] = 0
    Vh[less(abs(Vh), .001)] = 0

    if last_component:
        s_last = s
        s_last[1:] *= 0
        filtered = ll.lil_matrix(U) * ll.lil_matrix(diag(s_last)) * ll.lil_matrix(Vh)
    else:
        filtered = ll.lil_matrix(U) * ll.lil_matrix(diag(s)) * ll.lil_matrix(Vh)

    if max_edges != -1:
        filtered.data[argsort(abs(filtered.data))[:-1 * max_edges]] = 0
        filtered.eliminate_zeros()

    g = nx.DiGraph()
    g.add_nodes_from(nodes)
    g.add_weighted_edges_from([(nodes[nz[0]], nodes[nz[1]], nz[2])
                               for nz in zip(*ssp.find(filtered))])
    return g
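# Hedged usage sketch for filter_sparse: a small random weighted digraph. It
# assumes a networkx version that still provides to_scipy_sparse_matrix and the
# star-imported numpy names (less, diag, argsort) the function uses at module
# level; the graph and parameter values below are illustrative.
import networkx as nx

g1 = nx.gnp_random_graph(30, 0.2, seed=0, directed=True)
for u, v in g1.edges():
    g1[u][v]['weight'] = 1.0  # float weights keep the sparse SVD happy
g_filtered = filter_sparse(g1, n_c=5, max_edges=50)
print(g_filtered.number_of_nodes(), g_filtered.number_of_edges())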
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local scaling
    methods implemented: the original one and NICDM. Both reduce hubness in
    distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        kth = n - k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else:  # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_set_ind = slice(0, n)  # take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude

    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :]  # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k:  # Don't rescale if there are too few
                D_ls[i, :] = D[i, :]    # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        else:
            np.fill_diagonal(D_ls, self_value)
            return D_ls
    else:
        # Ensure correct self distances
        for j, sample in enumerate(train_ind):
            D_ls[sample, j] = self_value
        return D_ls[test_ind]
break
fh.seek(node_pos)
print("Getting number of nodes ...")

# Get number of nodes
node_lines = fh.readline().replace(" ", "").strip()
while not node_lines.endswith("</node>"):
    node_lines += fh.readline().replace(" ", "").strip()
try:
    NUM_NODES = int(re.search(r"(?<=id=['\"]n)\d+", node_lines).group(0)) + 1  # +1 for 0-based indexing
except Exception as msg:
    print("Cannot determine number of nodes from input file. Check graphml <node> syntax")

# Got the nodes
g = lil_matrix((NUM_NODES, NUM_NODES))

# Put back file handle iterator
fh.seek(pos)

print("Getting edges ...")
line = ""
while True:
    line += fh.readline().replace(" ", "").strip()  # remove if inefficient
    if line.endswith("</edge>"):
        edge = get_edge(line)
        g[edge[0], edge[1]] = edge[2]  # Naive, i.e. slow. TODO: Optimize
        line = ""
    elif line.endswith("</graphml>"):
def shortest_path_kernel(self, graph_db, hashed_attributes, param):
    label_name = param.get('label_name', None)

    num_vertices = 0
    for g in graph_db:
        num_vertices += g.number_of_nodes()

    offset = 0
    graph_indices = []
    colors_0 = np.zeros(num_vertices, dtype=np.int64)

    # Get labels (colors) from all graph instances
    offset = 0
    for g in graph_db:
        graph_indices.append((offset, offset + g.number_of_nodes() - 1))
        if label_name:
            for i, label in enumerate(nx.get_node_attributes(g, label_name).values()):
                colors_0[i + offset] = label
        offset += g.number_of_nodes()
    _, colors_0 = np.unique(colors_0, return_inverse=True)

    colors_1 = hashed_attributes

    triple_indices = []
    triple_offset = 0
    triples = []

    # Solve the APSP problem for every graph in the graph database
    for i, g in enumerate(graph_db):
        M = dict(nx.all_pairs_shortest_path_length(g))
        # index is a tuple giving the indices of the first and last node of graph g
        index = graph_indices[i]
        if label_name:
            l = colors_0[index[0]:index[1] + 1]
            h = colors_1[index[0]:index[1] + 1]
        else:
            h = colors_1[index[0]:index[1] + 1]
        d = len(M)

        # For each pair of vertices collect labels, hashed attributes, and shortest-path distance
        pairs = list(it.product(range(d), repeat=2))
        if label_name:
            t = [hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs
                 if (k != j and ~np.isinf(M[k].get(j, np.inf)))]
        else:
            t = [hash((h[k], h[j], M[k][j])) for (k, j) in pairs
                 if (k != j and ~np.isinf(M[k].get(j, np.inf)))]

        triples.extend(t)
        triple_indices.append((triple_offset, triple_offset + len(t) - 1))
        triple_offset += len(t)

    _, colors = np.unique(triples, return_inverse=True)
    m = np.amax(colors) + 1

    # Compute feature vectors
    feature_vectors = []
    for i, index in enumerate(triple_indices):
        feature_vectors.append(np.bincount(colors[index[0]:index[1] + 1], minlength=m))

    return lil.lil_matrix(feature_vectors, dtype=np.float64)  # each feature vector becomes a row
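# Hedged follow-up sketch: the lil feature-vector matrix returned by the method
# above can be turned into a kernel (gram) matrix the same way the graph-tool
# variants in this collection do it. gram_from_feature_vectors is a hypothetical
# helper, not part of the original code.
from scipy.sparse import csr_matrix

def gram_from_feature_vectors(feature_vectors):
    # Rows are per-graph feature vectors; the gram matrix holds their inner products.
    fv = csr_matrix(feature_vectors, dtype='float64')
    return fv.dot(fv.T).toarray()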
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local scaling
    methods implemented: the original one and NICDM. Both reduce hubness in
    distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else:  # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n)  # take all
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)

    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass  # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n - i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist, n=n,
                                            metric=metric,
                                            self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass  # results handled within func
        # triu is copied to tril within func

    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k:  # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
def shortest_path_kernel(graph_db, hashed_attributes, *kwargs):
    compute_gram_matrix = kwargs[0]
    normalize_gram_matrix = kwargs[1]
    use_labels = kwargs[2]

    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()

    offset = 0
    graph_indices = []
    colors_0 = np.zeros(num_vertices, dtype=np.int64)

    # Get labels (colors) from all graph instances
    offset = 0
    for g in graph_db:
        graph_indices.append((offset, offset + g.num_vertices() - 1))
        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()
        offset += g.num_vertices()
    _, colors_0 = np.unique(colors_0, return_inverse=True)

    colors_1 = hashed_attributes

    triple_indices = []
    triple_offset = 0
    triples = []

    # Solve the APSP problem for every graph in the graph database
    for i, g in enumerate(graph_db):
        a = gt.adjacency(g)
        M = csg.shortest_path(a, method='J', directed=False, unweighted=True)
        index = graph_indices[i]
        if use_labels:
            l = colors_0[index[0]:index[1] + 1]
            h = colors_1[index[0]:index[1] + 1]
        else:
            h = colors_1[index[0]:index[1] + 1]
        d = M.shape[0]

        # For each pair of vertices collect labels, hashed attributes, and shortest-path distance
        pairs = list(it.product(range(d), repeat=2))
        if use_labels:
            t = [hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs
                 if (k != j or ~np.isinf(M[k][j]))]
        else:
            t = [hash((h[k], h[j], M[k][j])) for (k, j) in pairs
                 if (k != j or ~np.isinf(M[k][j]))]

        triples.extend(t)
        triple_indices.append((triple_offset, triple_offset + len(t) - 1))
        triple_offset += len(t)

    _, colors = np.unique(triples, return_inverse=True)
    m = np.amax(colors) + 1

    # Compute feature vectors
    feature_vectors = []
    for i, index in enumerate(triple_indices):
        feature_vectors.append(
            np.bincount(colors[index[0]:index[1] + 1], minlength=m))

    if not compute_gram_matrix:
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)
        gram_matrix = gram_matrix.toarray()
        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
def load_as_ir_task():
    '''
    5 source documents (index) X 57 suspicious documents (queries)

    1 relevant document (source) for each query!
    '''
    path = datasets_extractors['DATASETS_PATH']['short_plagiarised_answers_dataset']
    files_path = os.path.join(path, "ir_task_short_plagiarised_answers.h5")

    if os.path.exists(files_path):
        # load and return
        queries = pd.read_hdf(files_path, 'queries')
        documents = pd.read_hdf(files_path, 'documents')
        dataset_target = pd.read_hdf(files_path, 'targets')
        data = dataset_target.loc[:, 'data'].values
        row = dataset_target.loc[:, 'index'].values
        col = dataset_target.loc[:, 'col'].values
        dataset_target = coo_matrix((data, (row, col)),
                                    shape=(queries.shape[0], documents.shape[0]))
        dataset_target = dataset_target.tolil()
        dataset_encoding = __DATASET_ENCODING
    else:
        spa_original, dataset_encoding = load_to_pandas()
        queries_dataframe, documents_dataframe = spa_original[0:95], spa_original[95:100]
        dataset_target = lil_matrix((100, 100))
        del spa_original

        queries = []
        queries_dataframe_indexes = []
        documents = documents_dataframe['content'].tolist()
        documents_dataframe_indexes = documents_dataframe.index.values.tolist()

        for rowi_index, rowi in queries_dataframe[queries_dataframe.plag_type != "non"].iterrows():
            i = len(queries)
            for j, source_rowj in documents_dataframe.iterrows():
                j -= 95
                dataset_target[i, j] = source_rowj["task"] == rowi['task']
            queries.append(rowi['content'])
            queries_dataframe_indexes.append(rowi_index)

        non_plagiarism = queries_dataframe[queries_dataframe.plag_type == "non"]
        documents = documents + non_plagiarism['content'].values.tolist()
        documents_dataframe_indexes = (documents_dataframe_indexes +
                                       non_plagiarism.index.values.tolist())
        dataset_target = dataset_target[:len(queries), :len(documents)]
        del queries_dataframe, documents_dataframe

        queries = pd.DataFrame({
            'content': queries,
            'original_indexes': queries_dataframe_indexes
        })
        documents = pd.DataFrame({
            'content': documents,
            'original_indexes': documents_dataframe_indexes
        })
        queries.to_hdf(files_path, 'queries', append=True)
        documents.to_hdf(files_path, 'documents', append=True)

        # storing scipy sparse matrix on dataframe to dump on hdf5
        coo = dataset_target.tocoo()
        pd.DataFrame({
            'index': coo.row,
            'col': coo.col,
            'data': coo.data
        })[['index', 'col', 'data']].sort_values(['index', 'col']).reset_index(
            drop=True).to_hdf(files_path, 'targets', append=True)

    return queries, documents, dataset_target, dataset_encoding
def WL_kernel(self, graph_db, hashed_attributes, param):
    label_name = param.get('label_name', None)
    wl_iterations = param.get('wl_iterations')

    # Create one empty feature vector for each graph
    feature_vectors = []
    for _ in graph_db:
        feature_vectors.append(np.zeros(0, dtype=np.float64))

    # Construct block diagonal matrix of all adjacency matrices
    adjacency_matrices = []
    for g in graph_db:
        adjacency_matrices.append(np.array(nx.adjacency_matrix(g).todense()))
    M = sp.sparse.block_diag(tuple(adjacency_matrices), dtype=np.float64, format="csr")
    num_vertices = M.shape[0]

    # Load list of precalculated logarithms of prime numbers
    log_primes = log_pl.log_primes[0:num_vertices]

    # Color vector representing labels
    colors_0 = np.zeros(num_vertices, dtype=np.float64)
    # Color vector representing hashed attributes
    colors_1 = hashed_attributes

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []
    for g in graph_db:
        if label_name:
            for i, label in enumerate(nx.get_node_attributes(g, label_name).values()):
                colors_0[i + offset] = label
        graph_indices.append((offset, offset + g.number_of_nodes() - 1))
        offset += g.number_of_nodes()

    # Map labels to [0, number_of_colors)
    if label_name:
        _, colors_0 = np.unique(colors_0, return_inverse=True)

    for it in range(0, wl_iterations + 1):
        if label_name:
            # Map colors into a single color vector
            colors_all = np.array([colors_0, colors_1])
            colors_all = [hash(tuple(row)) for row in colors_all.T]
            _, colors_all = np.unique(colors_all, return_inverse=True)
            max_all = int(np.amax(colors_all) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_all[index[0]:index[1] + 1],
                                            minlength=max_all)))
                for i, index in enumerate(graph_indices)]

            # Avoid coloring computation in last iteration
            if it < wl_iterations:
                colors_0 = self.wl_coloring(M, colors_0, log_primes[0:len(colors_0)])
                colors_1 = self.wl_coloring(M, colors_1, log_primes[0:len(colors_1)])
        else:
            max_1 = int(np.amax(colors_1) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_1[index[0]:index[1] + 1],
                                            minlength=max_1)))
                for i, index in enumerate(graph_indices)]

            # Avoid coloring computation in last iteration
            if it < wl_iterations:
                colors_1 = self.wl_coloring(M, colors_1, log_primes[0:len(colors_1)])

    return lil.lil_matrix(feature_vectors, dtype=np.float64)  # each feature vector becomes a row
def __nearest_neighbors_search(pipe_to_exec, source_file_path, file_path):
    '''
    Runs the "pipe_to_exec" nearest neighbors search estimator.

    Parameters:
    * source_file_path: hdf file in which input documents, queries and targets are stored
    * file_path: hdf filename where nns results will be stored
    '''
    d = hdf_to_sparse_matrix('documents', source_file_path)
    pipe_to_exec.fit(d, None)
    d_mean_time = pipe_to_exec.steps[0][1].fit_time
    print("fitted in %f s" % (d_mean_time))
    del d

    q = hdf_to_sparse_matrix('queries', source_file_path)
    d_indices, qd_distances, q_mean_time = pipe_to_exec.transform(q)

    time_dataframe = pd.DataFrame({
        'documents_mean_time': [d_mean_time],
        'queries_mean_time': [q_mean_time],
    })

    # storing nearest neighbors search results
    time_dataframe.to_hdf(file_path.replace('results.h5', 'time.h5'), 'time_dataframe')
    sparse_matrix_to_hdf(d_indices, 'retrieved_docs', file_path)
    sparse_matrix_to_hdf(lil_matrix(qd_distances), 'qd_distances', file_path)
    del q, d_mean_time, q_mean_time, qd_distances, time_dataframe

    # Evaluating results in terms of Precision, Recall and MAP.
    t = hdf_to_sparse_matrix('targets', source_file_path)

    retrieved_relevants = []
    for q_index in range(d_indices.shape[0]):
        q_retrieved_relevants = np.cumsum(t[q_index, d_indices[q_index, :]].A, axis=1)
        retrieved_relevants.append(q_retrieved_relevants)
    retrieved_relevants = vstack(retrieved_relevants)

    # broadcasting
    approachi_recalls = np.divide(retrieved_relevants, np.matrix(t.sum(axis=1)))
    ranking_sum = np.multiply(np.ones(retrieved_relevants.shape),
                              np.matrix(range(1, retrieved_relevants.shape[1] + 1)))
    approachi_precisions = np.divide(retrieved_relevants, ranking_sum)

    average_precision = np.zeros((d_indices.shape[0], 1))
    for q_index in range(d_indices.shape[0]):
        relevants_precision = np.multiply(approachi_precisions[q_index, :],
                                          t[q_index, d_indices[q_index, :]].A)
        average_precision[q_index, 0] = relevants_precision.mean(axis=1)

    del d_indices, retrieved_relevants

    for to_store, to_store_name in [(approachi_precisions, 'precisions'),
                                    (approachi_recalls, 'recalls'),
                                    (average_precision, 'average_precisions')]:
        if not issparse(to_store):
            to_store = csr_matrix(to_store)
        sparse_matrix_to_hdf(to_store, to_store_name,
                             file_path.replace('results', 'results_evaluation'))
        del to_store
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_set_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into a new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local scaling
    methods implemented: the original one and NICDM. Both reduce hubness in
    distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        sort_order = -1
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else:  # metric == 'distance':
        sort_order = 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if issparse(D):
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)
    n = D.shape[0]
    if test_set_ind is None:
        train_set_ind = slice(0, n)  # take all
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    r = np.zeros(n)
    for i in range(n):
        if issparse(D):
            di = D[i, train_set_ind].toarray()
        else:
            di = D[i, train_set_ind]
        di[i] = exclude
        nn = np.argsort(di)[::sort_order]
        r[i] = di[nn[k - 1]]  # largest similarities or smallest distances

    if issparse(D):
        D_ls = lil_matrix(D.shape)
    else:
        D_ls = np.zeros_like(D)

    for i in range(n):
        # vectorized inner loop: calc only triu part
        tmp = np.empty(n - i)
        tmp[0] = self_tmp_value
        if metric == 'similarity':
            tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        else:
            tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        D_ls[i, i:] = tmp
    # copy triu to tril -> symmetric matrix (diag=zeros)
    # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
    D_ls += D_ls.T

    if issparse(D):
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
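# Hedged, self-contained sketch of the core local-scaling transform used by the
# functions above, without the hub-toolbox logging/validation helpers: r holds
# each point's distance to its k-th nearest neighbour, and the secondary distance
# is 1 - exp(-d_ij^2 / (r_i * r_j)). Function and variable names are illustrative.
import numpy as np

def local_scaling_sketch(D, k=7):
    D = np.copy(D)
    np.fill_diagonal(D, np.inf)                    # exclude self distances
    r = np.partition(D, kth=k - 1, axis=1)[:, k - 1]  # k-th NN distance per row
    D_ls = 1 - np.exp(-D**2 / (r[:, None] * r[None, :]))
    np.fill_diagonal(D_ls, 0)                      # restore self distances
    return D_ls

# Example: symmetric Euclidean distances between a few random points.
rng = np.random.RandomState(0)
X = rng.rand(20, 3)
D = np.sqrt(((X[:, None, :] - X[None, :, :])**2).sum(-1))
print(local_scaling_sketch(D, k=5).shape)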
def mock_vec_transform(X):
    ret = lil_matrix((len(X), 5000))
    for idx, x in enumerate(X):
        ret[idx] = len(x)
    return ret
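# Hedged usage sketch for mock_vec_transform, presumably a stand-in for a text
# vectorizer in tests: assigning a scalar to a lil_matrix row broadcasts the
# input's length across all 5000 columns. The inputs below are illustrative.
texts = ["abc", "de"]
vectors = mock_vec_transform(texts)
print(vectors.shape)  # (2, 5000)
print(vectors[0, 0])  # 3.0, the length of the first input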
def weisfeiler_lehman_subtree_kernel(graph_db, hashed_attributes, *kwargs):
    iterations = kwargs[0]
    compute_gram_matrix = kwargs[1]
    normalize_gram_matrix = kwargs[2]
    use_labels = kwargs[3]

    # Create one empty feature vector for each graph
    feature_vectors = []
    for _ in graph_db:
        feature_vectors.append(np.zeros(0, dtype=np.float64))

    # Construct block diagonal matrix of all adjacency matrices
    adjacency_matrices = []
    for g in graph_db:
        adjacency_matrices.append(gt.adjacency(g))
    M = sp.sparse.block_diag(tuple(adjacency_matrices), dtype=np.float64, format="csr")
    num_vertices = M.shape[0]

    # Load list of precalculated logarithms of prime numbers
    log_primes = log_pl.log_primes[0:num_vertices]

    # Color vector representing labels
    colors_0 = np.zeros(num_vertices, dtype=np.float64)
    # Color vector representing hashed attributes
    colors_1 = hashed_attributes

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []
    for g in graph_db:
        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()
        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Map labels to [0, number_of_colors)
    if use_labels:
        _, colors_0 = np.unique(colors_0, return_inverse=True)

    for it in range(0, iterations + 1):
        if use_labels:
            # Map colors into a single color vector
            colors_all = np.array([colors_0, colors_1])
            colors_all = [hash(tuple(row)) for row in colors_all.T]
            _, colors_all = np.unique(colors_all, return_inverse=True)
            max_all = int(np.amax(colors_all) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_all[index[0]:index[1] + 1],
                                            minlength=max_all)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_0 = compute_coloring(M, colors_0, log_primes[0:len(colors_0)])
                colors_1 = compute_coloring(M, colors_1, log_primes[0:len(colors_1)])
        else:
            max_1 = int(np.amax(colors_1) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_1[index[0]:index[1] + 1],
                                            minlength=max_1)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_1 = compute_coloring(M, colors_1, log_primes[0:len(colors_1)])

    if not compute_gram_matrix:
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)
        gram_matrix = gram_matrix.toarray()
        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix