def _dwpc_general_case(graph, metapath, damping=0, dtype=numpy.float64):
    """
    A slow but general function to compute the degree-weighted path count.

    Works by splitting the metapath at junctions where one node is joined
    to multiple nodes over a metaedge, delegating each expansion step to
    ``_node_to_children``.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    damping : float
    dtype : dtype object

    Returns
    -------
    tuple
        (start node identifiers, end node identifiers, DWPC matrix)
    """
    dwpc_step = functools.partial(_node_to_children, graph=graph,
                                  metapath=metapath, damping=damping,
                                  dtype=dtype)
    start_nodes, cols, adj = metaedge_to_adjacency_matrix(graph, metapath[0])
    rows, fin_nodes, adj = metaedge_to_adjacency_matrix(graph, metapath[-1])
    number_start = len(start_nodes)
    number_end = len(fin_nodes)
    dwpc_matrix = []
    if len(metapath) > 1:
        for i in range(number_start):
            # Expand the paths that begin at start node i, one metaedge
            # at a time.
            search = numpy.zeros(number_start, dtype=dtype)
            search[i] = 1
            step1 = [dwpc_step(node=search, metapath_index=0, history=None)]
            # The first step consumed metapath[0]; take the remaining
            # len(metapath) - 1 steps. (Replaces a manual `k` counter.)
            for _ in range(len(metapath) - 1):
                step2 = []
                for group in step1:
                    for child in group['children']:
                        # Sibling branches must not share history arrays,
                        # since _node_to_children mutates them in place.
                        hist = copy.deepcopy(group['history'])
                        out = dwpc_step(node=child,
                                        metapath_index=group['next_index'],
                                        history=hist)
                        if out['children']:
                            step2.append(out)
                step1 = step2
            # Every group in step2 already has non-empty children (filtered
            # above), so no further filtering is needed before summing.
            end_nodes = sum(
                child
                for group in step2
                for child in group['children']
            )
            # sum() of an empty sequence is the int 0; substitute an
            # all-zero row of the requested dtype in that case.
            if not isinstance(end_nodes, (list, numpy.ndarray)):
                end_nodes = numpy.zeros(number_end, dtype=dtype)
            dwpc_matrix.append(end_nodes)
    else:
        # A single-metaedge metapath is just its degree-weighted adjacency.
        dwpc_matrix = _degree_weight(adj, damping=damping, dtype=dtype)
    dwpc_matrix = numpy.array(dwpc_matrix, dtype=dtype)
    return start_nodes, fin_nodes, dwpc_matrix
def metapath_to_degree_dicts(graph, metapath):
    """
    Build degree-to-index mappings for the source and target ends of a
    metapath.

    Source degrees are the row sums of the first metaedge's adjacency
    matrix; target degrees are the column sums of the last metaedge's.
    """
    metapath = graph.metagraph.get_metapath(metapath)
    first_edge, last_edge = metapath[0], metapath[-1]
    _, _, first_adj = metaedge_to_adjacency_matrix(graph, first_edge,
                                                   dense_threshold=0.7)
    _, _, last_adj = metaedge_to_adjacency_matrix(graph, last_edge,
                                                  dense_threshold=0.7)
    source_degree_to_ind = degrees_to_degree_to_ind(first_adj.sum(axis=1).flat)
    target_degree_to_ind = degrees_to_degree_to_ind(last_adj.sum(axis=0).flat)
    return source_degree_to_ind, target_degree_to_ind
def _node_to_children(graph, metapath, node, metapath_index, damping=0,
                      history=None, dtype=numpy.float64):
    """
    Return a history-adjusted list of child nodes. Used in
    _dwpc_general_case.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    node : numpy.ndarray
    metapath_index : int
    damping : float
    history : numpy.ndarray
    dtype : dtype object

    Returns
    -------
    dict
        'children': list of single-nonzero child vectors,
        'history': the updated history mapping,
        'next_index': the metapath index the children should step from.
    """
    metaedge = metapath[metapath_index]
    # Metanodes appearing more than once in the metapath need visit
    # tracking so paths do not revisit them.
    node_counts = collections.Counter(metapath.get_nodes())
    repeated = {metanode for metanode, count in node_counts.items() if count > 1}
    if history is None:
        # Initialize each repeated metanode's history to all-ones
        # (every node still available to visit).
        history = {}
        for edge in metapath:
            if edge.target in repeated:
                target_count = len(metaedge_to_adjacency_matrix(graph, edge)[1])
                history[edge.target] = numpy.ones(target_count, dtype=dtype)
    history = history.copy()
    if metaedge.source in history:
        # Mark the current node as visited (in-place on the shared array).
        history[metaedge.source] -= numpy.array(node != 0, dtype=dtype)
    rows, cols, adj = metaedge_to_adjacency_matrix(graph, metaedge, dtype=dtype)
    adj = _degree_weight(adj, damping, dtype=dtype)
    vector = node @ adj
    if metaedge.target in history:
        # Zero out targets that were already visited.
        vector *= history[metaedge.target]
    # Split the propagated vector into one child per nonzero entry.
    children = [row for row in numpy.diag(vector) if row.any()]
    return {
        'children': children,
        'history': history,
        'next_index': metapath_index + 1,
    }
def dwwc_sequential(graph, metapath, damping=0.5, dense_threshold=0,
                    dtype=numpy.float64):
    """
    Compute the degree-weighted walk count (DWWC) in which nodes can be
    repeated within a path.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    damping : float
    dense_threshold : float (0 <= dense_threshold <= 1)
        density threshold at which a sparse matrix will automatically be
        converted to dense.
    dtype : dtype object
    """
    dwwc_matrix = None
    row_names = None
    for metaedge in metapath:
        rows, cols, adjacency = metaedge_to_adjacency_matrix(
            graph, metaedge, dense_threshold=dense_threshold, dtype=dtype)
        adjacency = _degree_weight(adjacency, damping, dtype=dtype)
        if dwwc_matrix is None:
            # First metaedge: remember the row labels and seed the product.
            row_names, dwwc_matrix = rows, adjacency
        else:
            dwwc_matrix = dwwc_matrix @ adjacency
        dwwc_matrix = sparsify_or_densify(dwwc_matrix, dense_threshold)
    return row_names, cols, dwwc_matrix
def dwpc_to_degrees(graph, metapath, damping=0.5, ignore_zeros=False):
    """
    Yield a description of each cell in a DWPC matrix adding source and
    target node degree info as well as the corresponding path count.
    """
    metapath = graph.metagraph.get_metapath(metapath)
    _, _, source_adj_mat = metaedge_to_adjacency_matrix(
        graph, metapath[0], dense_threshold=0.7)
    _, _, target_adj_mat = metaedge_to_adjacency_matrix(
        graph, metapath[-1], dense_threshold=0.7)
    source_degrees = source_adj_mat.sum(axis=1).flat
    target_degrees = target_adj_mat.sum(axis=0).flat
    del source_adj_mat, target_adj_mat

    # Node names, read from the graph's node tables.
    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')
    source_node_names = list(pandas.read_table(source_path)['name'])
    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')
    target_node_names = list(pandas.read_table(target_path)['name'])

    row_names, col_names, dwpc_matrix = graph.read_path_counts(
        metapath, 'dwpc', damping)
    # Mean-normalize, then arcsinh-transform the DWPC values.
    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())
    if scipy.sparse.issparse(dwpc_matrix):
        dwpc_matrix = dwpc_matrix.toarray()

    # Damping of 0.0 gives the unweighted path counts.
    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)
    if scipy.sparse.issparse(path_count):
        path_count = path_count.toarray()

    for row_ind in range(len(row_names)):
        for col_ind in range(len(col_names)):
            dwpc_value = dwpc_matrix[row_ind, col_ind]
            if ignore_zeros and dwpc_value == 0:
                continue
            yield collections.OrderedDict([
                ('source_id', row_names[row_ind]),
                ('target_id', col_names[col_ind]),
                ('source_name', source_node_names[row_ind]),
                ('target_name', target_node_names[col_ind]),
                ('source_degree', source_degrees[row_ind]),
                ('target_degree', target_degrees[col_ind]),
                ('path_count', path_count[row_ind, col_ind]),
                ('dwpc', dwpc_value),
            ])
def _multi_dot(metapath, order, i, j, graph, damping, dense_threshold, dtype):
    """
    Perform matrix multiplication with the given order. Modified from
    numpy.linalg.linalg._multi_dot (https://git.io/vh31f) which is released
    under a 3-Clause BSD License (https://git.io/vhCDC).
    """
    if i == j:
        # Base case: a single metaedge's degree-weighted adjacency matrix.
        _, _, adj_mat = metaedge_to_adjacency_matrix(
            graph, metapath[i], dense_threshold=dense_threshold, dtype=dtype)
        return _degree_weight(adj_mat, damping=damping, dtype=dtype)
    # Recurse on the two halves dictated by the precomputed split table.
    split = order[i, j]
    left = _multi_dot(metapath, order, i, split,
                      graph, damping, dense_threshold, dtype)
    right = _multi_dot(metapath, order, split + 1, j,
                       graph, damping, dense_threshold, dtype)
    return left @ right
def metaedge_to_data_array(graph, metaedge, dtype=numpy.bool_):
    """
    Return an xarray.DataArray adjacency matrix for the given metaedge,
    with dimensions named after the source and target metanode identifiers
    and coordinates given by the corresponding node identifiers.
    """
    source_node_ids, target_node_ids, adjacency_matrix = (
        metaedge_to_adjacency_matrix(graph, metaedge, dtype=dtype))
    return xarray.DataArray(
        adjacency_matrix,
        coords=(source_node_ids, target_node_ids),
        dims=(metaedge.source.identifier, metaedge.target.identifier),
        name=metaedge.get_unicode_str(),
    )
def _dwpc_short_repeat(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64):
    """
    One metanode repeated 3 or fewer times (A-A-A), not (A-A-A-A)

    This can include other random inserts, so long as they are not repeats.
    Must start and end with the repeated node. Acceptable examples:
    (A-B-A-A), (A-B-A-C-D-E-F-A), (A-B-A-A), etc.

    Parameters
    ----------
    graph : hetio.hetnet.Graph
    metapath : hetio.hetnet.MetaPath
    damping : float
    dense_threshold : float
    dtype : dtype object

    Returns
    -------
    tuple
        (row node identifiers, column node identifiers, DWPC matrix)
    """
    segments = get_segments(graph.metagraph, metapath)
    assert len(segments) <= 3
    # Account for different head and tail possibilities.
    head_segment = None
    tail_segment = None
    dwpc_matrix = None
    dwpc_tail = None
    # Label the segments as head, tail, and repeat
    # NOTE(review): assumes get_segments always yields exactly one segment
    # whose source equals its target (the repeat segment); if none did,
    # repeat_segment below would be unbound — verify against get_segments.
    for i, segment in enumerate(segments):
        if segment.source() == segment.target():
            repeat_segment = segment
        else:
            if i == 0:
                head_segment = segment
            else:
                tail_segment = segment
    # Calculate DWPC for the middle ("repeat") segment
    repeated_metanode = repeat_segment.source()
    # Positions within the repeat segment where the repeated metanode occurs
    # (2 for A-...-A, 3 for A-...-A-...-A).
    index_of_repeats = [
        i for i, v in enumerate(repeat_segment.get_nodes())
        if v == repeated_metanode
    ]
    # Multiply adjacency matrices up to the second occurrence of the
    # repeated metanode; remove_diag discards paths that revisit the
    # starting node.
    for metaedge in repeat_segment[:index_of_repeats[1]]:
        rows, cols, adj = metaedge_to_adjacency_matrix(
            graph, metaedge, dtype=dtype, dense_threshold=dense_threshold)
        adj = _degree_weight(adj, damping, dtype=dtype)
        if dwpc_matrix is None:
            row_names = rows  # row labels come from the segment's first metaedge
            dwpc_matrix = adj
        else:
            dwpc_matrix = dwpc_matrix @ adj
    dwpc_matrix = remove_diag(dwpc_matrix, dtype=dtype)
    # Extra correction for random metanodes in the repeat segment
    if len(index_of_repeats) == 3:
        # Three occurrences (A-...-A-...-A): compute the second half
        # separately, then combine, removing diagonal (self) paths at
        # each stage.
        for metaedge in repeat_segment[index_of_repeats[1]:]:
            rows, cols, adj = metaedge_to_adjacency_matrix(
                graph, metaedge, dtype=dtype, dense_threshold=dense_threshold)
            adj = _degree_weight(adj, damping, dtype=dtype)
            if dwpc_tail is None:
                dwpc_tail = adj
            else:
                dwpc_tail = dwpc_tail @ adj
        dwpc_tail = remove_diag(dwpc_tail, dtype=dtype)
        dwpc_matrix = dwpc_matrix @ dwpc_tail
        dwpc_matrix = remove_diag(dwpc_matrix, dtype=dtype)
    # Column labels default to the last metaedge processed above; they are
    # replaced if a tail segment exists.
    col_names = cols
    if head_segment:
        # Prepend the head segment's DWPC; its rows become the row labels.
        row_names, cols, head_dwpc = dwpc(graph, head_segment,
                                          damping=damping,
                                          dense_threshold=dense_threshold,
                                          dtype=dtype)
        dwpc_matrix = head_dwpc @ dwpc_matrix
    if tail_segment:
        # Append the tail segment's DWPC; its columns become the col labels.
        rows, col_names, tail_dwpc = dwpc(graph, tail_segment,
                                          damping=damping,
                                          dense_threshold=dense_threshold,
                                          dtype=dtype)
        dwpc_matrix = dwpc_matrix @ tail_dwpc
    return row_names, col_names, dwpc_matrix