def shortest_path_cover_logn_apx(g: gt.Graph, weight: gt.EdgePropertyMap):
    """Greedily collect paths until every vertex of ``g`` lies on one.

    Each round asks ``shortest_path_visiting_most_nodes`` for paths given the
    currently covered vertices, records them, and marks their vertices as
    covered. The loop ends once all vertices are covered.

    Edge weights are turned into integers that are multiples of
    ``num_vertices + 1`` and then lowered by one. When a vertex becomes
    covered, each of its incoming edges is raised by one again, restoring an
    exact multiple. (Presumably this makes edges into still-uncovered
    vertices marginally cheaper so the helper prefers them -- confirm against
    ``shortest_path_visiting_most_nodes``.)

    An undirected graph is temporarily made directed by appending the
    reverse of every edge; it is switched back and the appended edges are
    removed before returning. ``weight`` is written to for the appended
    edges.

    Parameters
    ----------
    g : gt.Graph
        Graph to cover. Mutated temporarily when it is undirected.
    weight : gt.EdgePropertyMap
        Edge weights. Non-integer value types are rounded up with
        ``np.ceil`` before scaling.

    Returns
    -------
    list
        The paths returned by the helper, in the order they were produced.

    Raises
    ------
    SystemExit
        With status 5 if an incoming edge weight is not a multiple of
        ``num_vertices + 1`` after its increment (internal invariant).
    """
    started_with_directed = g.is_directed()
    if not started_with_directed:
        # Simulate the undirected graph with a pair of opposite arcs per edge.
        reversed_edges = np.fliplr(g.get_edges())
        n_reversed = reversed_edges.shape[0]
        g.set_directed(True)
        g.add_edge_list(reversed_edges)
        # The appended arcs occupy the tail of the weight array; give each
        # the weight of the edge it mirrors.
        weight.a[-n_reversed:] = weight.a[:n_reversed]

    scale = g.num_vertices() + 1
    integer_types = ["bool", "int", "int16_t", "int32_t", "int64_t"]
    if weight.value_type() not in integer_types:
        # eps is a placeholder granularity (a data-driven value was planned);
        # the division is kept because it also promotes the array to float.
        eps = 1
        # np.int was removed in NumPy 1.24; int64 matches the "long" map below.
        scaled_weight = (np.ceil(weight.a / eps) * scale).astype(np.int64)
    else:
        scaled_weight = weight.a * scale

    summed_edge_weight = np.sum(scaled_weight)
    adjusted_weight = g.new_edge_property("long", vals=scaled_weight - 1)

    paths = []
    covered_vertices = set()
    while len(covered_vertices) != g.num_vertices():
        curr_paths = shortest_path_visiting_most_nodes(
            g, adjusted_weight, covered_vertices, summed_edge_weight)
        for path in curr_paths:
            paths.append(path)
            path_vertices = set(path)
            new_covered = path_vertices.difference(covered_vertices)
            for v in new_covered:
                for w in g.get_in_neighbors(v):
                    edge = g.edge(w, v)
                    adjusted_weight[edge] += 1
                    if adjusted_weight[edge] % scale != 0:
                        # Invariant broken: the edge was not at "multiple - 1".
                        raise SystemExit(5)
            covered_vertices = covered_vertices.union(path_vertices)
            print(len(new_covered), len(path), len(covered_vertices), path)

    if not started_with_directed:
        # NOTE(review): the lookup happens after switching back to undirected,
        # so g.edge() may return either arc of a pair -- confirm that the
        # surviving edge/index is the one callers expect.
        g.set_directed(False)
        for e in reversed_edges:
            g.remove_edge(g.edge(e[0], e[1]))
    return paths
def cumulative_cooccurrence_graph(steps, sequences, directed=False):
    '''cumulative_cooccurrence_graph
    Creates a cumulative cooccurrence graph.

    Parameters
    ----------
        steps : :obj:`iter` of :obj:`int` or :obj:`str`
            A series that contains sequential labels for the nested groups.
            Must support slicing and negative indexing.
        sequences : :obj:`iter` of :obj:`iter` of :obj:`int`
            Nested iterable of integers representing vertices in the graph.
            Number of nested iterables should be equal to `len(steps)`.
        directed : :obj:`bool`
            Passed to the `Graph` constructor. Per-step pair counting
            always uses sorted, unordered pairs regardless of this flag.

    Returns
    -------
        g : :obj:`graph_tool.Graph`
            A graph. Vertices are elements. Edges link terms that have
            cooccurred at least once in the series. The step labels are
            stored in the graph property `steps`.
        o_props : :obj:`dict`
            Property maps with vertex occurrence values at each step.
        o_cumsum_props : :obj:`dict`
            Property maps with cumulative vertex occurrence values at
            each step.
        co_props : :obj:`dict`
            Property maps with edge cooccurrence values at each step.
        co_cumsum_props : :obj:`dict`
            Property maps with cumulative edge cooccurrence values at
            each step.
    '''
    g = Graph(directed=directed)

    # Totals over the whole series double as the cumulative value of the
    # final step.
    o_total = Counter(chain(*chain(*sequences)))
    n_vertices = len(o_total)
    g.add_vertex(n_vertices)
    o_max = dict_to_vertex_prop(g, o_total, 'int')

    co_total = cooccurrence_counts(chain(*sequences))
    edge_list = ((c[0], c[1], count) for c, count in co_total.items())
    co_max = g.new_edge_property('int')
    g.add_edge_list(edge_list, eprops=[co_max])

    # Map (source, target) -> edge index. Read the index from the graph's
    # edge_index map instead of a third column of get_edges(), which is only
    # present in older graph-tool releases.
    edge_indices = {
        (int(e.source()), int(e.target())): int(g.edge_index[e])
        for e in g.edges()
    }

    o_props = {}
    co_props = {}
    o_cumsum_props = {}
    co_cumsum_props = {}

    # Running totals start at zero so that a single-step series (where the
    # loop below does not execute) still has something to subtract.
    o_cumsum = g.new_vertex_property('int')
    co_cumsum = g.new_edge_property('int')

    for step, seq in zip(steps[:-1], sequences[:-1]):
        logging.info(f'Calculating cooccurrences at step {step}')

        o_step = Counter(chain(*seq))
        o_props[step] = dict_to_vertex_prop(g, o_step, 'int')

        combos = (combinations(sorted(ids), 2) for ids in seq)
        co_step = Counter(chain(*combos))
        co_props[step] = dict_to_edge_prop(g, co_step, 'int', edge_indices)

        o_next = g.new_vertex_property('int')
        co_next = g.new_edge_property('int')
        o_next.a = o_cumsum.a + o_props[step].a
        co_next.a = co_cumsum.a + co_props[step].a

        o_cumsum = o_next
        co_cumsum = co_next
        o_cumsum_props[step] = o_cumsum
        co_cumsum_props[step] = co_cumsum

    # fill in the last step without needing to count occurrences
    # or cooccurrences: it is whatever remains of the series totals
    step_max = steps[-1]
    o = g.new_vertex_property('int')
    co = g.new_edge_property('int')
    o.a = o_max.a - o_cumsum.a
    co.a = co_max.a - co_cumsum.a
    o_props[step_max] = o
    co_props[step_max] = co
    o_cumsum_props[step_max] = o_max
    co_cumsum_props[step_max] = co_max

    steps_prop = g.new_graph_property('vector<int>')
    steps_prop.set_value(steps)
    g.gp['steps'] = steps_prop

    return g, o_props, o_cumsum_props, co_props, co_cumsum_props
class GraphDataset:
    """
    Class for managing datasets with graph data
    """

    def __init__(self, name, edges, object_ids, weights, hidden_graph=None):
        """
        Params:
            name (str): unique string to name this dataset (for pickling and
                unpickling)
            edges (numpy.ndarray): numpy array of shape [num_edges, 2]
                containing the indices of nodes in all edges
            object_ids (List[str]): string object ids for all nodes
            weights (numpy.ndarray): numpy array of shape [num_edges]
                containing edge weights
            hidden_graph (GraphDataset): Graph data that should be excluded
                but not considered as negative edges. (i.e. train
                edges should not be in eval dataset but they shouldn't be
                counted as negatives either)
        """
        self.name = name
        self.edges = edges
        self.object_ids = np.asarray(object_ids)
        self.weights = weights
        self.hidden_graph = hidden_graph

        # Undirected graph with one vertex per object id and a float weight
        # attached to every edge.
        self.graph = Graph(directed=False)
        self.graph.add_vertex(len(object_ids))
        edge_weights = [[edge[0], edge[1], weight]
                        for edge, weight in zip(self.edges, self.weights)]
        self.weight_property = self.graph.new_edge_property("float")
        eprops = [self.weight_property]
        self.graph.add_edge_list(edge_weights, eprops=eprops)
        self.manifold_nns = None

    def gen_neighbor_data(self, verbose=True) -> Dict:
        """
        Generates the graph data needed to run the cython iterator
        Returns a dict with the neighbor data which will have values

        - 'non_empty_vertices' int64 array with the indices of vertices
            that have at least one incident edge
        - 'empty_vertices' int64 array with the indices of vertices that
            have no incident edge
        - 'all_graph_neighbors' a list of int64 arrays such that the
            neighbors of the vertex with index non_empty_vertices[i] are
            stored in all_graph_neighbors[i]
        - 'all_graph_weights' a list of float32 arrays such that
            all_graph_weights[i][j] represents the weight of the
            connection in all_graph_neighbors[i][j]
        - 'N' number of nodes in the graph

        Parameters:
            verbose (bool): should graph loading be printed out
        """
        all_graph_neighbors = []
        all_graph_weights = []
        non_empty_vertices = []
        empty_vertices = []

        if verbose:
            iterator = tqdm(range(self.n_nodes()),
                            desc="Generating Neighbor Data",
                            dynamic_ncols=True)
        else:
            iterator = range(self.n_nodes())

        for i in iterator:
            # Rows are (source, target, weight).
            in_edges = self.graph.get_in_edges(i, [self.weight_property])
            out_edges = self.graph.get_out_edges(i, [self.weight_property])

            if in_edges.size + out_edges.size == 0:
                empty_vertices.append(i)
                continue

            non_empty_vertices.append(i)
            if in_edges.size == 0:
                neighbors = out_edges[:, 1]
                weights = out_edges[:, 2]
            elif out_edges.size == 0:
                # For an in-edge the neighbor is the source (column 0);
                # column 1 is vertex i itself.
                neighbors = in_edges[:, 0]
                weights = in_edges[:, 2]
            else:
                neighbors = np.concatenate([in_edges[:, 0], out_edges[:, 1]])
                weights = np.concatenate([in_edges[:, 2], out_edges[:, 2]])

            all_graph_neighbors.append(neighbors.astype(np.int64))
            all_graph_weights.append(weights.astype(np.float32))

        non_empty_vertices = np.array(non_empty_vertices, dtype=np.int64)
        empty_vertices = np.array(empty_vertices, dtype=np.int64)

        return {
            "all_graph_neighbors": all_graph_neighbors,
            "all_graph_weights": all_graph_weights,
            "non_empty_vertices": non_empty_vertices,
            "empty_vertices": empty_vertices,
            "N": self.n_nodes()
        }

    def add_manifold_nns(self, graph_embedder: GraphEmbedder):
        """
        Builds a ManifoldNNS over the embedder's points for this graph's
        nodes and stores it on `self.manifold_nns`.

        Parameters:
            graph_embedder (GraphEmbedder): source of the manifold and of
                the first `n_nodes()` node points
        """
        manifold = graph_embedder.get_manifold()
        data_points = graph_embedder.retrieve_nodes(self.n_nodes())
        self.manifold_nns = ManifoldNNS(data_points, manifold)

    def n_nodes(self) -> int:
        """
        Returns the number of nodes in the graph
        """
        return len(self.object_ids)

    def collapse_nodes(self, node_ids):
        """
        Removes the given nodes, first connecting every pair of each
        removed node's neighbors with a new edge.

        The new edge between neighbors a and b of a removed node gets
        weight (w_a + w_b) / 4, where w_a and w_b are the weights of the
        edges that linked a and b to the removed node. Afterwards
        `object_ids`, `edges` and `weights` are refreshed from the graph.

        Parameters:
            node_ids: integer indices of the nodes to remove
        """
        all_new_edges = []
        for node_id in tqdm(node_ids, desc="Collapsing Nodes",
                            dynamic_ncols=True):
            in_edges = self.graph.get_in_edges(node_id,
                                               [self.weight_property])
            out_edges = self.graph.get_out_edges(node_id,
                                                 [self.weight_property])
            # (neighbor, weight) rows: targets of out-edges, sources of
            # in-edges.
            neighbors = np.concatenate([out_edges[:, 1:3],
                                        in_edges[:, 0:3:2]])
            if neighbors.shape[0] > 1:
                neighbor_combos = \
                    neighbors[comb_index(neighbors.shape[0], 2)]
                # Flatten each pair to (node_a, weight_a, node_b, weight_b).
                neighbor_combos = \
                    neighbor_combos.reshape(neighbor_combos.shape[0], 4)
                new_edges = np.zeros((neighbor_combos.shape[0], 3))
                new_edges[:, :2] = neighbor_combos[:, 0:3:2]
                new_edges[:, 2] = (neighbor_combos[:, 1] +
                                   neighbor_combos[:, 3]) / 4
                all_new_edges.append(new_edges)

        # np.concatenate raises on an empty list, which happens when no
        # collapsed node has two or more neighbors.
        if all_new_edges:
            self.graph.add_edge_list(np.concatenate(all_new_edges),
                                     eprops=[self.weight_property])

        # An explicit integer dtype keeps np.delete working when node_ids is
        # empty (np.array([]) would be a float index array).
        self.object_ids = np.delete(self.object_ids,
                                    np.asarray(node_ids, dtype=np.int64))
        self.graph.remove_vertex(node_ids)

        edges_weights = self.graph.get_edges(eprops=[self.weight_property])
        self.edges = edges_weights[:, 0:2]
        self.weights = edges_weights[:, 2]

    def get_neighbor_iterator(
            self,
            graph_sampling_config: GraphSamplingConfig,
            data_fraction: float = 1,
    ) -> Iterator[GraphDataBatch]:
        """
        Gets an efficient iterator of edge batches

        Neighbor data is obtained through `load_or_gen` keyed by the
        dataset name (presumably a cache -- confirm in `load_or_gen`). When
        a hidden graph is set, its neighbor data is passed to the iterator
        as well. If manifold nearest neighbors have been added, they are
        queried and handed to the iterator before it is returned.

        Parameters:
            graph_sampling_config (GraphSamplingConfig): forwarded to
                GraphDataBatchIterator
            data_fraction (float): stored on the iterator as
                `data_fraction`
        """
        neighbor_data = load_or_gen(f"GraphDataset.{self.name}",
                                    self.gen_neighbor_data)
        if self.hidden_graph is None:
            # GraphDataBatchIterator is defined in cython with these arguments.
            # noinspection PyArgumentList
            iterator = GraphDataBatchIterator(neighbor_data,
                                              graph_sampling_config)
        else:
            hidden_neighbor_data = load_or_gen(
                f"GraphDataset.{self.hidden_graph.name}",
                self.hidden_graph.gen_neighbor_data)
            # GraphDataBatchIterator is defined in cython with these arguments.
            # noinspection PyArgumentList
            iterator = GraphDataBatchIterator(neighbor_data,
                                              graph_sampling_config,
                                              hidden_neighbor_data)
        iterator.data_fraction = data_fraction

        if self.manifold_nns is not None:
            sampling_config = get_config().sampling
            _, nns = \
                self.manifold_nns.knn_query_all(sampling_config.manifold_nn_k)
            # Drop the first result of every query (presumably the query
            # point itself -- confirm against ManifoldNNS).
            all_manifold_neighbors = [
                nns[i][1:].astype(np.int64) for i in range(self.n_nodes())
            ]
            iterator.refresh_manifold_nn(all_manifold_neighbors)

        return iterator

    @classmethod
    def make_train_eval_split(cls, name, edges, object_ids, weights):
        """
        Returns a tuple of a train eval split of the graph as defined in the
        data config.

        The global numpy RNG is seeded with `split_seed`. With
        `split_by_edges`, a `split_size` fraction of the edges becomes the
        eval set and nothing is written to disk. Otherwise a `split_size`
        fraction of the nodes is drawn for eval (and, with
        `generate_test_set`, an equally sized disjoint set for test); an
        edge goes to test if it touches a test node, else to eval if it
        touches an eval node, else to train, and each split is written
        with `save_graph_data`.

        Params:
            name (str): base name; "_train" / "_eval" are appended
            edges (numpy.ndarray): array of shape [num_edges, 2]
            object_ids (List[str]): string object ids for all nodes
            weights (numpy.ndarray): array of shape [num_edges]

        Returns:
            (GraphDataset, GraphDataset): train data and eval data; the eval
            dataset uses the train dataset as its hidden graph
        """
        data_config = get_config().data
        np.random.seed(data_config.split_seed)
        if data_config.split_by_edges:
            # TODO Doesn't save to file in this mode
            shuffle_order = np.arange(edges.shape[0])
            np.random.shuffle(shuffle_order)
            num_eval = floor(edges.shape[0] * data_config.split_size)
            eval_indices = shuffle_order[:num_eval]
            train_indices = shuffle_order[num_eval:]
            train_edges = edges[train_indices]
            train_weights = weights[train_indices]
            eval_edges = edges[eval_indices]
            eval_weights = weights[eval_indices]
        else:
            shuffle_order = np.arange(len(object_ids))
            np.random.shuffle(shuffle_order)
            num_eval = floor(len(object_ids) * data_config.split_size)
            test_set = data_config.generate_test_set

            # Sets give constant-time membership tests in the edge loop;
            # `x in ndarray` would scan the whole array for every endpoint.
            eval_nodes = set(shuffle_order[:num_eval].tolist())
            if test_set:
                test_nodes = set(
                    shuffle_order[num_eval:2 * num_eval].tolist())
                test_edges = []
                test_weights = []

            train_edges = []
            eval_edges = []
            train_weights = []
            eval_weights = []

            for edge, weight in zip(edges, weights):
                src, dst = edge[0], edge[1]
                if test_set and (src in test_nodes or dst in test_nodes):
                    test_edges.append(edge)
                    test_weights.append(weight)
                elif src in eval_nodes or dst in eval_nodes:
                    eval_edges.append(edge)
                    eval_weights.append(weight)
                else:
                    train_edges.append(edge)
                    train_weights.append(weight)

            if test_set:
                save_graph_data(test_edges, test_weights, object_ids,
                                data_config.test_path)
            save_graph_data(train_edges, train_weights, object_ids,
                            data_config.train_path)
            save_graph_data(eval_edges, eval_weights, object_ids,
                            data_config.eval_path)

            train_edges = np.array(train_edges)
            eval_edges = np.array(eval_edges)
            train_weights = np.array(train_weights)
            eval_weights = np.array(eval_weights)

        train_data = GraphDataset(f"{name}_train", train_edges, object_ids,
                                  train_weights)
        eval_data = GraphDataset(f"{name}_eval", eval_edges, object_ids,
                                 eval_weights, hidden_graph=train_data)
        return train_data, eval_data