def __init__(self, graph: MultiDiGraph, deep_copy: bool = True, strong_components: dict = None,
             links_to_components: dict = None):
    """
    Initialise the instance state for the given graph.

    :param graph: graph to operate on
    :param deep_copy: when true, the instance works on ``graph.copy()``;
        otherwise it keeps a reference to the passed object
    :param strong_components: stored unchanged on the instance
    :param links_to_components: stored unchanged on the instance
    """
    self.graph = graph.copy() if deep_copy else graph
    self.strong_components = strong_components
    self.links_to_components = links_to_components
    # Per-instance bookkeeping, always starting out empty.
    self.visited_nodes = []
    self.nodes_threat = {}
def graphs_variance(previous_graph: nx.MultiDiGraph, current_graph: nx.MultiDiGraph) -> dict:
    """
    Summarise how the node set changed between two graphs.

    Only the node lists of both graphs are read; neither graph is modified,
    which is why no copies are taken.

    :param previous_graph: the earlier graph
    :param current_graph: the later graph
    :return: dict with the keys ``loss`` and ``gain`` (lengths of the two
        differences) and ``lost_nodes`` and ``gain_nodes`` (the differences
        themselves, as returned by ``my_list.diff``)
    """
    previous_nodes = list(previous_graph.nodes)
    current_nodes = list(current_graph.nodes)

    # NOTE(review): my_list.diff(a, b) is presumably "items of a that are not
    # in b" -- confirm against the helper's implementation.
    lost_nodes = my_list.diff(previous_nodes, current_nodes)
    gained_nodes = my_list.diff(current_nodes, previous_nodes)

    return {
        'loss': len(lost_nodes),
        'gain': len(gained_nodes),
        'lost_nodes': lost_nodes,
        'gain_nodes': gained_nodes
    }
def remove_nondot_keys(graph: nx.MultiDiGraph, inplace=False) -> nx.MultiDiGraph:
    """
    Removes every node and edge attribute whose key is not listed in
    ``config.graphviz_attrs``.

    Args:
        graph: the graph to clean
        inplace: if true, clean ``graph`` itself, otherwise clean a copy

    Returns:
        the cleaned graph (``graph`` itself if ``inplace``, else the copy)
    """
    target = graph if inplace else graph.copy()
    keep = set(config.graphviz_attrs)

    def drop_foreign_keys(attrs: Dict):
        # Collect the doomed keys first so the dict is not resized while iterated.
        doomed = [key for key in attrs if key not in keep]
        for key in doomed:
            del attrs[key]

    for node in target:
        drop_foreign_keys(target.nodes[node])
    # noinspection PyArgumentList
    for _, _, edge_attrs in target.edges(keys=False, data=True):
        drop_foreign_keys(edge_attrs)
    return target
def collapse_timeline(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Returns a copy of the graph without the date nodes that merely chain two
    other date nodes together.

    A date node (other than the earliest) is dropped when it has exactly one
    incoming and one outgoing edge and both neighbours are dates as well; its
    neighbours are then connected directly, reusing the attributes of the
    incoming edge with key 0.
    """
    g: nx.MultiDiGraph = graph.copy()
    dates = [node for node in g.nodes() if isinstance(node, date)]
    dates.sort()
    if not dates:
        return g  # nothing to do

    for current in dates[1:]:
        before = first(g.predecessors(current))
        after = first(g.successors(current))
        is_plain_link = g.in_degree(current) == 1 and g.out_degree(current) == 1
        if not (is_plain_link and isinstance(before, date) and isinstance(after, date)):
            continue
        g.add_edge(before, after, **g[before][current][0])
        g.remove_node(current)
    return g
def convert_to_digraph(G_orig: nx.MultiDiGraph) -> nx.DiGraph:
    """
    Flatten a MultiDiGraph into a DiGraph, keeping the shortest parallel edge.

    For every ordered node pair connected by more than one edge, all of those
    edges are replaced by a single edge carrying the attributes of the edge
    with the smallest ``length``. Pairs with a single edge are left untouched.

    :param G_orig: the multigraph to convert; it is copied, never mutated
    :return: a ``nx.DiGraph`` built from the de-duplicated copy
    :raises KeyError: if one of a set of parallel edges has no ``length``
    """
    # Prevent upstream impacts
    G = G_orig.copy()

    # First pass: decide which attributes survive for each pair that has
    # parallel edges. The edge keys are read from the adjacency itself, so
    # this also works when the keys are not the contiguous range 0..n-1
    # (e.g. after earlier edge removals). Nothing is mutated yet because the
    # adjacency must not change while it is being iterated.
    replacements = []
    for fr, neighbours in G.adjacency():
        for to, keyed_edges in neighbours.items():
            if len(keyed_edges) > 1:
                # We optimistically select the fastest (shortest) edge and
                # all of its associated key/values
                fastest_e = min(keyed_edges.values(), key=lambda x: x['length'])
                replacements.append((fr, to, list(keyed_edges), dict(fastest_e)))

    # Second pass: start from a fresh slate for each affected pair and add
    # back a new, single, edge
    for fr, to, keys, attrs in replacements:
        G.remove_edges_from([(fr, to, key) for key in keys])
        G.add_edge(fr, to, **attrs)

    # Now we should be safe to return a clean directed graph object
    return nx.DiGraph(G)
def test_EncodeNodes_llvm_program_graph(llvm_program_graph_nx: nx.MultiDiGraph):
    """Black-box test: encoding a copy of an LLVM program graph annotates all nodes."""
    encoder = node_encoder.GraphNodeEncoder()
    g = llvm_program_graph_nx.copy()
    encoder.EncodeNodes(g)

    # This assumes that all of the test graphs have at least one statement.
    num_statements = 0
    for _, data in g.nodes(data=True):
        if data["type"] == programl_pb2.Node.STATEMENT:
            num_statements += 1
    assert num_statements >= 1

    # Every node must carry the expected attributes after encoding.
    for _, data in g.nodes(data=True):
        assert len(data["x"]) == 1
        assert len(data["y"]) == 0
        assert "preprocessed_text" in data
def linefy_all_geom(G_original: nx.MultiDiGraph):
    '''
    Replaces the geometry of every edge with a straight line between the
    coordinates of its two end nodes.

    :param G_original: networkx graph object; a copy is modified, not this object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()
    nodes = G.nodes
    for start, end, attrs in G.edges(data=True):
        start_xy = [nodes[start]['x'], nodes[start]['y']]
        end_xy = [nodes[end]['x'], nodes[end]['y']]
        attrs['geometry'] = LineString([start_xy, end_xy])
    return G


# import queue
# def add_direct_edges(G_original: nx.MultiDiGraph, threshold = 2000):
#     G = G_original.copy()
#     for n in G.nodes:
#         q = queue.Queue()
#         neighbors = list(G.successors(n))
#         q.put(n)
#         #neighbors = set()
#         distances = {}
#         distances[n] = 0
#         while q.qsize() > 0:
#             v = q.get()
#             for e in list(G.out_edges(v, data=True, keys=True)):
#                 _, successor, _, info = e
#                 if (successor not in distances or distances[v] + info['length'] < distances[successor])\
#                         and distances[v] + info['length'] < threshold:
#                     q.put(successor)
#                     #neighbors.add(successor)
#                     distances[successor] = distances[v] + info['length']
#
#         for successor in distances.keys():
#             if successor not in neighbors:
#                 G.add_edge(n, successor, length=distances[successor])
#
#     print("%d to %d by add edge ." % (G_original.number_of_edges(), G.number_of_edges()))
#
#     return G
def __init__(self, multiDiGraph: MultiDiGraph):
    """
    Store the graph, take a copy of it, reset all bookkeeping and then
    run ``self._init_graph()``.

    :param multiDiGraph: Nx MultiDiGraph used to compute threat; kept by reference
    """
    self.nxGraph = multiDiGraph
    self.previousCopy = multiDiGraph.copy()

    # Threat calculator with an ability to remember threat of node
    self.threat_calc = ThreatCalculator(self.nxGraph)

    # Scalar state
    self.components_index = 0
    self.graph_threat = 0

    # Containers, all starting out empty
    self.strong_components = {}
    self.links_to_strong_components = {}
    self.nodes_threat = {}
    self.vulns = {}
    self.strong_components_to_update = set()
    self.nodes = []
    self.edges = []

    self._init_graph()
def simplify_graph_remove_boundary_nodes(G_original: nx.MultiDiGraph):
    '''
    Removes dangling roads at the boundary.

    A node counts as dangling when it has exactly one incoming and one
    outgoing edge and both connect it to the same neighbour. Removal is
    repeated until a pass finds no such node.

    :param G_original: networkx graph object; a copy is modified, not this object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()

    def is_dangling(n):
        ins = list(G.predecessors(n))
        outs = list(G.successors(n))
        return (G.in_degree(n) == 1 and G.out_degree(n) == 1
                and len(ins) == 1 and len(outs) == 1
                and ins[0] == outs[0])

    dangling = [n for n in list(G.nodes()) if is_dangling(n)]
    while dangling:
        G.remove_nodes_from(dangling)
        # Removing a dead end may expose a new one, so scan again.
        dangling = [n for n in list(G.nodes()) if is_dangling(n)]

    print("Boundary Removed")
    print_graph_info(G)
    return G
def r1_minus(g: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Return a copy of ``g`` in which qualifying self-looped nodes are bypassed
    and removed.

    A node qualifies when its ``parity`` attribute is ``"Odd"``, it is its own
    neighbour (self-loop), and the key-0 self-loop edge has different ``Tu``
    and ``Tv`` values. For such a node an edge ``u -> v`` is added, where ``u``
    is a predecessor other than the node (with that edge's ``Tu``) and ``v`` is
    a successor other than the node (with that edge's ``Tv``); the node itself
    is removed after the scan.

    :param g: input graph; it is copied, not modified
    :return: the modified copy
    """
    g = g.copy()
    del_nodes = []
    for node, parity in g.nodes(data="parity"):
        # print(node, parity)
        if (parity == "Odd") and (node in g.neighbors(node)):
            # Attributes of the self-loop with edge key 0.
            data = g[node][node][0]
            if data["Tu"] != data["Tv"]:
                # The loops below keep the LAST matching predecessor/successor
                # and rebind ``data`` to the per-neighbour key dict.
                # NOTE(review): if the node has no predecessor (or successor)
                # other than itself, u/Tu (or v/Tv) are never assigned here --
                # they are then either unbound or left over from an earlier
                # node. Confirm inputs always provide both.
                for pred_node, data in g.pred[node].items():
                    if pred_node != node:
                        u = pred_node
                        Tu = data[0]["Tu"]
                for succ_node, data in g.succ[node].items():
                    if succ_node != node:
                        v = succ_node
                        Tv = data[0]["Tv"]
                # print(u, v, Tu, Tv)
                # Node removal is deferred so the node view is not resized
                # while it is being iterated; the bypass edge is added at once.
                del_nodes.append(node)
                g.add_edge(u, v, Tu=Tu, Tv=Tv)
    for node in del_nodes:
        g.remove_node(node)
    return g
def simplify_graph(original_graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Builds a copy of the graph whose node labels and attributes are simple
    types only, so it can be serialized to, e.g., GEXF.

    Every node gets a ``kind`` attribute and is relabelled: dates by their ISO
    format, references by their URI, anything else by a base-62 rendering of
    its hash. Edges that have a ``source`` but no ``label`` get a label derived
    from the source(s).
    """
    graph = original_graph.copy()

    def describe(source):
        return f"{source.citation}: {source.detail}" if source.detail else source.citation

    new_labels = {}
    for node, attrs in graph.nodes(data=True):
        if isinstance(node, date):
            attrs['kind'] = 'date'
            new_labels[node] = node.isoformat()
        elif isinstance(node, Reference):
            attrs['label'] = node.label
            new_labels[node] = node.uri
            attrs['kind'] = node.side.value if isinstance(node, SplitReference) else node.__class__.__name__
        else:
            attrs['kind'] = type(node).__name__
            attrs['label'] = str(node)
            new_labels[node] = base_n(hash(node), 62)  # use a stable, short representation
        _simplify_attrs(attrs)

    nx.relabel_nodes(graph, new_labels, copy=False)

    for _, _, attrs in graph.edges(data=True):
        if 'source' in attrs and 'label' not in attrs:
            source_ = attrs['source']
            has_many = isinstance(source_, Sequence) and not isinstance(source_, str)
            attrs['label'] = '\n'.join(describe(s) for s in source_) if has_many else str(source_)
        _simplify_attrs(attrs)

    # noinspection PyTypeChecker
    return graph
def collapse_edges_by_source(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Returns a copy of the graph in which parallel edges that share end nodes,
    kind and source URI are merged into one edge.

    The merged edge sums the ``weight`` of its members (a missing weight counts
    as 1), records how many edges were ``collapsed`` and keeps the individual
    ``sources`` and ``xml`` values as lists. Edges without a ``source``
    attribute are left alone.
    """
    result = graph.copy()

    edge_groups = defaultdict(list)
    for edge in result.edges(keys=True, data=True):
        u, v, _, attr = edge
        if 'source' in attr:
            edge_groups[(u, v, attr['kind'], attr['source'].uri)].append(edge)

    for (u, v, kind, source_uri), group in edge_groups.items():
        if len(group) <= 1:
            continue
        logger.debug('Collapsing group %s', group)
        member_attrs = [attr for _, _, _, attr in group]
        group_attr = {
            'weight': sum(attr.get('weight', 1) for attr in member_attrs),
            'kind': kind,
            'collapsed': len(group),
            'source': BiblSource(source_uri),
            'sources': [attr['source'] for attr in member_attrs],
            'xml': [attr['xml'] for attr in member_attrs],
        }
        result.remove_edges_from(group)
        result.add_edge(u, v, **group_attr)
    return result
def simplify_graph_remove22(G_original: nx.MultiDiGraph):
    '''
    Simplifies <=><=> shaped road to <=>.

    A node ``b`` with exactly two incoming and two outgoing edges that connect
    it to two distinct neighbours ``a`` and ``c`` (one edge in each direction
    per neighbour) is removed and replaced by direct edges ``a -> c`` and
    ``c -> a``. Passes over the graph repeat until one changes nothing.

    :param G_original: networkx graph object; a copy is modified, not this object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()

    def set_geometry(e):
        # Give an edge without explicit geometry a straight line between the
        # x/y coordinates of its end nodes.
        u, v, _, edge_info = e
        if 'geometry' not in edge_info:
            ux = G.nodes[u]['x']
            uy = G.nodes[u]['y']
            vx = G.nodes[v]['x']
            vy = G.nodes[v]['y']
            edge_info['geometry'] = LineString([[ux, uy], [vx, vy]])

    while True:
        # Only used as a "did this pass change anything" marker.
        to_remove = []
        # Iterate over a snapshot of the nodes because the graph is modified
        # inside the loop; edges are queried fresh for each node.
        for n in list(G.nodes()):
            # a0 -> b0 -> c0
            # a1 <- b1 <- c1
            outs = list(G.out_edges(n, data=True, keys=True))
            ins = list(G.in_edges(n, data=True, keys=True))
            if len(ins) == 2 and len(outs) == 2:
                # Sort incoming edges by their source and outgoing edges by
                # their target so that index 0 / 1 refer to the same neighbour
                # in both lists.
                ins.sort(key=lambda x: x[0])
                outs.sort(key=lambda x: x[1])
                # Same two neighbours in both directions, the neighbours differ
                # from each other, and neither is n itself (no self-loops).
                if ins[0][0] == outs[0][1] and ins[1][0] == outs[1][
                        1] and ins[0][0] != outs[1][1] and (n != ins[0][0] and n != ins[1][0]):
                    a = ins[0][0]
                    c = ins[1][0]
                    b = n
                    for e in ins:
                        set_geometry(e)
                    for e in outs:
                        set_geometry(e)
                    # Attribute dicts of the four edges around b.
                    l_ab = ins[0][3]
                    l_cb = ins[1][3]
                    l_ba = outs[0][3]
                    l_bc = outs[1][3]
                    # The new edges start as deep copies of the edges INTO b;
                    # only length and geometry are overridden below.
                    l_ac = copy.deepcopy(l_ab)
                    l_ca = copy.deepcopy(l_cb)
                    gac = MultiLineString([l_ab['geometry'], l_bc['geometry']])
                    gac = ops.linemerge(gac)
                    gca = MultiLineString([l_cb['geometry'], l_ba['geometry']])
                    gca = ops.linemerge(gca)
                    l_ac['length'] = l_ab['length'] + l_bc['length']
                    l_ac['geometry'] = gac
                    l_ca['length'] = l_cb['length'] + l_ba['length']
                    l_ca['geometry'] = gca
                    # Removing b also drops its four edges; then connect a and
                    # c directly in both directions.
                    G.remove_node(n)
                    G.add_edge(a, c, **l_ac)
                    G.add_edge(c, a, **l_ca)
                    to_remove.append(b)
        if len(to_remove) == 0:
            break
    print("== Removed")
    print_graph_info(G)
    return G
def write_dot(graph: nx.MultiDiGraph, target: Optional[Union[PathLike, str]] = 'base_graph.dot', style: Optional[Dict] = None,
              highlight: Optional[Union[Node, Sequence[Node]]] = None, highlight_path: Optional[Tuple[Node, Node]] = None,
              record: Union[bool, str] = 'auto', edge_labels: bool = True) -> AGraph:
    """
    Writes a properly styled graphviz file for the given graph.

    The input graph is not modified: all styling happens on a copy, which is
    then passed through `simplify_graph` and converted to an AGraph.

    Args:
        graph: the subgraph to draw
        target: dot file that should be written, may be a Path. If None, nothing is written but the AGraph
            is still returned
        style (dict): rules for styling the graph. Defaults to ``config.styles``
        highlight: a node or a sequence of nodes to which the ``highlight`` node style is applied.
        highlight_path: a tuple of nodes; they are added to the highlighted nodes and used to look up
            the edge(s) that receive the ``highlight`` edge style
        record: record in the queue for `render_all`. If ``"auto"`` and ``config.render_node_limit`` is
            not negative, this depends on the graph size
        edge_labels (bool): Should we paint edge labels?

    Returns:
        the AGraph, can be used to write the thing yourself.
    """
    if style is None:
        style = config.styles
    logger.info('Writing %s ...', target)
    try:
        # 'auto' becomes a real bool: render only graphs below the node limit.
        if record == 'auto' and config.render_node_limit >= 0:
            record = graph.number_of_nodes() < config.render_node_limit
            if not record:
                logger.info('%s is too large to be rendered automatically (%d nodes)', target, graph.number_of_nodes())
    except Exception as e:
        logger.warning('Auto edges limit configuration error: %s', e)

    # Everything below works on a copy so that the caller's graph stays untouched.
    vis = graph.copy()
    add_timeline_edges(vis)
    # Reference nodes get a link built from their file name.
    for node in vis:
        if isinstance(node, Reference):
            vis.nodes[node]['URL'] = node.filename.stem
            vis.nodes[node]['target'] = '_top'

    # single node highlight
    if highlight is not None and not isinstance(highlight, Sequence):
        highlight = [highlight]

    if highlight_path is not None:
        # The path's end nodes are highlighted as well.
        if highlight is None:
            highlight = list(highlight_path)
        else:
            highlight = list(highlight)
            highlight.extend(highlight_path)
        if 'highlight' in style['edge']:
            try:
                # NOTE(review): this indexes the edge view with the whole
                # ``highlight`` list rather than with ``highlight_path`` --
                # confirm this is the intended lookup.
                vis.edges[highlight].update(style['edge']['highlight'])
            except KeyError:
                logger.warning('Highlight key %s not found while writing %s', highlight, target)

    if highlight is not None:
        if not isinstance(highlight, Sequence):
            highlight = [highlight]
        # Iterate over a snapshot because the list is extended inside the loop:
        # the other half of a split reference is highlighted too.
        for node in list(highlight):
            if isinstance(node, SplitReference) and node.other:
                highlight.append(node.other)
        if 'highlight' in style['node']:
            for highlight_node in highlight:
                try:
                    vis.nodes[highlight_node].update(style['node']['highlight'])
                except KeyError:
                    logger.warning('Highlight key %s not found while writing %s', highlight, target)

    # noinspection PyTypeChecker
    simplified: MultiDiGraph = simplify_graph(vis)

    # now style by kind:
    if 'edge' in style:
        for u, v, k, attr in simplified.edges(data=True, keys=True):
            kind = attr.get('kind', None)
            if attr.get('delete', False):
                attr['URL'] = pathlink(u, v).stem
                attr['target'] = '_top'
            if kind in style['edge']:
                simplified.edges[u, v, k].update(style['edge'][kind])
            # Additionally apply the style of every truthy attribute that has
            # a style rule of its own.
            for styled_attr in attr.keys() & style['edge']:
                if attr[styled_attr]:
                    simplified.edges[u, v, k].update(style['edge'][styled_attr])
            if 'topo' in attr and 'constraint' in attr:
                del attr['constraint']

    if 'node' in style:
        for node, attr in simplified.nodes(data=True):
            kind = attr.get('kind', None)
            if kind in style['node']:
                simplified.nodes[node].update(style['node'][kind])
            for styled_attr in attr.keys() & style['node']:
                if attr[styled_attr]:
                    attr.update(style['node'][styled_attr])

    if not edge_labels:
        for u, v, k, attr in simplified.edges(data=True, keys=True):
            if 'label' in attr:
                del attr['label']

    if config.clean_gv_files:
        remove_nondot_keys(simplified, inplace=True)

    agraph: AGraph = nx.nx_agraph.to_agraph(simplified)
    # Graph-wide defaults.
    agraph.edge_attr['fontname'] = 'Ubuntu derivative Faust'
    agraph.edge_attr['fontsize'] = 8
    agraph.node_attr['fontname'] = 'Ubuntu derivative Faust'
    agraph.node_attr['fontsize'] = 12
    agraph.graph_attr['rankdir'] = 'LR'
    agraph.graph_attr['stylesheet'] = '/css/webfonts.css'

    # extract the timeline
    timeline = agraph.add_subgraph([node for node in agraph.nodes() if node.attr['kind'] == 'date'],
                                   name='cluster_timeline')

    if 'timeline' in style:
        timeline_style = style['timeline']
        for t in ('graph', 'edge', 'node'):
            if t in timeline_style:
                getattr(timeline, t + '_attr', {}).update(timeline_style[t])
                logger.debug('timeline style: %s = %s', t, getattr(timeline, t + '_attr').items())  ## Doesn’t work

    if target is not None:
        # Make sure the output directory exists before writing.
        target_path = Path(target)
        target_path.parent.mkdir(exist_ok=True, parents=True)
        dotfilename = str(target)
        agraph.write(dotfilename)
        if record:
            _render_queue.append(dotfilename)
        else:
            logger.warning('%s has not been queued for rendering', dotfilename)

    return agraph
def MakeAliasSetGraphs(
    g: nx.MultiDiGraph,
    bytecode: str,
    n: typing.Optional[int] = None,
    false=False,
    true=True,
) -> typing.Iterable[nx.MultiDiGraph]:
    """Yield up to `n` copies of the graph, each annotated with one alias set.

    Args:
      g: The unlabelled input graph. It is copied once per produced graph.
      bytecode: The bytecode which produced the input graph.
      n: The maximum number of graphs to produce. Multiple graphs are produced
        by selecting different root pointers for alias sets. If `n` is
        provided, the number of graphs generated will be in the range
        1 <= x <= min(num_alias_sets, n), where num_alias_sets is the number
        of alias sets larger than --alias_set_min_size. If n is None,
        num_alias_sets graphs will be produced.
      false: TODO(github.com/ChrisCummins/ProGraML/issues/2): Unused. This
        method is hardcoded to use 3-class 1-hots.
      true: TODO(github.com/ChrisCummins/ProGraML/issues/2): Unused. This
        method is hardcoded to use 3-class 1-hots.

    Returns:
      A generator of annotated graphs. Each graph is annotated by
      AnnotateAliasSet() and has a 'data_flow_max_steps_required' attribute
      set to that call's return value.

    Raises:
      ValueError: If an alias set is neither "may alias" nor "must alias".
    """
    # TODO(github.com/ChrisCummins/ProGraML/issues/2): Replace true/false args
    # with a list of class values for all graph annotator functions.
    del false, true

    # Build the alias sets for the given bytecode.
    alias_sets_by_function = opt_util.GetAliasSetsByFunction(bytecode)

    functions_in_graph = set()
    for _, function in g.nodes(data="function"):
        # Not all nodes have a 'function' attribute, e.g. the magic root node.
        if function:
            functions_in_graph.add(function)

    # Silently drop alias sets for functions which don't exist in the graph.
    alias_sets_to_delete = [
        function
        for function in alias_sets_by_function
        if function not in functions_in_graph
    ]
    if alias_sets_to_delete:
        for function in alias_sets_to_delete:
            del alias_sets_by_function[function]
        app.Log(
            2,
            "Removed %d alias sets generated from bytecode but not found in "
            "graph: %s",
            len(alias_sets_to_delete),
            alias_sets_to_delete,
        )

    # Flatten the alias set dictionary and ignore any alias sets that are
    # smaller than the threshold size.
    candidates: typing.List[typing.Tuple[str, opt_util.AliasSet]] = [
        (function, alias_set)
        for function, alias_sets in alias_sets_by_function.items()
        for alias_set in alias_sets
        if len(alias_set.pointers) >= FLAGS.alias_set_min_size
    ]

    # Select `n` random alias sets to generate labelled graphs for.
    if n and len(candidates) > n:
        random.shuffle(candidates)
        candidates = candidates[:n]

    for function, alias_set in candidates:
        # Translate the must/may alias property into 3-class 1-hot labels.
        # Fresh arrays are built for every graph.
        if alias_set.type == "may alias":
            positive_label = np.array([0, 1, 0], np.int64)
        elif alias_set.type == "must alias":
            positive_label = np.array([0, 0, 1], np.int64)
        else:
            raise ValueError(f"Unknown alias set type `{alias_set.type}`")
        negative_label = np.array([1, 0, 0], np.int64)

        # Transform pointer name into the node names produced by the
        # ComposeGraphs() method in the graph builder. When we compose
        # multiple graphs, we add the function name as a prefix, and
        # `_operand` suffix to identifier nodes.
        pointers = [
            f"{function}_{pointer.identifier}_operand"
            for pointer in alias_set.pointers
        ]
        root_pointer = random.choice(pointers)

        labelled = g.copy()
        labelled.data_flow_max_steps_required = AnnotateAliasSet(
            labelled,
            root_pointer,
            pointers,
            false=negative_label,
            true=positive_label,
        )
        yield labelled
def simplify_graph(G_orig: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Collapse chains of interstitial nodes into single edges.

    Each path returned by ``ox.simplify.get_paths_to_simplify`` whose edges
    all share one mode is replaced by one edge from the path's first to its
    last node. The new edge keeps the ``mode`` of the first edge, the summed
    ``length`` and a ``LineString`` through the node coordinates; per-edge
    values of any other attribute are kept as lists. Afterwards, wherever
    several parallel edges connect the same ordered node pair, only the one
    with the greatest ``length`` is kept.

    :param G_orig: the graph to simplify; it is copied, never mutated
    :return: the simplified copy
    """
    # Note: This operation borrows heavily from the operation of
    #       the same name in OSMnx, as it existed in this state/commit:
    #       github.com/gboeing/osmnx/blob/
    #       c5916aab5c9b94c951c8fb1964c841899c9467f8/osmnx/simplify.py
    #       Function on line 203

    # Prevent upstream mutation, always copy
    G = G_orig.copy()

    # Used to track updates to execute
    all_nodes_to_remove = []
    all_edges_to_add = []

    # TODO: Improve this method to not produce any mixed mode path
    #       removal proposals

    # Utilize the recursive function from OSMnx that identifies paths based
    # on isolated successor nodes
    paths_to_consider = ox.simplify.get_paths_to_simplify(G)

    for path in paths_to_consider:
        # If the path is not all one mode of travel, skip the
        # proposed simplification
        if not _path_has_consistent_mode_type(G, path):
            continue

        # Collect, per attribute name, the values of every edge on the path
        edge_attributes = {}
        for u, v in zip(path[:-1], path[1:]):
            # Should not be multiple edges between interstitial nodes
            if G.number_of_edges(u, v) != 1:
                log(('Multiple edges between "{}" and "{}" '
                     'found when simplifying').format(u, v))

            # We ask for the 0th edge as we assume there is only one
            edge = G.edges[u, v, 0]
            for key in edge:
                edge_attributes.setdefault(key, []).append(edge[key])

        # Note: In peartree, we opt to not preserve any other elements;
        #       we only keep length, mode and - in the case of simplified
        #       geometries - the shape of the simplified route
        edge_attributes['mode'] = edge_attributes['mode'][0]
        edge_attributes['length'] = sum(edge_attributes['length'])

        # Construct the geometry from the coordinates of the path's nodes
        points_array = [
            Point((G.nodes[node]['x'], G.nodes[node]['y'])) for node in path]
        edge_attributes['geometry'] = LineString(points_array)

        # Queue the changes; the graph is only modified once all paths
        # have been inspected
        all_nodes_to_remove.extend(path[1:-1])
        all_edges_to_add.append((path[0], path[-1], edge_attributes))

    # For each edge to add in the list we assembled, create a new edge between
    # the origin and destination
    for origin, destination, attr_dict in all_edges_to_add:
        G.add_edge(origin, destination, **attr_dict)

    # Remove all the interstitial nodes between the new edges, which will also
    # knock out the related edges from the graph
    G.remove_nodes_from(set(all_nodes_to_remove))

    # TODO: This step could be parameterized / made optional
    # A final step that cleans out all duplicate edges (not desired in a
    # simplified network). Parallel edges are grouped by node pair in a single
    # pass instead of re-scanning the full edge list once per pair.
    parallel_edges = {}
    for fr, to, edge in G.edges(data=True):
        if G.number_of_edges(fr, to) > 1:
            parallel_edges.setdefault((fr, to), []).append(edge)

    for (fr, to), edges in parallel_edges.items():
        # Of each set of parallel edges, the one with the greatest length stays
        keep = max(edges, key=lambda x: x['length'])

        # Drop all the edges
        G.remove_edges_from([(fr, to)] * len(edges))

        # Then just re-add the one that we want
        G.add_edge(fr, to, **keep)

    return G
def coalesce(G_orig: nx.MultiDiGraph, resolution: float) -> nx.MultiDiGraph:
    """
    Merge nodes that fall into the same ``resolution``-sized grid cell.

    The graph is first run through ``simplify_graph``. Node coordinates are
    then snapped to multiples of ``resolution``; all nodes sharing a snapped
    coordinate are replaced by one synthetic node named
    ``'<graph name>_<counter>'`` that carries the snapped ``x``/``y`` and the
    mean ``boarding_cost`` of its members. Between two synthetic nodes a
    single edge is kept, taken from an original edge with the minimum
    ``length``; only ``length`` and ``mode`` are carried over. Edges that
    would start and end at the same synthetic node are dropped, and synthetic
    nodes left without any edge do not appear in the result.

    :param G_orig: the graph to coalesce; it is copied, never mutated
    :param resolution: grid size used for snapping; must be >= 1
    :return: the coalesced graph
    :raises ValueError: if ``resolution`` is smaller than 1
    """
    # Make sure our resolution satisfies basic requirement
    if resolution < 1:
        raise ValueError('Resolution parameters must be >= 1')

    # Avoid upstream mutation of the graph
    G = G_orig.copy()

    # Before we continue, attempt to simplfy the current network
    # such that we won't generate isolated nodes that become disconnected
    # from key coalesced nodes (because too many intermediary nodes)
    G = simplify_graph(G)

    # Group node ids by snapped x, then snapped y. The nesting (rather than a
    # flat (x, y) key) fixes the order in which synthetic names are handed out.
    grouped = {}
    for node_id, data in G.nodes(data=True):
        x = round(data['x'] / resolution) * resolution
        y = round(data['y'] / resolution) * resolution
        grouped.setdefault(x, {}).setdefault(y, []).append(node_id)

    # Name each group and remember which synthetic node replaces which
    # original node. The members of a group are known right here, so the
    # average boarding cost is computed without searching for them again.
    counter = 0
    new_node_coords = {}
    lookup = {}
    for x, groups_by_y in grouped.items():
        for y, members in groups_by_y.items():
            new_node_name = '{}_{}'.format(G.name, counter)
            # Update the counter so each new synthetic
            # node name will be different
            counter += 1

            boarding_costs = [G.nodes[m]['boarding_cost'] for m in members]
            new_node_coords[new_node_name] = {
                'x': x,
                'y': y,
                'boarding_cost': np.array(boarding_costs).mean()}
            for member in members:
                lookup[member] = new_node_name

    # Minimum edge length per (from, to) pair of synthetic nodes
    replacement_edges_fr = []
    replacement_edges_to = []
    replacement_edges_len = []
    for n1, n2, edge in G.edges(data=True):
        replacement_edges_fr.append(lookup[n1])
        replacement_edges_to.append(lookup[n2])
        replacement_edges_len.append(edge['length'])
    edges_df = pd.DataFrame({
        'fr': replacement_edges_fr,
        'to': replacement_edges_to,
        'len': replacement_edges_len})
    min_edges = edges_df.groupby(['fr', 'to'], sort=False)['len'].min()

    # Pick one minimum-length original edge per pair of synthetic nodes.
    # Nothing is added to G yet (it is being iterated), so the pairs that
    # are already covered have to be tracked separately.
    edges_to_add = []
    covered_pairs = set()
    for n1, n2, edge in G.edges(data=True):
        rn1 = lookup[n1]
        rn2 = lookup[n2]

        # Avoid edges that now connect to the same node, and pairs that
        # another minimum edge has already supplied
        if rn1 == rn2 or (rn1, rn2) in covered_pairs:
            continue

        # Skip this edge if it is not the minimum edge length
        if edge['length'] != min_edges.loc[rn1, rn2]:
            continue

        covered_pairs.add((rn1, rn2))
        edges_to_add.append((rn1, rn2, edge))

    # Add the new edges
    for rn1, rn2, edge in edges_to_add:
        G.add_edge(rn1, rn2, length=edge['length'], mode=edge['mode'])

    # Now we can remove all nodes that predated the coalescing
    # operations; note that this will also drop all of their edges
    G.remove_nodes_from(list(lookup))

    # Also make sure to update the new nodes with their summary
    # stats and locational data
    for name, attrs in new_node_coords.items():
        # Some nodes are completely dropped in this operation
        # with no replacement edges (e.g. nodes that would have
        # connected to another node that ended up getting coalesced
        # into the same single node)
        if name in G:
            G.nodes[name].update(attrs)

    return G
class MossNet:
    '''A network of MOSS results backed by a NetworkX ``MultiDiGraph``

    Every link found in the MOSS results dictionary is stored as one parallel
    edge ``u -> v`` whose data is kept under the ``'attr_dict'`` edge attribute
    as ``{'files': ..., 'left': ..., 'right': ...}``.
    '''

    def __init__(self, moss_results_dict):
        '''Create a ``MossNet`` object from a 3D dictionary of downloaded MOSS results

        Args:
            ``moss_results_dict`` (``dict``): A 3D dictionary of downloaded MOSS results
            (``moss_results_dict[u][v][files] = (left, right)``). A ``MultiDiGraph`` is
            adopted as-is (it is not copied), and a ``str`` is treated as the path of a
            pickled dictionary (read through gzip if the path ends with ``.gz``)

        Raises:
            ``ValueError``: If the path given as a ``str`` could not be loaded

            ``TypeError``: If the (loaded) object is not a 3D dictionary whose leaves unpack into two items
        '''
        if isinstance(moss_results_dict, MultiDiGraph):
            self.graph = moss_results_dict
            return
        if isinstance(moss_results_dict, str):
            path = moss_results_dict
            opener = gopen if path.lower().endswith('.gz') else open
            try:
                # NOTE(review): unpickling can execute arbitrary code, so only
                # load files that come from a trusted source
                with opener(path, 'rb') as f:
                    moss_results_dict = load(f)
            except Exception as e:
                raise ValueError("Unable to load dictionary: %s" % path) from e
        bad_format = "moss_results_dict must be a 3D dictionary of MOSS results"
        if not isinstance(moss_results_dict, dict):
            raise TypeError(bad_format)
        self.graph = MultiDiGraph()
        for u, u_edges in moss_results_dict.items():
            if not isinstance(u_edges, dict):
                raise TypeError(bad_format)
            for v, u_v_links in u_edges.items():
                if not isinstance(u_v_links, dict):
                    raise TypeError(bad_format)
                for f, link in u_v_links.items():
                    try:
                        left, right = link
                    except (TypeError, ValueError) as e:
                        raise TypeError(bad_format) from e
                    self.graph.add_edge(u, v, attr_dict={'files': f, 'left': left, 'right': right})

    def save(self, outfile):
        '''Save this ``MossNet`` object as a 3D dictionary of MOSS results

        Args:
            ``outfile`` (``str``): The desired output file's path (gzip-compressed if it ends with ``.gz``)
        '''
        out = dict()
        for u in self.graph.nodes:
            u_edges = dict()
            out[u] = u_edges
            for v in self.graph.neighbors(u):
                u_v_links = dict()
                u_edges[v] = u_v_links
                u_v_edge_data = self.graph.get_edge_data(u, v)
                for k in u_v_edge_data:
                    edge = u_v_edge_data[k]['attr_dict']
                    u_v_links[edge['files']] = (edge['left'], edge['right'])
        if outfile.lower().endswith('.gz'):
            f = gopen(outfile, mode='wb', compresslevel=9)
        else:
            f = open(outfile, 'wb')
        # the context manager closes the file even if pickling fails
        with f:
            pkldump(out, f)

    def __add__(self, o):
        '''Return a new ``MossNet`` holding the nodes and edges of both operands

        Raises:
            ``TypeError``: If ``o`` is not a ``MossNet``
        '''
        if not isinstance(o, MossNet):
            raise TypeError("unsupported operand type(s) for +: 'MossNet' and '%s'" % type(o).__name__)
        g = MultiDiGraph()
        g.add_edges_from(list(self.graph.edges(data=True)) + list(o.graph.edges(data=True)))
        g.add_nodes_from(list(self.graph.nodes(data=True)) + list(o.graph.nodes(data=True)))
        return MossNet(g)

    def get_networkx(self):
        '''Return a NetworkX ``MultiDiGraph`` equivalent to this ``MossNet`` object

        Returns:
            ``MultiDiGraph``: A copy of the NetworkX ``MultiDiGraph`` behind this ``MossNet`` object
        '''
        return self.graph.copy()

    def get_nodes(self):
        '''Returns a ``set`` of node labels in this ``MossNet`` object

        Returns:
            ``set``: The node labels in this ``MossNet`` object
        '''
        return set(self.graph.nodes)

    def get_pair(self, u, v, style='tuples'):
        '''Returns the links between nodes ``u`` and ``v``

        Args:
            ``u`` (``str``): A node label

            ``v`` (``str``): A node label not equal to ``u``

            ``style`` (``str``): The representation of a given link

            * ``"tuples"``: Links are ``((u_percent, u_html), (v_percent, v_html))`` tuples

            * ``"html"``: Links are HTML representation (one HTML for all links)

            * ``"htmls"``: Links are HTML representations (one HTML per link)

        Returns:
            ``dict``: The links between ``u`` and ``v`` (keys are filenames). For
            ``style="html"`` a single ``str`` is returned instead. Two existing nodes
            without any link yield an empty result

        Raises:
            ``ValueError``: If ``style`` is invalid, ``u == v``, or a node does not exist
        '''
        if style not in {'tuples', 'html', 'htmls'}:
            raise ValueError("Invalid link style: %s" % style)
        if u == v:
            raise ValueError("u and v cannot be equal: %s" % u)
        for node in [u, v]:
            if not self.graph.has_node(node):
                raise ValueError("Nonexistant node: %s" % node)
        # get_edge_data gives None when there is no u -> v edge
        links = self.graph.get_edge_data(u, v) or dict()
        out = dict()
        for k in sorted(links.keys(), key=lambda x: links[x]['attr_dict']['files']):
            d = links[k]['attr_dict']
            u_fn, v_fn = d['files']
            u_percent, u_html = d['left']
            v_percent, v_html = d['right']
            if style == 'tuples':
                out[(u_fn, v_fn)] = ((u_percent, u_html), (v_percent, v_html))
            elif style in {'html', 'htmls'}:
                out[(u_fn, v_fn)] = '<html><table style="width:100%%" border="1"><tr><td colspan="2"><center><b>%s/%s --- %s/%s</b></center></td></tr><tr><td>%s (%d%%)</td><td>%s (%d%%)</td></tr><tr><td><pre>%s</pre></td><td><pre>%s</pre></td></tr></table></html>' % (u, u_fn, v, v_fn, u, u_percent, v, v_percent, u_html, v_html)
        if style == 'html':
            # merge the per-link documents into a single HTML document
            out = '<html>' + '<br>'.join(out[fns].replace('<html>', '').replace('</html>', '') for fns in sorted(out.keys())) + '</html>'
        return out

    def get_summary(self, style='html'):
        '''Returns a summary of this ``MossNet``

        Args:
            ``style`` (``str``): The representation of this ``MossNet`` (only ``"html"`` is supported)

        Returns:
            ``str``: An HTML table with one row per link, sorted by decreasing maximum percentage

        Raises:
            ``ValueError``: If ``style`` is invalid
        '''
        if style not in {'html'}:
            raise ValueError("Invalid summary style: %s" % style)
        matches = list() # list of (u_path, u_percent, v_path, v_percent) tuples
        for u, v in self.traverse_pairs(order=None):
            links = self.graph.get_edge_data(u, v)
            for k in links:
                d = links[k]['attr_dict']
                u_fn, v_fn = d['files']
                u_percent, u_html = d['left']
                v_percent, v_html = d['right']
                matches.append(('%s/%s' % (u, u_fn), u_percent, '%s/%s' % (v, v_fn), v_percent))
        matches.sort(reverse=True, key=lambda x: max(x[1], x[3]))
        return '<html><table style="width:100%%" border="1">%s</table></html>' % ''.join(('<tr><td>%s (%d%%)</td><td>%s (%d%%)</td></tr>' % tup) for tup in matches)

    def num_links(self, u, v):
        '''Returns the number of links between ``u`` and ``v``

        Args:
            ``u`` (``str``): A node label

            ``v`` (``str``): A node label not equal to ``u``

        Returns:
            ``int``: The number of links between ``u`` and ``v`` (``0`` if there are none)

        Raises:
            ``ValueError``: If a node does not exist
        '''
        for node in [u, v]:
            if not self.graph.has_node(node):
                raise ValueError("Nonexistant node: %s" % node)
        edge_data = self.graph.get_edge_data(u, v)
        # get_edge_data gives None (not an empty dict) when there is no edge
        return 0 if edge_data is None else len(edge_data)

    def num_nodes(self):
        '''Returns the number of nodes in this ``MossNet`` object

        Returns:
            ``int``: The number of nodes in this ``MossNet`` object
        '''
        return self.graph.number_of_nodes()

    def num_edges(self):
        '''Returns the number of (undirected) edges in this ``MossNet`` object (including parallel edges)

        Returns:
            ``int``: The number of (undirected) edges in this ``MossNet`` object (including parallel edges)
        '''
        # halves the directed edge count
        return self.graph.number_of_edges() // 2

    def outlier_pairs(self):
        '''Predict which student pairs are outliers (i.e., too many problem similarities). The distribution of number of links between student pairs (i.e., histogram) is modeled as y = A/(B^x), where x = a number of links, and y = the number of student pairs with that many links

        Returns:
            ``list`` of ``tuple``: ``(num_links, u, v)`` tuples of the student pairs expected
            to be outliers (in decreasing order of number of links). The list is empty when
            the model cannot be fit (no pairs, no two consecutive non-increasing histogram
            bins, or no decay at all)
        '''
        links = dict() # key = number of links; value = set of student pairs that have that number of links
        for u, v in self.traverse_pairs():
            n = self.num_links(u, v)
            if n not in links:
                links[n] = set()
            links[n].add((u, v))
        if not links:
            return list()
        # the histogram is indexed by number of links, so its range comes from
        # the keys (not from the sizes of the buckets)
        min_links = min(links)
        max_links = max(links)
        mult = list()
        for i in range(min_links, max_links):
            if i not in links or i+1 not in links or len(links[i+1]) > len(links[i]):
                break
            mult.append(float(len(links[i]))/len(links[i+1]))
        if not mult:
            return list()
        B = sum(mult)/len(mult)
        if B <= 1:
            # no decay: log(B) would be 0 and no finite cutoff exists
            return list()
        A = len(links[min_links]) * (B**min_links)
        n_cutoff = log(A)/log(B) # number of links at which the model predicts a single pair
        out = list()
        for n in sorted(links.keys(), reverse=True):
            if n < n_cutoff:
                break
            for u, v in links[n]:
                out.append((n, u, v))
        return out

    def traverse_pairs(self, order='descending'):
        '''Iterate over student pairs

        Each linked pair is yielded once as ``(u, v)`` with ``u < v``.

        Args:
            ``order`` (``str``): Order to sort pairs in iteration

            * ``None`` to not sort (may be faster for large/dense graphs)

            * ``"ascending"`` to sort in ascending order of number of links

            * ``"descending"`` to sort in descending order of number of links

        Raises:
            ``ValueError``: If ``order`` is invalid (raised when iteration starts)
        '''
        if order not in {None, 'None', 'none', 'ascending', 'descending'}:
            raise ValueError("Invalid order: %s" % order)
        pairs = [(u, v) for u in self.graph.nodes for v in self.graph.neighbors(u) if u < v]
        if order == 'ascending':
            pairs.sort(key=lambda x: len(self.graph.get_edge_data(x[0], x[1])))
        elif order == 'descending':
            pairs.sort(key=lambda x: len(self.graph.get_edge_data(x[0], x[1])), reverse=True)
        yield from pairs

    def export(self, outpath, style='html', gte=0, verbose=False):
        '''Export the links in this ``MossNet`` in the specified style

        Args:
            ``outpath`` (``str``): Path to desired output folder/file (must not exist yet)

            ``style`` (``str``): Desired output style

            * ``"dot"`` to export as a GraphViz DOT file

            * ``"gexf"`` to export as a Graph Exchange XML Format (GEXF) file

            * ``"html"`` to export one HTML file per pair

            ``gte`` (``int``): The minimum number of links for an edge to be exported

            ``verbose`` (``bool``): ``True`` to show verbose messages, otherwise ``False``

        Raises:
            ``ValueError``: If ``style`` is invalid, ``outpath`` exists, or ``gte`` is negative

            ``TypeError``: If ``gte`` is not an ``int``

            ``RuntimeError``: If a DOT/GEXF export is requested but seaborn cannot be imported
        '''
        if style not in {'dot', 'gexf', 'html'}:
            raise ValueError("Invalid export style: %s" % style)
        if isdir(outpath) or isfile(outpath):
            raise ValueError("Output path exists: %s" % outpath)
        if not isinstance(gte, int):
            raise TypeError("'gte' must be an 'int', but you provided a '%s'" % type(gte).__name__)
        if gte < 0:
            raise ValueError("'gte' must be non-negative, but yours was %d" % gte)

        # export as folder of HTML files
        if style == 'html':
            summary = self.get_summary(style='html')
            pairs = list(self.traverse_pairs(order=None))
            makedirs(outpath)
            with open('%s/summary.html' % outpath, 'w') as f:
                f.write(summary)
            for i, pair in enumerate(pairs):
                if verbose:
                    print("Exporting pair %d of %d..." % (i+1, len(pairs)), end='\r')
                u, v = pair
                curr_num_links = self.num_links(u, v)
                if curr_num_links < gte:
                    continue
                with open("%s/%d_%s_%s.html" % (outpath, curr_num_links, u, v), 'w') as f:
                    f.write(self.get_pair(u, v, style='html'))
            if verbose:
                print("Successfully exported %d pairs" % len(pairs))

        # export as GraphViz DOT or a GEXF file
        elif style in {'dot', 'gexf'}:
            if verbose:
                print("Computing colors...", end='')
            max_links = max(self.num_links(u, v) for u, v in self.traverse_pairs())
            try:
                from seaborn import color_palette
            except Exception as e:
                raise RuntimeError("Exporting as a DOT or GEXF file currently requires seaborn") from e
            # one color per possible number of links (index = number of links - 1)
            pal = color_palette("Reds", max_links)
            if verbose:
                print(" done")
                print("Computing node information...", end='')
            nodes = list(self.get_nodes())
            index = {u: i for i, u in enumerate(nodes)}
            if verbose:
                print(" done")
                print("Writing output file...", end='')
            with open(outpath, 'w') as outfile:
                if style == 'dot':
                    pal = [str(c).upper() for c in pal.as_hex()]
                    outfile.write("graph G {\n")
                    for u in nodes:
                        outfile.write(' node%d[label="%s"]\n' % (index[u], u))
                    for u, v in self.traverse_pairs():
                        curr_num_links = self.num_links(u, v)
                        if curr_num_links < gte:
                            continue
                        outfile.write(' node%d -- node%d[color="%s"]\n' % (index[u], index[v], pal[curr_num_links-1]))
                    outfile.write('}\n')
                elif style == 'gexf':
                    from datetime import datetime
                    pal = [(int(255*c[0]), int(255*c[1]), int(255*c[2])) for c in pal]
                    outfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                    outfile.write('<gexf xmlns="http://www.gexf.net/1.3draft" xmlns:viz="http://www.gexf.net/1.3draft/viz">\n')
                    outfile.write(' <meta lastmodifieddate="%s">\n' % datetime.today().strftime('%Y-%m-%d'))
                    outfile.write(' <creator>MossNet</creator>\n')
                    outfile.write(' <description>A MossNet network exported to GEXF</description>\n')
                    outfile.write(' </meta>\n')
                    outfile.write(' <graph mode="static" defaultedgetype="undirected">\n')
                    outfile.write(' <nodes>\n')
                    for u in nodes:
                        outfile.write(' <node id="%d" label="%s"/>\n' % (index[u], u))
                    outfile.write(' </nodes>\n')
                    outfile.write(' <edges>\n')
                    for i, pair in enumerate(self.traverse_pairs()):
                        u, v = pair
                        curr_num_links = self.num_links(u, v)
                        # honour the same threshold as the DOT export
                        if curr_num_links < gte:
                            continue
                        color = pal[curr_num_links-1]
                        outfile.write(' <edge id="%d" source="%d" target="%d">\n' % (i, index[u], index[v]))
                        outfile.write(' <viz:color r="%d" g="%d" b="%d"/>\n' % (color[0], color[1], color[2]))
                        outfile.write(' </edge>\n')
                    outfile.write(' </edges>\n')
                    outfile.write(' </graph>\n')
                    outfile.write('</gexf>\n')
            if verbose:
                print(" done")
def coalesce(
        G_orig: nx.MultiDiGraph,
        resolution: float,
        edge_summary_method=lambda x: x.max(),
        boarding_cost_summary_method=lambda x: x.mean(),
) -> nx.MultiDiGraph:
    """
    Merge nodes that snap to the same grid point into one synthetic node.

    Node ``x``/``y`` values are rounded to the nearest multiple of
    ``resolution``; every group of nodes sharing a rounded coordinate is
    replaced by a single node named ``'{graph name}_{counter}'``. Edges are
    re-created between the synthetic nodes (at most one per ordered pair,
    self-loops dropped), and all original nodes are removed.

    :param G_orig: Graph to coalesce; it is copied and not mutated. Nodes must
        carry ``x``, ``y``, ``boarding_cost`` and ``modes`` attributes, and
        edges ``length`` and ``mode`` attributes.
    :param resolution: Grid spacing used for rounding; must be >= 1.
    :param edge_summary_method: Callable applied to the pandas group-by of
        edge lengths (grouped by new from/to node) to produce one length per
        new node pair.
    :param boarding_cost_summary_method: Callable applied to a numpy array of
        the grouped nodes' boarding costs to produce one boarding cost.
    :return: The coalesced graph (built from the result of
        ``simplify_graph`` on the copy).
    :raises ValueError: If ``resolution`` is less than 1.
    """
    # Note: Feature is experimental. For more details, see
    # https://github.com/kuanb/peartree/issues/126
    warnings.warn(('coalesce method is experimental - method risks '
                   'deformation of relative graph structure'))

    # Make sure our resolution satisfies basic requirement
    if resolution < 1:
        raise ValueError('Resolution parameters must be >= 1')

    # Avoid upstream mutation of the graph
    G = G_orig.copy()

    # Before we continue, attempt to simplify the current network
    # such that we won't generate isolated nodes that become disconnected
    # from key coalesced nodes (because too many intermediary nodes)
    G = simplify_graph(G)

    # Bucket every node id under its rounded (x, y) coordinate
    nodes_by_xy = {}
    for i, node in G.nodes(data=True):
        x = round(node['x'] / resolution) * resolution
        y = round(node['y'] / resolution) * resolution
        nodes_by_xy.setdefault(x, {}).setdefault(y, []).append(i)

    # Assign a synthetic node name to each bucket and keep both directions
    # of the crosswalk: original id -> new name, and new name -> original ids
    new_node_coords = {}
    lookup = {}
    members = {}
    counter = 0
    for x, by_y in nodes_by_xy.items():
        for y, original_ids in by_y.items():
            new_node_name = '{}_{}'.format(G.name, counter)
            new_node_coords[new_node_name] = {'x': x, 'y': y}
            members[new_node_name] = original_ids
            for n in original_ids:
                lookup[n] = new_node_name

            # Update the counter so each new synthetic
            # node name will be different
            counter += 1

    # Summarize boarding costs and modes for each node grouping
    for nni, original_ids in members.items():
        boarding_costs = []
        all_modes_related = []
        for i in original_ids:
            specific_node = G.nodes[i]
            boarding_costs.append(specific_node['boarding_cost'])
            all_modes_related.extend(specific_node['modes'])

        new_node_coords[nni]['boarding_cost'] = (
            boarding_cost_summary_method(np.array(boarding_costs)))

        # Unique modes, sorted
        new_node_coords[nni]['modes'] = sorted(set(all_modes_related))

    # First step to creating a list of replacement edges
    replacement_edges_fr = []
    replacement_edges_to = []
    replacement_edges_len = []
    for n1, n2, edge in G.edges(data=True):
        replacement_edges_fr.append(lookup[n1])
        replacement_edges_to.append(lookup[n2])
        replacement_edges_len.append(edge['length'])

    edges_df = pd.DataFrame({
        'fr': replacement_edges_fr,
        'to': replacement_edges_to,
        'len': replacement_edges_len})

    # Group by the edge pattern (from -> to) and summarize the lengths
    # TODO: Also group on modes
    edge_groups = edges_df.groupby(['fr', 'to'], sort=False)
    processed_edge_costs = edge_summary_method(edge_groups['len'])

    # Second step; collected in a list first because the graph cannot be
    # modified while its edges are being iterated
    edges_to_add = []
    for n1, n2, edge in G.edges(data=True):
        ref_n1 = lookup[n1]
        ref_n2 = lookup[n2]
        summarized_length = processed_edge_costs.loc[ref_n1, ref_n2]
        edges_to_add.append((ref_n1, ref_n2, summarized_length, edge['mode']))

    # Add the new edges to graph
    for n1, n2, length, mode in edges_to_add:
        # Skip pairs already added and edges that now connect a node to itself
        if G.has_edge(n1, n2) or n1 == n2:
            continue
        G.add_edge(n1, n2, length=length, mode=mode)

    # Now we can remove all edges and nodes that predated the
    # coalescing operations (removing a node also drops its edges)
    for n in lookup:
        G.remove_node(n)

    # Populate the surviving synthetic nodes with their summary stats and
    # locational data; synthetic nodes that received no edge were never
    # added to the graph and are skipped
    for i, node in new_node_coords.items():
        if G.has_node(i):
            for key in node:
                G.nodes[i][key] = node[key]

    return G