def ensure_prov_networkx_graph(prov_doc):
    """Return a NetworkX graph for the given provenance input.

    Args:
        prov_doc: Either a ``ProvDocument``, which is converted with
            ``prov_to_graph``, or an ``nx.Graph`` instance (including
            subclasses), which is returned unchanged.

    Returns:
        The converted graph, or ``prov_doc`` itself when it already is a
        NetworkX graph.

    Raises:
        TypeError: If ``prov_doc`` is neither a ``ProvDocument`` nor an
            ``nx.Graph``.
    """
    if isinstance(prov_doc, ProvDocument):
        return prov_to_graph(prov_doc)
    # Validate the *input* object; an explicit raise (rather than an assert)
    # keeps the check active when Python runs with -O.
    if not isinstance(prov_doc, nx.Graph):
        raise TypeError(
            "Expected a ProvDocument or a NetworkX graph, got {}".format(
                type(prov_doc).__name__
            )
        )
    return prov_doc
def ensure_prov_networkx_graph(prov_doc):
    """Return a graph for ``prov_doc``.

    A ``ProvDocument`` is converted with ``prov_to_graph``; any other object
    is returned as-is without being checked.
    """
    # TODO Raise an exception when the input is not a NetworkX graph
    if not isinstance(prov_doc, ProvDocument):
        # Assuming we got a NetworkX graph already
        return prov_doc
    return prov_to_graph(prov_doc)
def test_simple_graph_conversion(self):
    """Check that documents survive a PROV -> graph -> PROV round trip.

    Every document produced by ``tests`` is converted to a graph; documents
    without bundles are additionally converted back and compared with the
    original.
    """
    for name, doc_func in tests:
        original = doc_func()
        # The forward conversion runs for every document, bundles or not.
        graph = prov_to_graph(original)
        if not original.has_bundles():
            # Documents containing bundles cannot be round-tripped, so only
            # bundle-free documents are converted back and compared.
            restored = graph_to_prov(graph)
            failure_msg = "Round trip graph conversion for '{}' failed.".format(name)
            self.assertEqual(restored, original, failure_msg)
def build_grakel_graphs(graphs: pd.DataFrame, dataset_path: Path):
    """Add a ``grakel_graphs`` column built from the files in ``graph_file``.

    If ``graphs`` already has a ``grakel_graphs`` column it is returned
    unchanged. Otherwise each entry of the ``graph_file`` column is resolved
    against ``dataset_path``, deserialized as a ``ProvDocument``, converted
    with ``prov_to_graph`` and passed to ``graph_from_prov_networkx_graph``.
    The new column is written into ``graphs`` in place, and the same
    DataFrame object is returned.
    """
    if "grakel_graphs" not in graphs.columns:
        graphs["grakel_graphs"] = [
            graph_from_prov_networkx_graph(
                prov_to_graph(ProvDocument.deserialize(dataset_path / file_name))
            )
            for file_name in graphs.graph_file
        ]
    return graphs
def test_simple_graph_conversion(self):
    """Smoke test: every document from ``tests`` converts without raising."""
    for _name, make_document in tests:
        document = make_document()
        prov_to_graph(document)
def version4(prov_doc, flat=False):
    """Compute the version-4 metric set for a provenance graph.

    Args:
        prov_doc: A ``ProvDocument`` (converted with ``prov_to_graph``) or any
            other object, which is assumed to be a NetworkX graph already.
        flat: When truthy, the result is passed through ``flatten_v4``.

    Returns:
        A dict of metrics (node counts per PROV type, graph size, components,
        diameter, clustering coefficients, degree assortativity, maximum
        shortest-path distances per type pair, path-length and degree
        distributions, and optionally the power-law fit of the degrees), or
        ``flatten_v4`` of that dict when ``flat`` is truthy.
    """
    # Anything that is not a ProvDocument is assumed to be a NetworkX graph
    graph = prov_to_graph(prov_doc) if isinstance(prov_doc, ProvDocument) else prov_doc
    results = dict()

    # Number of nodes per PROV type (missing types count as 0)
    nodes_per_type = defaultdict(int, Counter(map(type, graph.nodes())))
    results["entities"] = nodes_per_type[ProvEntity]
    results["agents"] = nodes_per_type[ProvAgent]
    results["activities"] = nodes_per_type[ProvActivity]

    # Graph size
    results["nodes"] = graph.number_of_nodes()
    results["edges"] = graph.size()

    undirected = graph.to_undirected(as_view=True)
    component_count = nx.number_connected_components(undirected)
    results["components"] = component_count
    # The diameter is only computed for a single connected component
    if component_count == 1:
        results["diameter"] = nx.diameter(undirected)
    else:
        results["diameter"] = -1

    # Clustering coefficients of all nodes, zero values excluded
    nonzero_cc = {
        node: coefficient
        for node, coefficient in nx.clustering(nx.Graph(undirected)).items()
        if coefficient
    }

    def coefficients_of(node_type):
        return [
            coefficient
            for node, coefficient in nonzero_cc.items()
            if isinstance(node, node_type)
        ]

    def mean_or_zero(values):
        return sum(values) / len(values) if values else 0

    results["average_clustering_coefficient"] = {
        "all": mean_or_zero(nonzero_cc.values()),
        "entity": mean_or_zero(coefficients_of(ProvEntity)),
        "activity": mean_or_zero(coefficients_of(ProvActivity)),
        "agent": mean_or_zero(coefficients_of(ProvAgent)),
    }

    # Degree assortativity; -1 stands in for "could not be computed"
    try:
        assortativity = nx.degree_pearson_correlation_coefficient(graph)
    except ValueError:
        assortativity = -1
    if not np.isfinite(assortativity):
        assortativity = -1
    results["degree_assortativity_coefficient"] = assortativity

    shortest_paths = nx.shortest_path(graph)

    def mfd(source_type, target_type):
        # Longest shortest-path distance between distinct nodes of the two
        # types; 0 when no such pair is connected.
        distances = [
            len(shortest_paths[source][target]) - 1
            for source in node_select(graph, source_type)
            if source in shortest_paths
            for target in node_select(graph, target_type)
            if target in shortest_paths[source] and source != target
        ]
        return max(distances) if distances else 0

    labelled_types = (
        ("entity", ProvEntity),
        ("activity", ProvActivity),
        ("agent", ProvAgent),
    )
    results["mfd"] = {
        source_label: {
            target_label: mfd(source_type, target_type)
            for target_label, target_type in labelled_types
        }
        for source_label, source_type in labelled_types
    }

    def path_lengths(paths):
        # Edge count of every path in a {source: {target: path}} mapping
        return [
            len(paths[source][target]) - 1
            for source in paths
            for target in paths[source]
        ]

    distributions = {
        # Path length distributions of derivations
        "derivations": path_lengths(
            paths_select(graph, ProvEntity, ProvEntity, ProvDerivation)
        ),
        # Path length distributions of usages
        "activity_entity": path_lengths(
            paths_select(graph, ProvActivity, ProvEntity, (ProvDerivation, ProvUsage))
        ),
        # Path length distributions of attributions
        "entity_agent": path_lengths(
            paths_select(graph, ProvEntity, ProvAgent, (ProvDerivation, ProvAttribution))
        ),
        # Node degree distribution (undirected)
        "node_degrees": list(dict(undirected.degree()).values()),
    }
    results["distributions"] = distributions

    # The power law exponent of node degrees
    degree_fit = powerlaw.Fit(
        distributions["node_degrees"], discrete=True, verbose=False
    )
    if not math.isnan(degree_fit.alpha):
        # Only report the fit when the power law compares favourably against
        # an exponential distribution
        ratio, p_value = degree_fit.distribution_compare("power_law", "exponential")
        if ratio > 0 and p_value < 0.05:
            results["node_degrees_powerlaw"] = {
                "alpha": degree_fit.alpha,
                "sigma": degree_fit.sigma,
            }

    return flatten_v4(results) if flat else results
def version1(prov_doc):
    """Compute the version-1 metric set for a provenance document.

    ``prov_doc`` is converted with ``prov_to_graph``. The returned dict holds
    the graph size, the number of connected components, the diameter (-1
    unless there is exactly one component), the maximum shortest-path
    distances per PROV type pair and three path-length distributions.
    """
    graph = prov_to_graph(prov_doc)
    results = dict()

    # Graph size
    results["nodes"] = graph.number_of_nodes()
    results["edges"] = graph.size()

    undirected = nx.Graph(graph)
    component_count = nx.number_connected_components(undirected)
    results["components"] = component_count
    # The diameter is only computed for a single connected component
    if component_count == 1:
        results["diameter"] = nx.diameter(nx.Graph(undirected))
    else:
        results["diameter"] = -1

    shortest_paths = nx.shortest_path(graph)

    def mfd(source_type, target_type):
        # Longest shortest-path distance between distinct nodes of the two
        # types; 0 when no such pair is connected.
        distances = [
            len(shortest_paths[source][target]) - 1
            for source in node_select(graph, source_type)
            if source in shortest_paths
            for target in node_select(graph, target_type)
            if target in shortest_paths[source] and source != target
        ]
        return max(distances) if distances else 0

    labelled_types = (
        ("entity", ProvEntity),
        ("activity", ProvActivity),
        ("agent", ProvAgent),
    )
    results["mfd"] = {
        source_label: {
            target_label: mfd(source_type, target_type)
            for target_label, target_type in labelled_types
        }
        for source_label, source_type in labelled_types
    }

    def path_lengths(paths):
        # Edge count of every path in a {source: {target: path}} mapping
        return [
            len(paths[source][target]) - 1
            for source in paths
            for target in paths[source]
        ]

    results["distributions"] = {
        "derivations": path_lengths(
            paths_select(graph, ProvEntity, ProvEntity, ProvDerivation)
        ),
        "activity_entities": path_lengths(
            paths_select(graph, ProvActivity, ProvEntity, (ProvDerivation, ProvUsage))
        ),
        "entities_agent": path_lengths(
            paths_select(graph, ProvEntity, ProvAgent, (ProvDerivation, ProvGeneration))
        ),
    }
    return results
def version2(prov_doc):
    """Compute the version-2 metric set for a provenance graph.

    Args:
        prov_doc: A ``ProvDocument`` (converted with ``prov_to_graph``) or any
            other object, which is assumed to be a NetworkX graph already.

    Returns:
        A dict with the graph size, the number of connected components, the
        diameter (-1 unless there is exactly one component), the maximum
        shortest-path distances per PROV type pair, the path-length and node
        degree distributions (all plain lists) and, when the degrees compare
        favourably to a power law, the fitted ``alpha`` and ``sigma``.
    """
    results = dict()
    g = prov_to_graph(prov_doc) if isinstance(prov_doc, ProvDocument) else prov_doc

    # Graph size
    results["nodes"] = g.number_of_nodes()
    results["edges"] = g.size()

    ug = nx.Graph(g)
    n_comps = nx.number_connected_components(ug)
    results["components"] = n_comps
    # The diameter is only computed for a single connected component; ``ug``
    # is already a simple undirected graph, so no further copy is needed.
    results["diameter"] = nx.diameter(ug) if n_comps == 1 else -1

    s_paths = nx.shortest_path(g)

    def mfd(t1, t2):
        # Longest shortest-path distance between distinct nodes of the two
        # types; 0 when no such pair is connected.
        s_distances = [
            len(s_paths[i][j]) - 1
            for i in node_select(g, t1)
            if i in s_paths
            for j in node_select(g, t2)
            if j in s_paths[i] and i != j
        ]
        return max(s_distances) if s_distances else 0

    results["mfd"] = {
        "entity": {
            "entity": mfd(ProvEntity, ProvEntity),
            "activity": mfd(ProvEntity, ProvActivity),
            "agent": mfd(ProvEntity, ProvAgent),
        },
        "activity": {
            "entity": mfd(ProvActivity, ProvEntity),
            "activity": mfd(ProvActivity, ProvActivity),
            "agent": mfd(ProvActivity, ProvAgent),
        },
        "agent": {
            "entity": mfd(ProvAgent, ProvEntity),
            "activity": mfd(ProvAgent, ProvActivity),
            "agent": mfd(ProvAgent, ProvAgent),
        },
    }

    distributions = dict()

    # Path length distributions of derivations
    der_paths = paths_select(g, ProvEntity, ProvEntity, ProvDerivation)
    distributions["derivations"] = [
        len(der_paths[i][j]) - 1 for i in der_paths for j in der_paths[i]
    ]

    # Path length distributions of usages
    aee_paths = paths_select(g, ProvActivity, ProvEntity, (ProvDerivation, ProvUsage))
    distributions["activity_entities"] = [
        len(aee_paths[i][j]) - 1 for i in aee_paths for j in aee_paths[i]
    ]

    # Path length distributions of attributions
    eeag_paths = paths_select(g, ProvEntity, ProvAgent, (ProvDerivation, ProvAttribution))
    distributions["entities_agent"] = [
        len(eeag_paths[i][j]) - 1 for i in eeag_paths for j in eeag_paths[i]
    ]

    # Node degree distribution (undirected). Materialised as a list: on
    # Python 3 ``dict.values()`` is a view, which cannot be converted to a
    # numeric array for the power-law fit and is not serialisable.
    distributions["node_degrees"] = list(dict(ug.degree()).values())

    results["distributions"] = distributions

    # The power law exponent of node degrees
    power_law_fit = powerlaw.Fit(
        distributions["node_degrees"], discrete=True, verbose=False
    )
    if not math.isnan(power_law_fit.alpha):
        # Check if the distribution is likely to be following the power law
        R, p = power_law_fit.distribution_compare("power_law", "exponential")
        if R > 0 and p < 0.05:
            results["node_degrees_powerlaw"] = {
                "alpha": power_law_fit.alpha,
                "sigma": power_law_fit.sigma,
            }
    return results