def test_estimate_order_strongly_connected():
    """
    Path set with a single strongly connected component in the first-order
    network and two connected components in the second-order network
    (exercised through the camelCase API).
    """
    observed = pp.Paths()
    for sequence in ('a,b,c', 'b,c,b', 'c,b,a', 'b,a,b',
                     'e,b,f', 'b,f,b', 'f,b,e', 'b,e,b'):
        observed.addPath(sequence)

    # First-order model, reduced to its giant connected component.
    first_order = pp.HigherOrderNetwork(observed, k=1)
    first_order.reduceToGCC()
    assert first_order.vcount() == 5, \
        "Error, wrong number of nodes in first-order network"
    assert first_order.ecount() == 8, \
        "Error, wrong number of links in first-order network"

    # Second-order model, reduced to its giant connected component.
    second_order = pp.HigherOrderNetwork(observed, k=2)
    second_order.reduceToGCC()
    assert second_order.vcount() == 4, \
        "Error, wrong number of nodes in second-order network"
    assert second_order.ecount() == 4, \
        "Error, wrong number of links in second-order network"

    # Mapping of higher-order nodes and paths back to first-order paths.
    node_as_path = second_order.HigherOrderNodeToPath('a-b')
    assert node_as_path == ('a', 'b'), \
        "Error: mapping from higher-order node to first-order path failed"
    unfolded = second_order.HigherOrderPathToFirstOrder(('a-b', 'b-c'))
    assert unfolded == ('a', 'b', 'c'), \
        "Error: mapping from higher-order path to first-order path failed"
def test_estimate_order_strongly_connected():
    """
    Path set with a single strongly connected component in the first-order
    network and two connected components in the second-order network
    (exercised through the snake_case API).
    """
    observed = pp.Paths()
    for sequence in ('a,b,c', 'b,c,b', 'c,b,a', 'b,a,b',
                     'e,b,f', 'b,f,b', 'f,b,e', 'b,e,b'):
        observed.add_path(sequence)

    reduce_to_gcc = pp.algorithms.components.reduce_to_gcc

    # First-order model, reduced to its giant connected component.
    first_order = pp.HigherOrderNetwork(observed, k=1)
    reduce_to_gcc(first_order)
    assert first_order.ncount() == 5, \
        "Error, wrong number of nodes in first-order network"
    assert first_order.ecount() == 8, \
        "Error, wrong number of links in first-order network"

    # Second-order model, reduced to its giant connected component.
    second_order = pp.HigherOrderNetwork(observed, k=2)
    reduce_to_gcc(second_order)
    assert second_order.ncount() == 4, \
        "Error, wrong number of nodes in second-order network"
    assert second_order.ecount() == 4, \
        "Error, wrong number of links in second-order network"

    # Mapping of higher-order nodes and paths back to first-order paths.
    node_as_path = second_order.higher_order_node_to_path('a,b')
    assert node_as_path == ('a', 'b'), \
        "Error: mapping from higher-order node to first-order path failed"
    unfolded = second_order.higher_order_path_to_first_order(('a,b', 'b,c'))
    assert unfolded == ('a', 'b', 'c'), \
        "Error: mapping from higher-order path to first-order path failed"
def test_distance_matrix_equal_across_objects(random_paths):
    """Distance matrices built from two separately generated path objects
    (same generator arguments, different instances) must compare equal."""
    # Each stage is completed for both instances before the next one starts,
    # matching the call order of the straight-line version.
    path_sets = [random_paths(40, 20, num_nodes=9) for _ in range(2)]
    networks = [pp.HigherOrderNetwork(paths=ps, k=1) for ps in path_sets]
    matrices = [shortest_paths.distance_matrix(net) for net in networks]

    first, second = matrices
    assert first == second
def test_distance_matrix_first_order(random_paths, n_nodes, k, paths, e_sum):
    """Order-k distances between first-order nodes are never shorter than the
    order-1 distances, and the finite order-k distances add up to ``e_sum``."""
    generated = random_paths(paths, 10, n_nodes)
    hon_k = pp.HigherOrderNetwork(generated, k=k)
    hon_1 = pp.HigherOrderNetwork(generated, k=1)

    dist_k = shortest_paths.distance_matrix(hon_k)
    dist_1 = shortest_paths.distance_matrix(hon_1)

    finite_total = 0
    for source in hon_1.nodes:
        for target in hon_1.nodes:
            d_high = dist_k[source][target]
            assert dist_1[source][target] <= d_high, \
                "not all distances at order k are at least as long as at order 1"
            # Infinite entries are left out of the sum.
            if d_high < np.inf:
                finite_total += d_high

    assert finite_total == e_sum
def test_laplacian_matrix(random_paths):
    """Sign checks on the dense Laplacian: positive trace, negative sums
    strictly below and strictly above the diagonal."""
    network = pp.HigherOrderNetwork(random_paths(30, 10, 5), k=1)
    laplacian = network.laplacian_matrix().toarray()

    strictly_lower = np.tril(laplacian, k=-1)
    strictly_upper = np.triu(laplacian, k=1)

    assert np.trace(laplacian) > 0
    assert strictly_lower.sum() < 0
    assert strictly_upper.sum() < 0
def test_get_adjacency_mat(random_paths, paths, k_order, sub, num_nodes, s_sum, s_mean):
    """The adjacency matrix must reproduce the expected total and mean entry."""
    network = pp.HigherOrderNetwork(random_paths(paths, 10, num_nodes), k=k_order)
    adjacency = network.adjacency_matrix(include_subpaths=sub)

    total = adjacency.sum()
    average = adjacency.mean()

    assert total == s_sum
    assert average == pytest.approx(s_mean)
def test_fiedler_vector_dense(random_paths, k, e_sum, e_var):
    """Variance and sum of the densely computed Fiedler vector must match the
    expected values within ``EIGEN_ABS_TOL``."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(90, 0, 20), k=k)
    fiedler = pp.algorithms.spectral.fiedler_vector_dense(network)

    observed_var = fiedler.var()
    assert observed_var == pytest.approx(e_var, abs=EIGEN_ABS_TOL)

    observed_sum = np.sum(fiedler)
    assert observed_sum == pytest.approx(e_sum, abs=EIGEN_ABS_TOL)
def test_strong_connected_tmp(random_temp_network):
    """Cross-check the size of the largest strongly connected component found
    by pathpy against NetworkX on second-order networks extracted from a
    random temporal network, for a range of ``delta`` values."""
    from pathpy.path_extraction.temporal_paths import paths_from_temporal_network_dag
    from pathpy.algorithms.components import connected_components
    from pathpy.classes.network import network_to_networkx
    from networkx import strongly_connected_components
    from pathpy.utils.log import Log, Severity

    Log.set_min_severity(Severity.WARNING)

    for delta in range(1, 900, 50):
        print(delta)
        tn = random_temp_network(n=10, m=100, min_t=0, max_t=800, seed=90)  # type: pp.TemporalNetwork

        # Sorted last field of every temporal edge; not used further below.
        obs_times = np.array([edge[-1] for edge in tn.tedges])
        obs_times.sort()

        extracted = paths_from_temporal_network_dag(tn, delta=delta)
        second_order = pp.HigherOrderNetwork(extracted, k=2)

        # Reference result via NetworkX.
        as_networkx = network_to_networkx(second_order)
        largest_nx = max(strongly_connected_components(as_networkx), key=len)
        giant_size_nx = len(largest_nx)

        # Result via pathpy.
        components = connected_components(second_order)

        # Only giants with more than three nodes are compared.
        if giant_size_nx <= 3:
            continue

        print(giant_size_nx)
        giant_size_pp = max(len(component) for component in components)
        assert giant_size_nx == giant_size_pp
def main():
    """Build the order-3 graph with three implementations and report whether
    they agree.

    The baseline and divide-and-conquer constructions are fed the
    ``;``-separated rows of ``basic_path_data``; the pathpy network is built
    from a ``Paths`` object read by pathpy itself. The edge dictionaries are
    then compared pairwise against the baseline and the verdicts are printed.
    """
    # TODO: make the input file a variable/command line argument?
    # Read input path data; the context manager guarantees the handle is
    # closed even if parsing fails (the handle used to be leaked).
    with open("basic_path_data", "r") as my_file:
        path_data = [line.split(';') for line in my_file.read().splitlines()]

    # Create first graph using the basic construction algorithm
    print("Creating baseline graph")
    basic_graph = basic_db_graph_construction.constructDBGraph(path_data, 3)
    print("Finished")
    print()

    # Create second graph using the Divide and Conquer algorithm
    print("Creating graph using Divide and Conquer Algorithm")
    divide_conquer_graph = divide_and_conqer_db_graph_construction.constructDBGraph(
        path_data, 3)
    print("Finished")
    print()

    # Create the third graph using pathpy
    # NOTE(review): pathpy reads "paths_finished.txt" while the two graphs
    # above are built from "basic_path_data" -- confirm that comparing graphs
    # built from different input files is intended.
    print("Reading path data from file into Paths object")
    paths = pp.Paths.read_file(filename="paths_finished.txt",
                               separator=';',
                               frequency=False,
                               expand_sub_paths=False)
    print("Finished")
    print()
    print(paths)
    print()

    print("Creating HigherOrderNetwork using pathpy")
    pathpy_graph = pp.HigherOrderNetwork(paths, 3)
    print("Finished")
    print()

    # Convert the pathpy edges so they can be compared with the other graphs.
    comparable_pathpy_edges = convert_pathpy_edges_to_multiset(
        pathpy_graph.edges)

    basic_equals_divide_conquer = compare_dictionary_graphs(
        basic_graph, divide_conquer_graph)
    basic_equals_pathpy = compare_dictionary_graphs(basic_graph,
                                                    comparable_pathpy_edges)

    if basic_equals_divide_conquer:
        print("BASIC AND DIVIDE CONQUER ARE THE SAME")
    else:
        print("BASIC AND DIVIDE CONQUER ARE DIFFERENT")

    if basic_equals_pathpy:
        print("BASIC AND PATHPY ARE THE SAME")
    else:
        print("BASIC AND PATHPY ARE DIFFERENT")
def test_algebraic_connectivity(random_paths, k, e_sum):
    """Algebraic connectivity of the order-``k`` network must match ``e_sum``
    to a relative tolerance of 1e-7."""
    import pathpy

    network = pp.HigherOrderNetwork(random_paths(120, 0, 40), k=k)
    connectivity = pp.algorithms.spectral.algebraic_connectivity(
        network,
        lanczos_vectors=60,
        maxiter=40,
    )

    expected = pytest.approx(e_sum, rel=1e-7)
    assert connectivity == expected
def test_eigen_centrality_hon(random_paths, sub, projection, k, e_sum, e_var):
    """Sum and variance of the eigenvector centralities must match the
    expected values within ``EIGEN_ABS_TOL``."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(50, 0, 8), k=k)
    centrality = pp.algorithms.centralities.eigenvector(network, projection, sub)

    scores = np.array(list(centrality.values()))
    observed_sum = scores.sum()
    observed_var = scores.var()

    assert observed_sum == pytest.approx(e_sum, abs=EIGEN_ABS_TOL)
    assert observed_var == pytest.approx(e_var, abs=EIGEN_ABS_TOL)
def test_closeness_centrality_hon(random_paths, k, e_sum, e_var):
    """Sum and variance of the closeness centralities must match the expected
    values."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(50, 0, 8), k=k)
    centrality = pp.algorithms.centralities.closeness(network)

    scores = np.array(list(centrality.values()))
    observed_sum = scores.sum()
    observed_var = scores.var()

    assert observed_sum == pytest.approx(e_sum)
    assert observed_var == pytest.approx(e_var)
def test_distance_matrix_from_file(path_from_edge_file):
    """Sum, minimum and maximum of the first-order distance matrix built from
    the edge-file fixture."""
    network = pp.HigherOrderNetwork(paths=path_from_edge_file, k=1)
    distances = dict_of_dicts_to_matrix(shortest_paths.distance_matrix(network))

    total = np.sum(distances)
    smallest = np.min(distances)
    largest = np.max(distances)

    assert total == 8
    assert smallest == 0
    assert largest == 2
def test_distance_matrix(random_paths, paths, n_nodes, k, e_var, e_sum):
    """Variance and sum of the order-``k`` distance matrix must match the
    expected values."""
    generated = random_paths(paths, 20, num_nodes=n_nodes)
    network = pp.HigherOrderNetwork(paths=generated, k=k)
    distances = dict_of_dicts_to_matrix(shortest_paths.distance_matrix(network))

    observed_var = np.var(distances)
    assert observed_var == pytest.approx(e_var)

    observed_sum = np.sum(distances)
    assert observed_sum == e_sum
def test_eigen_value_gap(random_paths, k, sub, e_gap):
    """The eigenvalue gap of the order-``k`` network must be truthy."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(200, 0, 40), k=k)

    # Fix NumPy's global random state right before the eigenvalue computation.
    np.random.seed(0)
    gap = pp.algorithms.spectral.eigenvalue_gap(
        network,
        include_sub_paths=sub,
        lanczos_vectors=90,
    )

    # NOTE(review): ``e_gap`` is accepted but never compared against; only
    # truthiness of the result is checked -- confirm this is intentional.
    assert gap
def test_pagerank_centrality_hon(random_paths, sub, proj, k, e_sum, e_var):
    """Sum and variance of the PageRank scores must match the expected
    values."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(50, 0, 8), k=k)
    ranks = pp.algorithms.centralities.pagerank(
        network,
        include_sub_paths=sub,
        projection=proj,
    )

    scores = np.array(list(ranks.values()))
    observed_sum = scores.sum()
    observed_var = scores.var()

    assert observed_sum == pytest.approx(e_sum)
    assert observed_var == pytest.approx(e_var)
def test_betweenness_centrality_hon(random_paths, norm, k, e_sum, e_var, e_max):
    """Sum, maximum and variance of the betweenness centralities must match
    the expected values."""
    import numpy as np

    network = pp.HigherOrderNetwork(random_paths(50, 0, 8), k=k)
    centrality = pp.algorithms.centralities.betweenness(network, normalized=norm)

    scores = np.array(list(centrality.values()))
    observed_sum = scores.sum()
    assert observed_sum == pytest.approx(e_sum)

    observed_max = max(scores)
    assert observed_max == pytest.approx(e_max)

    observed_var = scores.var()
    assert observed_var == pytest.approx(e_var)
def test_shortest_path_length(random_paths, paths, k, num_nodes, s_mean, s_var, s_max):
    """Mean, variance and maximum of the ``len``-aggregated shortest-path
    matrix must match the expected values."""
    generated = random_paths(paths, 10, num_nodes=num_nodes)
    network = pp.HigherOrderNetwork(generated, k=k)

    # Every entry is aggregated with ``len``.
    lengths = dict_of_dicts_to_matrix(shortest_paths.shortest_paths(network), agg=len)

    observed_mean = np.mean(lengths)
    assert observed_mean == pytest.approx(s_mean)

    observed_var = np.var(lengths)
    assert observed_var == pytest.approx(s_var)

    observed_max = np.max(lengths)
    assert observed_max == s_max
def test_distance_matrix_first_order_eq_dist_matrix(random_paths, paths, num_nodes):
    """Two distance matrices computed on the same first-order network must be
    numerically close."""
    # NOTE(review): the previous docstring said the k=1 distance matrix is
    # compared with "distance_matrix_first_order", yet both matrices come from
    # shortest_paths.distance_matrix -- confirm whether a second function
    # was meant to be exercised here.
    network = pp.HigherOrderNetwork(random_paths(paths, 10, num_nodes), k=1)

    dist = shortest_paths.distance_matrix(network)
    dist_alt = shortest_paths.distance_matrix(network)

    matrix = dict_of_dicts_to_matrix(dist)
    matrix_alt = dict_of_dicts_to_matrix(dist_alt)
    assert np.allclose(matrix, matrix_alt)
def test_estimate_order_2():
    """Order estimation on paths with second-order correlations, on a random
    Markov sequence, and the resulting first-/second-order network sizes."""
    # Example with second-order correlations
    paths = pp.Paths()
    for short_path in ('a,c', 'b,c', 'c,d', 'c,e'):
        paths.addPath(short_path)
    for _ in range(4):
        paths.addPath('a,c,d')
        paths.addPath('b,c,e')

    model = pp.MultiOrderModel(paths, maxOrder=2)
    assert model.estimateOrder(paths) == 2, \
        "Error, did not detect second-order correlations"

    # Sequence of 100000 random draws from range(10), as strings.
    symbols = [str(value) for value in _np.random.choice(range(10), 100000)]
    sequence = pp.MarkovSequence(symbols)
    for criterion in ('BIC', 'AIC'):
        assert sequence.estimateOrder(maxOrder=2, method=criterion) == 1, \
            "Error, wrongly detected higher-order correlations"

    first_order = pp.HigherOrderNetwork(paths, k=1)
    assert first_order.vcount() == 5, \
        "Error, wrong number of nodes in first-order network"
    assert first_order.ecount() == 4, \
        "Error, wrong number of links in first-order network"

    second_order = pp.HigherOrderNetwork(paths, k=2)
    assert second_order.vcount() == 4, \
        "Error, wrong number of nodes in second-order network"
    assert second_order.ecount() == 2, \
        "Error, wrong number of links in second-order network"

    second_order.reduceToGCC()
    assert second_order.vcount() == 1, \
        "Error, wrong number of nodes in giant connected component"
    assert second_order.ecount() == 0, \
        "Error, wrong number of links in giant connected component"
def test_transition_probability(random_paths, k, sub):
    """Entries of the transition matrix lie in [0, 1] and its total equals the
    number of nodes with positive out-weight."""
    network = pp.HigherOrderNetwork(random_paths(30, 45, 14), k=k)
    T = network.transition_matrix(include_subpaths=sub).toarray()

    def has_outgoing(weights):
        # With sub-paths the summed out-weight counts; otherwise only the
        # entry at index 1 is considered.
        if sub:
            return weights.sum() > 0
        return weights[1] > 0

    transitions = sum(
        has_outgoing(network.nodes[node]["outweight"]) for node in network.nodes
    )

    assert T.sum() == pytest.approx(transitions)
    assert np.all(T <= 1), "not all probabilities are smaller then 1"
    assert np.all(T >= 0), "not all probabilities are positive"
def main():
    """Time graph construction for orders 1..K with three implementations.

    Reads ``paths_finished.txt``, builds the models of every order from 1 to
    ``K`` with the basic and the divide-and-conquer constructions, then reads
    the same file with pathpy and builds a single ``HigherOrderNetwork`` of
    order ``K``. Wall-clock durations are printed for each stage.
    """
    start_time = time.time()
    # The context manager closes the handle (it used to be leaked); the read
    # stays inside the timed section.
    with open("paths_finished.txt", "r") as my_file:
        path_data = [line.split(';') for line in my_file.read().splitlines()]
    end_time = time.time()
    print("Read file in " + str(round(end_time - start_time, 2)) + " seconds")
    print(path_data[:10])

    K = 10

    # constructing each of the models from orders 1-10 using a basic method
    start_time = time.time()
    basic_models = [
        basic_db_graph_construction.constructDBGraph(path_data, k)
        for k in range(1, K + 1)
    ]
    end_time = time.time()
    print("Basic: Constructed " + str(K) + "th order graph in " +
          str(round(end_time - start_time, 2)) + " seconds")

    # constructing each of the models from order 1-10 using divide and conquer
    start_time = time.time()
    dc_models = [
        divide_and_conqer_db_graph_construction.constructDBGraph(path_data, k)
        for k in range(1, K + 1)
    ]
    end_time = time.time()
    print("Divide and Conquer: Constructed " + str(K) + "th order graph in " +
          str(round(end_time - start_time, 2)) + " seconds")

    # pathpy: read the path statistics from the same file
    start_time = time.time()
    paths = pp.Paths.read_file(filename="paths_finished.txt",
                               separator=';',
                               frequency=False,
                               expand_sub_paths=False)
    end_time = time.time()
    print("Pathpy: Read file in " + str(round(end_time - start_time, 2)) +
          " seconds")

    # pathpy: build the order-K network
    start_time = time.time()
    pathpy_graph = pp.HigherOrderNetwork(paths, K)
    end_time = time.time()
    print("Pathpy: Constructed " + str(K) + "th order graph in " +
          str(round(end_time - start_time, 2)) + " seconds")
def test_extract_distribute(test_data_directory, ):
    """Paths extracted from the origin-destination file over the example
    network carry the expected counts."""
    network_path = os.path.join(test_data_directory, 'example_network.edges')
    od_path = os.path.join(test_data_directory, 'example_origin_destination.csv')

    # read the network topology
    topology = pp.Paths.read_edges(network_path, undirected=True)
    network = pp.HigherOrderNetwork(topology)

    OD = pp.path_extraction.read_origin_destination(od_path)
    paths = pp.path_extraction.paths_from_origin_destination(OD, network)

    by_length = paths.paths
    via_b = ('A', 'B', 'F', 'H')
    via_c = ('A', 'C', 'G', 'H')

    # The two A->H routes carry 2.0 and 3.0 in either assignment.
    assert (by_length[3][via_b][1] == 2.0 and by_length[3][via_c][1] == 3.0) or \
        (by_length[3][via_b][1] == 3.0 and by_length[3][via_c][1] == 2.0)

    assert by_length[3][('D', 'B', 'C', 'E')][1] == 7.0
    assert by_length[2][('A', 'B', 'F')][1] == 3.0
    assert by_length[2][('B', 'C', 'E')][1] == 3.0
def test_model_size(random_paths, k, n_nodes, expected):
    """``model_size()`` of the order-``k`` network must be close to the
    expected value."""
    network = pp.HigherOrderNetwork(random_paths(20, 10, n_nodes), k=k)
    size = network.model_size()
    assert np.allclose(size, expected)
def test_degrees(path_from_edge_file):
    """Entry 1 of every node's ``outweight`` must equal the expected degree."""
    expected_degrees = {'1': 52, '2': 0, '3': 2, '5': 5}
    network = pp.HigherOrderNetwork(path_from_edge_file, k=1)

    for node in network.nodes:
        observed = network.nodes[node]["outweight"][1]
        assert expected_degrees[node] == observed, \
            "Wrong degree calculation in HigherOrderNetwork"
""") #%% md(""" The data analysis and modelling framework outlined in these works builds on a generalisation of standard, first-order networks to $k$-dimensional De Bruijn graph models for paths in complex networks. The class `HigherOrderNetwork` allows us to generate such higher-order network models of paths. In the documentation, we find that the constructor takes a parameter `paths`, i.e. the statistics of the observed paths that we want to model. With the parameter `k` we specify the order $k$ of the higher-order model that we want to fit. To understand this better, let us do this for our toy example. <span style="color:red">**TODO:** Read the toy example from unit 1.2 from the file `data/toy_paths.ngram`, generate a **first-order** model instance `hon_1` and print a summary of the resulting instance.</span> """) #%% In [2] toy_paths = pp.Paths.read_file('data/toy_paths.ngram') print(toy_paths) hon_1 = pp.HigherOrderNetwork(toy_paths, k=1) print(hon_1) #%% md(""" This generates a first-order model of our paths, with five nodes $a,b,c,d$ and $e$, and four links $(a,c), (b,c), (c,d), (c,e)$. It is identicaly to the `Network` instance that we have previously created using `Network.from_paths`. Indeed, each `HigherOrderNetwork` instance is derived from the class `Network`, which means we can store edge and node attributes and visualise it by exactly the same methods. <span style="color:red">**TODO:** Plot the `HigherOrderModel` instance `hon_1` and print the weight of all edges.</span> """) #%% In [3] style = { 'label_offset': [0,-1], 'label_color' : 'black', 'width': 800, 'height': 250} pp.visualisation.plot(hon_1, **style) for e in hon_1.edges: print(e, hon_1.edges[e]['weight'])
# Load the temporal network and plot it.
t = pp.TemporalNetwork.read_file('data/temporal_clusters.tedges')
style = dict(max_time=250, ms_per_frame=10, ts_per_frame=1)
pp.visualisation.plot(t, **style)

#%% In [17]
# Generate a walk of length parameter 500 on the temporal network and plot
# it on the network derived from ``t``, with a larger per-frame duration.
walk = pp.algorithms.temporal_walk.generate_walk(t, 500)
style.update(ms_per_frame=250)
pp.visualisation.plot_walk(pp.Network.from_temporal_network(t), walk, **style)

#%% In [18]
p = pp.path_extraction.paths_from_temporal_network_dag(t)
hon_2 = pp.HigherOrderNetwork(p, k=2)


def _cluster_color(node):
    # Names shorter than two characters are red; the rest are green when the
    # name starts with '1' and blue otherwise.
    if len(node) < 2:
        return 'red'
    return 'green' if node.startswith('1') else 'blue'


clusters = {v: _cluster_color(v) for v in p.nodes}
pp.visualisation.plot(hon_2, plot_higher_order_nodes=False, node_color=clusters)

#%% In [19]
pp.visualisation.plot_walk(hon_2, walk, **style, plot_higher_order_nodes=False)

#%% In [20]
hon_3 = pp.HigherOrderNetwork(p, k=3)
pp.visualisation.plot(hon_3, plot_higher_order_nodes=False, node_color=clusters)

#%% In [21]
print('Second-order model: {0}'.format(
    pp.algorithms.spectral.algebraic_connectivity(hon_2)))
def run(card_text):
    """Rank the sentences of ``card_text`` with PageRank and return the
    indices of the highest-ranked ones in ascending order.

    Steps performed:

    1. The stripped items of ``card_text`` are joined with spaces, split on
       ``'. '`` and written one sentence per line to ``cardSpaced.txt``,
       which is then read back.
    2. Every sentence index is written to ``sentenceNodes.txt``.
    3. A similarity score is computed for every ordered pair of sentences
       (self-pairs included); the pairs are written, highest score first and
       with the score rounded to an integer, to
       ``sentenceSortedEdgeWeights.txt``.
    4. That file is loaded through pathpy, a first-order network is built and
       weighted PageRank is computed (and printed).
    5. The top ``int(len(sentences) / 3.5)`` node names are converted to
       ``int`` and returned sorted.

    ``sentenceAllEdgeWeights.txt`` and ``keySentences.txt`` are created empty,
    as before.

    Args:
        card_text: iterable of text pieces; each is stripped and the pieces
            are joined with single spaces. (Presumably a list of lines --
            verify against callers.)

    Returns:
        Sorted list of ``int`` sentence indices.
    """
    numFrac = 3.5

    def similarity(s1, s2):
        """Count the items of ``s1`` that occur in ``s2``, divided by
        ``log(len(s1)) + log(len(s2))``."""
        # NOTE(review): the sentences are plain strings, so the "items" are
        # single characters -- confirm whether word overlap was intended.
        denom = math.log(len(s1)) + math.log(len(s2))
        present = set(s2)
        count = sum(1 for item in s1 if item in present)
        return count / denom

    # Step 1: one sentence per line, then read the lines back.
    combinedParagraphs = " ".join(line.strip() for line in card_text)
    with open('cardSpaced.txt', 'w') as spaced_file:
        for sentence in combinedParagraphs.split('. '):
            spaced_file.write(sentence.strip() + "." + "\n")
    with open('cardSpaced.txt', 'r') as spaced_file:
        sentences = spaced_file.readlines()

    numSentences = int(len(sentences) / numFrac)

    # Step 2: node list. The handle is now closed, so the data is flushed.
    with open('sentenceNodes.txt', 'w') as node_file:
        for index in range(len(sentences)):
            node_file.write(str(index) + "\n")

    # This file was only ever opened, never written; keep creating it empty.
    with open('sentenceAllEdgeWeights.txt', 'w'):
        pass

    # Step 3: score all ordered pairs. enumerate() supplies the true position
    # of each sentence; list.index() returned the first occurrence, which
    # gave wrong indices for duplicate sentences and made the loop cubic.
    scored_pairs = [
        (i1, i2, similarity(sent1, sent2))
        for i1, sent1 in enumerate(sentences)
        for i2, sent2 in enumerate(sentences)
    ]
    # Ascending stable sort followed by reverse() keeps the original ordering
    # of ties (sort(reverse=True) would order ties differently).
    scored_pairs.sort(key=lambda pair: pair[2])
    scored_pairs.reverse()

    with open('sentenceSortedEdgeWeights.txt', 'w') as edge_file:
        for i1, i2, score in scored_pairs:
            # Second index first, score rounded to the nearest integer.
            edge_file.write(str(i2) + "," + str(i1) + "," + str(round(score)) + "\n")

    # Step 4: weighted PageRank on the first-order network.
    paths = pp.Paths.read_edges('sentenceSortedEdgeWeights.txt', weight=True)
    network = pp.HigherOrderNetwork(paths, k=1)
    prDict = pp.algorithms.centralities.pagerank(network, weighted=True)
    print(prDict)

    ranked = [[node, prDict[node]] for node in prDict]
    ranked.sort(key=lambda entry: float(entry[1]))
    ranked.reverse()

    # This file was only ever opened, never written; keep creating it empty.
    with open('keySentences.txt', 'w'):
        pass

    # Step 5: indices of the top-ranked sentences, ascending.
    return sorted(int(node) for node, _ in ranked[:numSentences])
import pathpy as pp
import time

# Read the ';'-separated paths file into a pathpy Paths object.
paths = pp.Paths.read_file(
    filename="paths_finished.txt",
    separator=';',
    frequency=False,
    expand_sub_paths=False,
)

# Time only the construction of the order-3 network.
start_time = time.time()
graph = pp.HigherOrderNetwork(paths, 3)
end_time = time.time()

elapsed = round(end_time - start_time, 2)
print("Running time of pathpy: " + str(elapsed) + " seconds")

# Print the edges as a plain dict.
print(dict(graph.edges.items()))
#%% In [1]
import pathpy as pp

# Toy example: two paths, each added with the value 2.
toy_paths = pp.Paths()
toy_paths.add_path('a,c,d', 2)
toy_paths.add_path('b,c,e', 2)
print(toy_paths)

#%% In [2]
# Model built with the default order; plot it and show its transition matrix.
hon_1 = pp.HigherOrderNetwork(toy_paths)
pp.visualisation.plot(hon_1)
print(hon_1.transition_matrix())

#%% In [3]
print(hon_1.likelihood(toy_paths, log=False))

#%% In [4]
# Second-order model fitted to the same paths.
hon_2 = pp.HigherOrderNetwork(toy_paths, k=2)
print(hon_2.transition_matrix())
hon_2.likelihood(toy_paths, log=False)

#%% In [5]
# Second-order null model.
hon_2_null = pp.HigherOrderNetwork(toy_paths, k=2, null_model=True)
pp.visualisation.plot(hon_2_null)
# Show the null model's own transition matrix; this cell used to print
# hon_2's matrix again, which the previous cell already displays.
print(hon_2_null.transition_matrix())
hon_2_null.likelihood(toy_paths, log=False)

#%% In [6]
from scipy.stats import chi2

# Difference in degrees of freedom between the second- and first-order models.
d = hon_2.degrees_of_freedom() - hon_1.degrees_of_freedom()