# Imports inferred from usage in this excerpt; `rnd` is assumed to be the
# standard `random` module. `cop` and `GraphData` are project modules whose
# import paths are not shown here.
import os
import random as rnd


def load_data(datadir, train_datadir=None):
    if train_datadir is None:
        # No separate training directory: shuffle the files in `datadir`
        # and hold out 10% of them as the test set.
        fnames = os.listdir(datadir)
        rnd.shuffle(fnames)
        test_num = len(fnames) // 10
        test_fnames = fnames[:test_num]
        train_fnames = fnames[test_num:]
        train_datadir = datadir
        test_datadir = datadir
    else:
        test_datadir = datadir
        train_fnames = os.listdir(train_datadir)
        test_fnames = os.listdir(test_datadir)

    #if isinstance(test_fnames, str): test_fnames = file_lines(test_fnames)
    #if isinstance(train_fnames, str): train_fnames = file_lines(train_fnames)

    test_data = []
    for fname in test_fnames:
        data, lens_labels_symbols = cop.load_premsel(
            os.path.join(test_datadir, fname))
        test_data.append((GraphData(data), lens_labels_symbols))

    train_data = []
    for fname in train_fnames:
        data, lens_labels_symbols = cop.load_premsel(
            os.path.join(train_datadir, fname))
        train_data.append((GraphData(data), lens_labels_symbols))

    return test_data, train_data
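# A minimal usage sketch for load_data. The directory name below is
# hypothetical; the function only assumes that every file in the directory
# can be parsed by cop.load_premsel.
if __name__ == "__main__":
    # With a single directory, 10% of the files become the test split.
    test_data, train_data = load_data("premsel_data")
    print("train examples: %d, test examples: %d"
          % (len(train_data), len(test_data)))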
import numpy as np
from sklearn.metrics import roc_auc_score
from MELL.MELL import MELL_model
from utility import generate_train_test_data
from graph_data import GraphData

if __name__ == "__main__":
    path = 'Dataset/sample1'
    data = GraphData(path)

    test_rate = 0.2
    train_edges, test_edges = generate_train_test_data(
        data.L, data.N, data.directed, data.edges, test_rate)

    print("==========data description==========")
    print("total edge : ", len(data.edges))
    print("train edge : ", len(train_edges))
    print("test edge : ", len(test_edges))
    print("====================================")

    model = MELL_model(data.L, data.N, data.directed, train_edges,
                       128, 4, 10, 1, 1)
    model.train(500)

    y_true = np.array(test_edges)[:, 3]
    y_predict = [model.predict(t) for t in test_edges]
    # Assumed completion: evaluate the held-out edges with the ROC AUC
    # metric imported above.
    print("AUC : ", roc_auc_score(y_true, y_predict))
def __enumerate_and_label_all_possible_inputs__(self):
    # Rather than enumerating all
    #   2^(subgraph_size-choose-2 * (1 + is_directed))
    # possible naive graph options, builds them up using the node order
    # canonicalizer, adding one edge (or trait) at a time.
    if self.__graph_data__.is_directed():
        empty_graph = DirectedGraphData()
    else:
        empty_graph = GraphData()

    # To enumerate all possible inputs efficiently, we first generate the
    # possible graphs that do not contain the nodes being added, deleted,
    # or (dis)connected by a new (or old) edge. Thus we iterate over
    # graphs of up to subgraph_size - 2 nodes.
    #
    # Edge modification examples will add two more nodes: the two nodes
    # being (dis)connected.
    # Node addition cases will add two more nodes: the new node and the
    # node that the new node connects to.
    # Node deletion cases will add only one more node: the node being
    # deleted.
    partial_size = self.__edge_sst_size__ - 2
    enumeration_nodes = [i for i in range(0, partial_size)]
    for node in enumeration_nodes:
        empty_graph.add_node(node)

    directed = int(self.__graph_data__.is_directed())
    # Max number of possible edges.
    full_edges = \
        int(((partial_size * (partial_size - 1)) / 2) * (1 + directed))
    # Half of the max possible edges, rounded down.
    half_edges = int(full_edges / 2)
    take_complement_of_half = half_edges * 2 < full_edges

    partial_graph_bank = set()
    # `partial_graphs` functions both as a collection and a queue.
    partial_graphs = [empty_graph]
    next_graph_idx = 0
    partial_graphs_back_half = []
    while next_graph_idx < len(partial_graphs):
        graph = partial_graphs[next_graph_idx]
        next_graph_idx += 1

        # Check to see if the complement of the graph should also be
        # added.
        if graph.num_edges() < half_edges or take_complement_of_half:
            complement = empty_graph.copy()
            for i in range(0, partial_size):
                for j in range((i + 1) * (1 - directed), partial_size):
                    if i == j:
                        # Skip self-loops, matching the guard in the
                        # edge-addition loop below.
                        continue
                    if not graph.has_edge(i, j):
                        complement.add_edge(i, j)
            partial_graphs_back_half.append(complement)

        # Cycle through possible additions to the graph, checking to see
        # if they have yet to be found. If so, add them to the collection.
        if graph.num_edges() < half_edges:
            for i in range(0, partial_size):
                for j in range((i + 1) * (1 - directed), partial_size):
                    if i == j:
                        continue
                    if not graph.has_edge(i, j):
                        copy = graph.copy()
                        copy.add_edge(i, j)
                        canonicalizer = SubgraphChangeLabeler(copy, \
                            subgraph_size=None, precompute=False)
                        (node_order, _) = canonicalizer.\
                            __canonical_node_order__(enumeration_nodes)
                        graph_hash = canonicalizer.\
                            __subgraph_representation__(node_order)
                        if graph_hash not in partial_graph_bank:
                            partial_graph_bank.add(graph_hash)
                            partial_graphs.append(copy)

    # Reverse the order of the back half list so that graphs come paired
    # by complement (first graph complement of last, second complement of
    # second-to-last, etc.). This way the graphs are also in order of
    # increasing number of edges.
    for i in range(0, len(partial_graphs_back_half)):
        partial_graphs.append(partial_graphs_back_half[-1 * (i + 1)])

    # Now that we have acquired all the partial graphs, we add in the
    # nodes being (primarily) modified and partition the subgraphs
    # according to which kind(s) of change(s) they represent.
    # We still are not applying traits to the graphs. That happens last.
    edge_change_graphs_wo_traits = []
    node_add_graphs_wo_traits = []
    node_del_graphs_wo_traits = []

    # First we add one new node.
    new_node = partial_size
    possible_edges_first_node = [(new_node, i) for i in range(0, new_node)]
    if directed == 1:
        possible_edges_first_node += \
            [(b, a) for (a, b) in possible_edges_first_node]
    first_node_graph_bank = set()
    first_node_graphs = []
    enumeration_nodes = [i for i in range(0, partial_size + 1)]
    # Highlights are essentially initial labels that distinguish these
    # nodes from the others in the graph.
    highlights = {new_node: 0}
    for partial_graph in partial_graphs:
        edge_combos_first_node = \
            a_utils.get_all_k_tuples(2, len(possible_edges_first_node))
        # Cycle through all the ways the first node could connect to the
        # partial graphs.
        for edge_combo in edge_combos_first_node:
            copy = partial_graph.copy()
            copy.add_node(new_node)
            for edge_idx in range(0, len(edge_combo)):
                if edge_combo[edge_idx]:
                    (a, b) = possible_edges_first_node[edge_idx]
                    copy.add_edge(a, b)
            # Check to see if the graph is even new.
            canonicalizer = SubgraphChangeLabeler(copy, \
                subgraph_size=None, precompute=False)
            (node_order, _) = canonicalizer.__canonical_node_order__(\
                enumeration_nodes, highlights=highlights)
            graph_hash = canonicalizer.__subgraph_representation__(\
                node_order, highlights=highlights)
            if graph_hash not in first_node_graph_bank:
                first_node_graph_bank.add(graph_hash)
                first_node_graphs.append(copy)
                # Check to see if copy is a valid node deletion graph by
                # checking to see that the graph is connected.
                if len(g_utils.connected_components(copy)) == 1:
                    node_del_graphs_wo_traits.append(copy)
    del first_node_graph_bank

    # We now commence adding a second node to the graph.
    new_node += 1
    highlights[new_node] = 0
    # If an ordering is imposed on the nodes, give them different
    # highlights.
    if directed == 1 or self.__diff_ends__:
        highlights[new_node] = 1
    possible_edges_second_node = \
        [(new_node, i) for i in range(0, new_node)]
    if directed == 1:
        possible_edges_second_node += \
            [(b, a) for (a, b) in possible_edges_second_node]
    second_node_graph_bank = set()
    enumeration_nodes = [i for i in range(0, partial_size + 2)]
    for first_node_graph in first_node_graphs:
        edge_combos_second_node = \
            a_utils.get_all_k_tuples(2, len(possible_edges_second_node))
        # Cycle through all the ways the second node could connect to the
        # graphs built so far.
        for edge_combo in edge_combos_second_node:
            copy = first_node_graph.copy()
            copy.add_node(new_node)
            for edge_idx in range(0, len(edge_combo)):
                if edge_combo[edge_idx]:
                    (a, b) = possible_edges_second_node[edge_idx]
                    copy.add_edge(a, b)
            # Check to see if the graph is even new.
            canonicalizer = SubgraphChangeLabeler(copy, \
                subgraph_size=None, precompute=False)
            (node_order, _) = canonicalizer.__canonical_node_order__(\
                enumeration_nodes, highlights=highlights)
            graph_hash = canonicalizer.__subgraph_representation__(\
                node_order, highlights=highlights)
            if graph_hash not in second_node_graph_bank:
                second_node_graph_bank.add(graph_hash)
                connected_components = g_utils.connected_components(copy)
                num_connected_components = len(connected_components)
                # Check to see if copy is a valid node addition graph by
                # checking to see that the graph is connected and that
                # one of the highlighted nodes connects only to the other
                # highlighted node.
                #
                # If the graph is directed, ensure that the edge points
                # from (new_node - 1) to new_node so that the highlights
                # follow the source->target convention.
                if num_connected_components == 1:
                    if directed == 1 and \
                            ((copy.in_neighbors(new_node) == \
                                set([new_node - 1]) and \
                              len(copy.out_neighbors(new_node)) == 0) or \
                             (copy.out_neighbors(new_node - 1) == \
                                set([new_node]) and \
                              len(copy.in_neighbors(new_node - 1)) == 0)):
                        node_add_graphs_wo_traits.append(copy)
                    elif directed == 0 and self.__diff_ends__ and \
                            copy.neighbors(new_node) == \
                                set([new_node - 1]):
                        node_add_graphs_wo_traits.append(copy)
                    elif directed == 0 and (not self.__diff_ends__) and \
                            (copy.neighbors(new_node) == \
                                set([new_node - 1]) or
                             copy.neighbors(new_node - 1) == \
                                set([new_node])):
                        node_add_graphs_wo_traits.append(copy)
                # Check to see if copy is a valid edge modification graph.
                #
                # Note that if the graph is directed, we enforce that the
                # direction be from (new_node - 1) to new_node, but if it
                # is undirected, the has_edge() function looks for an
                # edge in either direction.
                if num_connected_components == 1 and \
                        copy.has_edge(new_node - 1, new_node):
                    edge_change_graphs_wo_traits.append(copy.copy())
    del second_node_graph_bank

    print("Number of Node Deletion Cases (W/O Traits): \t%d" %
          len(node_del_graphs_wo_traits))
    print("Number of Node Addition Cases (W/O Traits): \t%d" %
          len(node_add_graphs_wo_traits))
    print("Number of Edge Modification Cases (W/O Traits): \t%d" %
          len(edge_change_graphs_wo_traits))

    ###### Now for the node and edge traits. #######
    # Note that `highlights` acquires its value above.
    graph_lists_wo_traits = \
        [node_del_graphs_wo_traits, node_add_graphs_wo_traits, \
         edge_change_graphs_wo_traits]
    graph_lists_with_traits = [[], [], []]
    if len(self.__node_traits__) == 0 and len(self.__edge_traits__) == 0:
        # No traits to handle. Moving on.
        graph_lists_with_traits = graph_lists_wo_traits
    # Apply all possible trait combos.
    else:
        for change_type_idx in range(0, 3):
            # Reset the graph hash bank for each type of change.
            graph_trait_bank = set()
            graph_list_wo_traits = graph_lists_wo_traits[change_type_idx]
            graph_list_with_traits = \
                graph_lists_with_traits[change_type_idx]
            for graph in graph_list_wo_traits:
                nodes = [i for i in range(0, graph.num_nodes())]
                edges = []
                for i in range(0, self.__edge_sst_size__):
                    for j in range((i + 1) * (1 - directed), \
                            self.__edge_sst_size__):
                        if graph.has_edge(i, j):
                            edges.append((i, j))
                entity_sets = [nodes, edges]
                trait_sets = [self.__node_traits__, self.__edge_traits__]
                trait_values_sets = [[], []]
                for trait_set_idx in range(0, 2):
                    traits = trait_sets[trait_set_idx]
                    entities = entity_sets[trait_set_idx]
                    trait_values = trait_values_sets[trait_set_idx]
                    n_ent = len(entities)
                    for trait in traits:
                        graph.add_trait(trait[1])
                        if trait[0] == SubgraphChangeLabeler.RANK_TRAIT:
                            # It's a rank trait.
                            # If guaranteed unique, only need
                            # permutations for rankings.
                            if trait[2]:
                                trait_values.append(a_utils.\
                                    get_all_k_permutations(n_ent, n_ent))
                            # Otherwise, allow for ties.
                            else:
                                trait_values.append(\
                                    a_utils.get_all_n_rankings(n_ent))
                        else:
                            # It's a class trait.
                            options = trait[2]
                            combos = a_utils.get_all_k_tuples(\
                                len(options), n_ent)
                            trait_values.append(\
                                [[options[idx] for idx in combo] \
                                 for combo in combos])

                trait_names = [t[1] for t in self.__node_traits__] + \
                              [t[1] for t in self.__edge_traits__]
                entities_by_trait = \
                    [nodes for t in self.__node_traits__] + \
                    [edges for t in self.__edge_traits__]
                values_by_trait = \
                    trait_values_sets[0] + trait_values_sets[1]

                trait_combo_counter = \
                    [0 for i in range(0, len(entities_by_trait))]
                trait_combo_counter[-1] = -1
                digit_limits_inclusive = \
                    [len(values) - 1 for values in values_by_trait]
                while a_utils.increment_counter(trait_combo_counter, \
                        digit_limits_inclusive):
                    copy = graph.copy()
                    for trait_idx in range(0, len(trait_names)):
                        entities = entities_by_trait[trait_idx]
                        values_selection = trait_combo_counter[trait_idx]
                        values = \
                            values_by_trait[trait_idx][values_selection]
                        for ent_idx in range(0, len(entities)):
                            ent = entities[ent_idx]
                            copy[trait_names[trait_idx]][ent] = \
                                values[ent_idx]
                    # Trait values have been assigned to copy. Now check
                    # to see if this assignment is isomorphically unique.
                    canonicalizer = SubgraphChangeLabeler(copy, \
                        node_traits=self.__node_traits__, \
                        edge_traits=self.__edge_traits__, \
                        subgraph_size=None, precompute=False)
                    (node_order, _) = canonicalizer.\
                        __canonical_node_order__(nodes, highlights)
                    graph_hash = canonicalizer.\
                        __subgraph_representation__(node_order, highlights)
                    if graph_hash not in graph_trait_bank:
                        graph_trait_bank.add(graph_hash)
                        graph_list_with_traits.append(copy)
            del graph_trait_bank

    print("Number of Node Deletion Cases (With Traits): \t%d" %
          len(graph_lists_with_traits[0]))
    print("Number of Node Addition Cases (With Traits): \t%d" %
          len(graph_lists_with_traits[1]))
    print("Number of Edge Modification Cases (With Traits): \t%d" %
          len(graph_lists_with_traits[2]))

    ####### Lastly, store possible inputs. #######
    self.__DEL_NODE__ = 0
    self.__ADD_NODE__ = 1
    self.__DEL_EDGE__ = 2
    self.__ADD_EDGE__ = 3
    self.__num_labels__ = {0: 0, 1: 0, 2: 0}
    # Since we're performing a full enumeration and edge additions have
    # all the same cases as edge deletions, the dicts can be duplicated.
    self.__repr_to_label__ = {0: dict(), 1: dict(), 2: dict()}
    self.__repr_to_label__[3] = self.__repr_to_label__[2]
    self.__label_to_canonical_repr__ = {0: dict(), 1: dict(), 2: dict()}
    self.__label_to_canonical_repr__[3] = \
        self.__label_to_canonical_repr__[2]

    for change_type in range(0, 3):
        num_nodes = graph_lists_with_traits[change_type][0].num_nodes()
        nodes = [i for i in range(0, num_nodes)]
        num_highlight_nodes = 1 + int(num_nodes == self.__edge_sst_size__)
        num_other_nodes = num_nodes - num_highlight_nodes
        other_nodes_orders = a_utils.get_all_k_permutations(\
            num_other_nodes, num_other_nodes)
        other_nodes_orders = \
            [[n + num_highlight_nodes for n in order] \
             for order in other_nodes_orders]
        if directed == 1 or self.__diff_ends__:
            highlight_nodes_orders = \
                [tuple([i for i in range(0, num_highlight_nodes)])]
            highlights = {i: i for i in range(0, num_highlight_nodes)}
        else:
            highlight_nodes_orders = a_utils.get_all_k_permutations(\
                num_highlight_nodes, num_highlight_nodes)
            highlights = {i: 0 for i in range(0, num_highlight_nodes)}
        for graph in graph_lists_with_traits[change_type]:
            label = self.__num_labels__[change_type]
            self.__num_labels__[change_type] += 1
            # First create the canonical label.
            canonicalizer = SubgraphChangeLabeler(graph, \
                node_traits=self.__node_traits__, \
                edge_traits=self.__edge_traits__, \
                subgraph_size=None, precompute=False)
            (order, _) = \
                canonicalizer.__canonical_node_order__(nodes, highlights)
            graph_hash = \
                canonicalizer.__subgraph_representation__(order, \
                    highlights)
            self.__label_to_canonical_repr__[change_type][label] = \
                graph_hash
            # Then enumerate possible input orders.
            for highlight_order in highlight_nodes_orders:
                highlight_order_list = list(highlight_order)
                for other_order in other_nodes_orders:
                    node_order = highlight_order_list + other_order
                    graph_hash = \
                        canonicalizer.__subgraph_representation__(\
                            node_order, highlights)
                    self.__repr_to_label__[change_type][graph_hash] = \
                        label

    self.__num_labels__[3] = self.__num_labels__[2]
    change_type_names = ["node deletion", "node addition", \
                         "edge deletion", "edge addition"]
    for change_type in range(0, 4):
        print("Number of %s cases with possible input orders: %d" % \
            (change_type_names[change_type], \
             len(self.__repr_to_label__[change_type])))
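# The trait-assignment loop above treats `trait_combo_counter` as a
# mixed-radix odometer driven by a_utils.increment_counter. The sketch
# below shows the assumed semantics (the actual a_utils implementation may
# differ): starting from [0, ..., 0, -1], each call advances the counter by
# one and returns False once every digit has passed its inclusive limit.
def increment_counter(counter, digit_limits_inclusive):
    for idx in range(len(counter) - 1, -1, -1):
        if counter[idx] < digit_limits_inclusive[idx]:
            counter[idx] += 1
            return True
        counter[idx] = 0  # Carry into the next digit.
    return False

# Example: two traits with 2 and 3 possible value assignments enumerate
# all 6 combinations.
counter = [0, -1]
combos = []
while increment_counter(counter, [1, 2]):
    combos.append(tuple(counter))
assert combos == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]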
                if self.__graph_data__.has_edge(nodes[i], nodes[j]):
                    edges.append((nodes[i], nodes[j]))
                    local_idx_edges.append((i, j))
    else:
        for i in range(0, len(nodes)):
            for j in range(i + 1, len(nodes)):
                if self.__graph_data__.has_edge(nodes[i], nodes[j]):
                    edges.append((nodes[i], nodes[j]))
                    local_idx_edges.append((i, j))
    return (edges, local_idx_edges)


if __name__ == "__main__":
    from graph_change_modeler import GraphChangeModeler

    GD = GraphData()
    # GD = DirectedGraphData()

    # node_traits = [GraphChangeModeler.ClassTrait(X, ["A", "B", "C", "D"])
    #                for X in ["t0", "t1"]]
    # node_traits = [GraphChangeModeler.RankTrait("Pagerank",
    #                                             guaranteed_unique=False)]
    # edge_traits = [GraphChangeModeler.RankTrait("Edgerank",
    #                                             guaranteed_unique=False)]
    node_traits = [
        GraphChangeModeler.ClassTrait(
            "Change", possible_class_values=["Added", "None", "Deleted"])
    ]  # , GraphChangeModeler.RankTrait("Rank", guaranteed_unique=False)]
    edge_traits = [
        GraphChangeModeler.ClassTrait(
            "Change", possible_class_values=["Added", "None", "Deleted"])
    ]
    # node_traits = []
    # edge_traits = []

    SCL = SubgraphChangeLabeler(GD, 4, node_traits=node_traits,
                                edge_traits=edge_traits, \
class SST_SVMTemporalLinkPredictor(TemporalLinkPredictor):

    # `non_edge_multiplier` - for every true edge, sample this many false
    #   edges.
    #
    # `base_frac` - have at least this fraction of edges in the graph
    #   before computing vectors for the subsequent edges. Will make the
    #   split at a timestamp change, and thus will always have at least
    #   the first full timestamp in the base graph, even if base_frac=0.0.
    #   Also, will always have at least the last full timestamp outside
    #   the base graph, even if base_frac=1.0.
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            num_processes=8, base_frac=1.0, scale_data=False):
        self.__scale_data__ = scale_data
        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()
        for node in graph_nodes:
            self.__graph_data__.add_node(node)

        traits = TemporalLinkPredictionTraits
        # NonUpdater included for reasons explained in
        # TemporalLinkPredictionTraitUpdater's file and the
        # GraphChangeFeatureCounter file.
        #
        # In short, GCFC needs two updaters, but both the temporal traits
        # used here operate with a single updater.
        trait_updaters = [\
            TemporalLinkPredictionTraitUpdater(self.__graph_data__), \
            NonUpdater(None)]

        # Remove weight value (this class ignores it) and sort by time.
        sorted_edges = [(a, b, t) for (t, a, b) in \
            sorted([(t, a, b) for (a, b, t, w) in graph_edges])]

        # Pick the timestamp from `base_frac` of the way through the data,
        # then add all edges with a timestamp <= to it and allow the
        # traits to update accordingly. This will be the base graph. But
        # first, ensure that the base graph does not include the last
        # timestamp.
        last_timestamp = sorted_edges[-1][2]
        base_graph_timestamp_idx = min(len(sorted_edges) - 1, \
            int(len(sorted_edges) * base_frac))
        base_graph_timestamp = sorted_edges[base_graph_timestamp_idx][2]
        while base_graph_timestamp == last_timestamp:
            base_graph_timestamp_idx -= 1
            base_graph_timestamp = \
                sorted_edges[base_graph_timestamp_idx][2]

        changes = []
        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            edge_traits=traits, edge_trait_updaters=trait_updaters, \
            use_counts=True)
        for (a, b, t) in sorted_edges:
            if t > base_graph_timestamp:
                break
            changes.append(
                EdgeAddition(self.__graph_data__, a, b, timestamp=t))
        self.__GCFC__.run_changes_forward(changes)

        # Create fake edges for remaining timestamps in graph.
        curr_idx = 0
        while sorted_edges[curr_idx][2] <= base_graph_timestamp:
            curr_idx += 1
        print(("Used first %d edges for base graph. " % curr_idx) + \
              "Using remaining %d for change model." % \
              (len(sorted_edges) - curr_idx))

        start_idx = curr_idx
        curr_time = sorted_edges[curr_idx][2]
        num_nodes = len(graph_nodes)
        self.__true_dicts__ = []
        self.__non_dicts__ = []
        edges_at_curr_time = []
        end = False
        while not end:
            if curr_idx < len(sorted_edges):
                (a, b, t) = sorted_edges[curr_idx]
            else:
                end = True
            if end or t > curr_time:
                num_edges = len(edges_at_curr_time)
                num_non_edges = int((num_nodes * (num_nodes - 1)) / \
                    (2 - int(directed))) - num_edges
                target_non_edge_size = min(num_non_edges, \
                    len(edges_at_curr_time) * non_edge_multiplier)
                non_edges = non_edges_sample(graph_nodes, \
                    [(u, v) for (u, v, t) in edges_at_curr_time], \
                    directed, target_non_edge_size, \
                    with_replacement=False)
                fake_edges = [(u, v, curr_time) for (u, v) in non_edges]
                true_changes = \
                    self.__edges_to_changes__(edges_at_curr_time)
                non_changes = self.__edges_to_changes__(fake_edges)
                # Pass true changes as null changes so they don't
                # accumulate during this timestep.
                _, true_dicts, non_dicts = \
                    self.__GCFC__.get_change_counts([], true_changes, \
                        non_changes, permanently_apply_changes=False)
                # Then run the changes forward.
                self.__GCFC__.run_changes_forward(true_changes)
                # Get the edge additions specifically.
                self.__true_dicts__ += true_dicts[1]
                self.__non_dicts__ += non_dicts[1]
                curr_time = t
                edges_at_curr_time = []
            edges_at_curr_time.append((a, b, t))
            curr_idx += 1
        print("Finished training data counting.")

    def score_edges(self, edges):
        changes = self.__edges_to_changes__(edges)
        # Perform scoring in chunks to save memory.
        scores = []
        chunk = 0
        chunk_size = 12000
        done = False
        stop = 0
        while not done:
            chunk += 1
            start = stop
            if chunk * chunk_size >= len(changes):
                stop = len(changes)
                done = True
            else:
                stop = chunk * chunk_size
            changes_to_score = changes[start:stop]
            scores += self.score_changes(changes_to_score)
            gc.collect()
            print("    Scored chunk %d." % chunk)
        return scores

    def score_changes(self, changes):
        # Pass as null_changes so that self's graph_data doesn't change.
        print("    Getting Changes' SST Vectors...")
        _, counts_dicts, _ = self.__GCFC__.get_change_counts([], changes, \
            [], permanently_apply_changes=False, allow_new_SSTs=False)
        print("    Scoring...")
        # Convert in place to save space.
        if self.__scale_data__:
            self.__scale_dicts__(counts_dicts[1])
        counts_vectors = self.__dicts_to_sparse_matrix__(counts_dicts[1])
        return self.score_vectors(counts_vectors)

    def score_vectors(self, count_vectors):
        return list(self.__linear_svm__.decision_function(count_vectors))

    # Returns the unit direction vector with components sorted in order of
    # largest magnitude to least, coupled with a representation of the
    # subgraph changes associated with each component.
    #
    # Format: List of (vector component, representative subgraph change)
    # tuples.
    def get_interpretable_model(self):
        # Extract interpretable features.
        direction_vector = self.__linear_svm__.coef_[0]
        norm = math.sqrt(sum([c * c for c in direction_vector]))
        direction_vector = [c / norm for c in direction_vector]
        sst_labeler = self.__GCFC__.get_subgraph_change_labeler()
        ssts = [sst_labeler.get_representative_subgraph_change_from_label(\
                    i, GraphChange.EDGE_ADDITION) \
                for i in range(0, len(direction_vector))]
        dv_sorted = [(abs(direction_vector[i]), direction_vector[i], i) \
                     for i in range(0, len(direction_vector))]
        dv_sorted.sort(reverse=True)
        return [(dv_sorted[i][1], ssts[dv_sorted[i][2]]) \
                for i in range(0, len(ssts))]

    # Allows python to pickle the predictor.
    #
    # Once the predictor is used to make a prediction, this method will
    # need to be called again in order for pickling to work.
    def become_serializeable(self):
        self.__GCFC__.del_worker_pool()

    def fit(self):
        self.__num_labels__ = self.__GCFC__.get_max_seen_labels()[1] + 1
        # Save space with a sparse row matrix.
        # Construct while deleting dicts so it's effectively in place.
        num_true = len(self.__true_dicts__)
        num_non = len(self.__non_dicts__)
        all_dicts = self.__true_dicts__
        for i in range(0, num_non):
            all_dicts.append(self.__non_dicts__.pop())
        if self.__scale_data__:
            self.__feature_maxs__ = \
                [1.0 for i in range(0, self.__num_labels__)]
            for d in all_dicts:
                for label, count in d.items():
                    if float(count) > self.__feature_maxs__[label]:
                        self.__feature_maxs__[label] = float(count)
            self.__scale_dicts__(all_dicts)
        data_matrix = self.__dicts_to_sparse_matrix__(all_dicts)
        self.__true_dicts__ = None
        self.__non_dicts__ = None
        self.__linear_svm__ = LinearSVC(class_weight='balanced',
                                        max_iter=400000)
        # Non labels come first because __dicts_to_sparse_matrix__
        # reverses the row order.
        labels = [0 for i in range(0, num_non)] + \
                 [1 for i in range(0, num_true)]
        print("    Now fitting SVM...")
        self.__linear_svm__.fit(data_matrix, labels)
        data_matrix = None
        gc.collect()
        print("    SVM fit successfully.")

    def __del__(self):
        del self.__GCFC__

    # Destroys dicts in the process.
    def __dicts_to_sparse_matrix__(self, dicts):
        data = []
        row_idxs = []
        col_idxs = []
        size = len(dicts)
        for row in range(0, size):
            row_dict = dicts.pop()
            for col, value in row_dict.items():
                if col >= self.__num_labels__:
                    continue
                data.append(value)
                row_idxs.append(row)
                col_idxs.append(col)
        return csr_matrix((data, (row_idxs, col_idxs)), \
                          shape=(size, self.__num_labels__))

    def __edges_to_changes__(self, edges):
        changes = []
        for (a, b, t) in edges:
            changes.append(
                EdgeAddition(self.__graph_data__, a, b, timestamp=t))
        return changes

    def __scale_dicts__(self, dicts):
        for d in dicts:
            vals = [(label, float(count)) for (label, count) in d.items()]
            for (label, count) in vals:
                if label < self.__num_labels__:
                    d[label] = count / self.__feature_maxs__[label]
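# A hypothetical end-to-end sketch of how this class appears intended to
# be used. The node IDs and (source, target, timestamp, weight) edges
# below are made up; constructor arguments mirror the defaults above.
if __name__ == "__main__":
    nodes = [0, 1, 2, 3, 4]
    edges = [(0, 1, 0, 1.0), (1, 2, 0, 1.0), (2, 3, 1, 1.0),
             (0, 2, 1, 1.0), (3, 4, 2, 1.0), (1, 3, 2, 1.0)]
    predictor = SST_SVMTemporalLinkPredictor(nodes, edges,
                                             directed=False,
                                             subgraph_size=4,
                                             base_frac=0.5)
    predictor.fit()
    # Score candidate (source, target, timestamp) edges; higher decision
    # values indicate edges the SVM considers more likely.
    print(predictor.score_edges([(0, 4, 3), (2, 4, 3)]))
    # Inspect which subgraph-change features drive the model.
    for coef, sst in predictor.get_interpretable_model()[:5]:
        print(coef, sst)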
class SST_SVMLinkPredictor(StaticLinkPredictor):

    # `prediction_dist_cap` -- used to indicate that the predictor will
    #   only be used to make predictions about connecting pairs of nodes
    #   at most a distance of `prediction_dist_cap` away. A value of None
    #   indicates no limit.
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            prediction_dist_cap=None, \
            num_processes=8, scale_data=False):
        self.__scale_data__ = scale_data
        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()
        for node in graph_nodes:
            self.__graph_data__.add_node(node)
        for (a, b) in graph_edges:  # remaining_edges
            self.__graph_data__.add_edge(a, b)

        num_nodes = len(graph_nodes)
        num_edges = len(graph_edges)
        num_non_edges = int((num_nodes * (num_nodes - 1)) / \
            (2 - int(directed))) - num_edges
        if prediction_dist_cap is None:
            true_edges = graph_edges
            target_non_edge_size = min(num_non_edges, \
                len(true_edges) * non_edge_multiplier)
            non_edges = non_edges_sample(graph_nodes, graph_edges,
                                         directed, target_non_edge_size,
                                         with_replacement=False)
        else:
            k = prediction_dist_cap
            true_edges = \
                all_connected_node_pairs_that_would_be_within_k_if_disconnected(\
                    self.__graph_data__, k)
            possible_edges = \
                all_disconnected_node_pairs_within_k(\
                    self.__graph_data__, k)
            target_non_edge_size = min(len(possible_edges), \
                len(true_edges) * non_edge_multiplier)
            non_edges = set(random.sample(possible_edges, \
                target_non_edge_size))

        print("Training on %d true edges (%f percent of all graph edges)" % \
            (len(true_edges), \
             (100.0 * len(true_edges)) / len(graph_edges)))
        print("Training on %d non-edges (%f percent of all non edges)." % \
            (len(non_edges), (100.0 * len(non_edges)) / num_non_edges))

        true_changes = self.__edges_to_changes__(true_edges)
        non_changes = self.__edges_to_changes__(non_edges)

        if self.__graph_data__.is_directed():
            node_traits = []
            node_trait_updaters = []
        else:
            node_traits = [InvolvedNodeDegreeTrait()]
            node_trait_updaters = \
                [InvolvedNodeDegreeTraitUpdater(self.__graph_data__)]

        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            node_traits=node_traits, \
            node_trait_updaters=node_trait_updaters, \
            use_counts=True)
        self.__true_dicts__, _, self.__non_dicts__ = \
            self.__GCFC__.get_change_counts(true_changes, [], non_changes)
        # Get the edge additions specifically.
        self.__true_dicts__ = self.__true_dicts__[1]
        self.__non_dicts__ = self.__non_dicts__[1]
        print("Finished training data counting.")

    def score_edges(self, edges):
        changes = self.__edges_to_changes__(edges)
        # Perform scoring in chunks to save memory.
        scores = []
        chunk = 0
        chunk_size = 6000
        done = False
        stop = 0
        while not done:
            chunk += 1
            start = stop
            if chunk * chunk_size >= len(changes):
                stop = len(changes)
                done = True
            else:
                stop = chunk * chunk_size
            changes_to_score = changes[start:stop]
            scores += self.score_changes(changes_to_score)
            gc.collect()
            print("    Scored chunk %d." % chunk)
        return scores

    def score_changes(self, changes):
        # Pass as null_changes so that self's graph_data doesn't change.
        print("    Getting Changes' SST Vectors...")
        _, counts_dicts, _ = self.__GCFC__.get_change_counts([], changes, \
            [], permanently_apply_changes=False, allow_new_SSTs=False)
        print("    Scoring...")
        # Convert in place to save space.
        if self.__scale_data__:
            self.__scale_dicts__(counts_dicts[1])
        counts_vectors = self.__dicts_to_sparse_matrix__(counts_dicts[1])
        return self.score_vectors(counts_vectors)

    def score_vectors(self, count_vectors):
        return list(self.__linear_svm__.decision_function(count_vectors))

    # def graph(self):
    #     return self.__graph_data__

    # Returns the unit direction vector with components sorted in order of
    # largest magnitude to least, coupled with a representation of the
    # subgraph changes associated with each component.
    #
    # Format: List of (vector component, representative subgraph change)
    # tuples.
    def get_interpretable_model(self):
        # Extract interpretable features.
        direction_vector = self.__linear_svm__.coef_[0]
        norm = math.sqrt(sum([c * c for c in direction_vector]))
        direction_vector = [c / norm for c in direction_vector]
        sst_labeler = self.__GCFC__.get_subgraph_change_labeler()
        ssts = [sst_labeler.get_representative_subgraph_change_from_label(\
                    i, GraphChange.EDGE_ADDITION) \
                for i in range(0, len(direction_vector))]
        dv_sorted = [(abs(direction_vector[i]), direction_vector[i], i) \
                     for i in range(0, len(direction_vector))]
        dv_sorted.sort(reverse=True)
        return [(dv_sorted[i][1], ssts[dv_sorted[i][2]]) \
                for i in range(0, len(ssts))]

    # Allows python to pickle the predictor.
    #
    # Once the predictor is used to make a prediction, this method will
    # need to be called again in order for pickling to work.
    def become_serializeable(self):
        self.__GCFC__.del_worker_pool()

    def __del__(self):
        del self.__GCFC__

    def fit(self):
        self.__num_labels__ = self.__GCFC__.get_max_seen_labels()[1] + 1
        # Save space with a sparse row matrix.
        # Construct while deleting dicts so it's effectively in place.
        num_true = len(self.__true_dicts__)
        num_non = len(self.__non_dicts__)
        all_dicts = self.__true_dicts__
        for i in range(0, num_non):
            all_dicts.append(self.__non_dicts__.pop())
        if self.__scale_data__:
            self.__feature_maxs__ = \
                [1.0 for i in range(0, self.__num_labels__)]
            for d in all_dicts:
                for label, count in d.items():
                    if float(count) > self.__feature_maxs__[label]:
                        self.__feature_maxs__[label] = float(count)
            self.__scale_dicts__(all_dicts)
        data_matrix = self.__dicts_to_sparse_matrix__(all_dicts)
        self.__true_dicts__ = None
        self.__non_dicts__ = None
        self.__linear_svm__ = LinearSVC(class_weight='balanced',
                                        max_iter=400000)
        # Non labels come first because __dicts_to_sparse_matrix__
        # reverses the row order.
        labels = [0 for i in range(0, num_non)] + \
                 [1 for i in range(0, num_true)]
        print("    Now fitting SVM...")
        self.__linear_svm__.fit(data_matrix, labels)
        data_matrix = None
        gc.collect()
        print("    SVM fit successfully.")

    # Destroys dicts in the process.
    def __dicts_to_sparse_matrix__(self, dicts):
        data = []
        row_idxs = []
        col_idxs = []
        size = len(dicts)
        for row in range(0, size):
            row_dict = dicts.pop()
            for col, value in row_dict.items():
                if col >= self.__num_labels__:
                    continue
                data.append(value)
                row_idxs.append(row)
                col_idxs.append(col)
        return csr_matrix((data, (row_idxs, col_idxs)), \
                          shape=(size, self.__num_labels__))

    def __edges_to_changes__(self, edges):
        changes = []
        for (a, b) in edges:
            changes.append(EdgeAddition(self.__graph_data__, a, b))
        return changes

    def __scale_dicts__(self, dicts):
        for d in dicts:
            vals = [(label, float(count)) for (label, count) in d.items()]
            for (label, count) in vals:
                if label < self.__num_labels__:
                    d[label] = count / self.__feature_maxs__[label]
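# A hypothetical usage sketch for the static predictor. The toy graph
# below is made up; in practice the node and edge lists come from the
# dataset being modeled.
if __name__ == "__main__":
    nodes = [0, 1, 2, 3, 4]
    edges = [(0, 1), (1, 2), (2, 3), (0, 2), (3, 4)]
    predictor = SST_SVMLinkPredictor(nodes, edges, directed=False,
                                     subgraph_size=4,
                                     prediction_dist_cap=2)
    predictor.fit()
    # Higher scores indicate node pairs the model considers more likely
    # to be connected.
    print(predictor.score_edges([(0, 3), (1, 4)]))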
max_seq_len_placeholder = tf.placeholder(tf.int32)
inputs_placeholder = tf.placeholder(
    tf.float32, shape=(args.batch_size, None, 2 * args.max_seq_len + 1))
outputs_placeholder = tf.placeholder(
    tf.float32, shape=(args.batch_size, None, args.max_seq_len))
model = BuildTModel(max_seq_len_placeholder, inputs_placeholder,
                    outputs_placeholder)
initializer = tf.global_variables_initializer()

# Training.
convergence_on_target_task = None
convergence_on_multi_task = None
performance_on_target_task = None
performance_on_multi_task = None
generalization_from_target_task = None
generalization_from_multi_task = None

data_generator = GraphData()
target_point = args.max_seq_len
curriculum_point = 1 if args.curriculum not in ('prediction_gain', 'none') \
    else target_point
progress_error = 1.0
convergence_error = 0.1

sess = tf.Session()
sess.run(initializer)

pickle.dump({target_point: []}, open(HEAD_LOG_FILE, "wb"))
pickle.dump({}, open(GENERALIZATION_HEAD_LOG_FILE, "wb"))


def run_eval(batches, store_heat_maps=False, generalization_num=None):
    task_loss = 0
    task_error = 0
# Read the data from the file into the data_array list.
with open(input_data_path) as file_object:
    for line in file_object:
        data_array.append(line.replace('\n', '').split(' '))

# Record the start time.
start_time = datetime.datetime.now()

# Count the occurrences of every edge and node label, and load the data
# from data_array into data_graph.
i = 0
while i < label_max:
    freq_edge_label.append(0)
    freq_node_label.append(0)
    i += 1

gd = GraphData()
for array in data_array:
    if array[0] == input_new_graph:
        # If gd already holds data, add it to total_graph_data.
        if gd.get_node_labels():
            total_graph_data.append(gd)
        gd = GraphData()
    elif array[0] == input_vertice:
        if not array[2] in gd.get_node_labels():
            freq_node_label[int(array[2])] += 1
        gd.get_node_labels().append(array[2])
        gd.get_node_visibles().append(True)
    elif array[0] == input_edge:
        if not array[3] in gd.get_edge_labels():
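# The parser above appears to expect a gSpan-style graph file, where
# input_new_graph, input_vertice, and input_edge would be the line-type
# tokens (commonly "t", "v", and "e"). A hypothetical input illustrating
# the assumed layout:
#
#   t # 0
#   v 0 1
#   v 1 2
#   e 0 1 3
#   t # 1
#   ...
#
# "v <id> <label>" puts the node label at index 2 and
# "e <src> <dst> <label>" puts the edge label at index 3, matching the
# array[2] and array[3] accesses above.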
def add_empty(x_complex, available_classes):
    # Append one empty GraphData instance per available class.
    for clazz in available_classes:
        x_complex.append(GraphData(clazz))