Example #1
import os
import random as rnd  # assumption: `rnd` aliases the standard random module
# `cop` (the premise-selection loader) and GraphData come from the
#   surrounding project and are not shown in this snippet.

def load_data(datadir, train_datadir=None):
    if train_datadir is None:
        fnames = os.listdir(datadir)
        rnd.shuffle(fnames)
        test_num = len(fnames) // 10
        test_fnames = fnames[:test_num]
        train_fnames = fnames[test_num:]
        train_datadir = datadir
        test_datadir = datadir
    else:
        test_datadir = datadir
        train_fnames = os.listdir(train_datadir)
        test_fnames = os.listdir(test_datadir)
    #if isinstance(test_fnames, str): test_fnames = file_lines(test_fnames)
    #if isinstance(train_fnames, str): train_fnames = file_lines(train_fnames)

    test_data = []
    for fname in test_fnames:
        data, lens_labels_symbols = cop.load_premsel(
            os.path.join(test_datadir, fname))
        test_data.append((GraphData(data), lens_labels_symbols))

    train_data = []
    for fname in train_fnames:
        data, lens_labels_symbols = cop.load_premsel(
            os.path.join(train_datadir, fname))
        train_data.append((GraphData(data), lens_labels_symbols))

    return test_data, train_data
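A short usage sketch for the loader above; the directory path is hypothetical, and the files under it are assumed to be premise-selection problems that cop.load_premsel can parse:

if __name__ == "__main__":
    # Hypothetical data directory; with no train_datadir given, load_data
    # shuffles the files and holds out 10% of them as the test split.
    test_data, train_data = load_data("data/premsel_problems")
    print("train problems : ", len(train_data))
    print("test  problems : ", len(test_data))
    graph, lens_labels_symbols = train_data[0]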
Example #2
import numpy as np
from sklearn.metrics import roc_auc_score

from MELL.MELL import MELL_model
from utility import generate_train_test_data
from graph_data import GraphData

if __name__ == "__main__":

    path = 'Dataset/sample1'
    data = GraphData(path)

    test_rate = 0.2

    train_edges, test_edges = generate_train_test_data(data.L, data.N,
                                                       data.directed,
                                                       data.edges, test_rate)

    print("==========data description==========")
    print("total edge : ", len(data.edges))
    print("train edge : ", len(train_edges))
    print("test  edge : ", len(test_edges))
    print("====================================")

    model = MELL_model(data.L, data.N, data.directed, train_edges, 128, 4, 10,
                       1, 1)
    model.train(500)

    y_true = np.array(test_edges)[:, 3]
    y_predict = [model.predict(t) for t in test_edges]
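    # The snippet ends before the imported roc_auc_score is used; presumably
    # the original script closes by reporting AUC along these lines (assuming
    # model.predict returns one score per test edge):
    auc = roc_auc_score(y_true, y_predict)
    print("==========evaluation==========")
    print("AUC : ", auc)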
Example #3
    def __enumerate_and_label_all_possible_inputs__(self):
        # Rather than enumerating all
        #   2^(subgraph_size-choose-2 * (1 + is_directed))
        #   possible naive graph options, builds them up using the
        #   node order canonicalizer, adding one edge (or trait) at a time.
        if self.__graph_data__.is_directed():
            empty_graph = DirectedGraphData()
        else:
            empty_graph = GraphData()

        # To enumerate all possible inputs efficiently, we first generate the
        #   possible graphs that do not contain the nodes being added, deleted,
        #   or (dis)connected by a new (or old) edge. Thus we iterate over
        #   graphs of up to subgraph_size - 2 nodes.
        #
        # Edge modification examples will add two more nodes: the two nodes
        #   being (dis)connected.
        # Node addition cases will add two more nodes: the new node and the node
        #   that the new node connects to.
        # Node deletion cases will add only one more node: the node being
        #   deleted.

        partial_size = self.__edge_sst_size__ - 2
        enumeration_nodes = [i for i in range(0, partial_size)]

        for node in enumeration_nodes:
            empty_graph.add_node(node)

        directed = int(self.__graph_data__.is_directed())

        # Max number of possible edges.
        full_edges = \
            int(((partial_size * (partial_size - 1)) / 2) * (1 + directed))
        # Half of the max possible edges, rounded down.
        half_edges = int(full_edges / 2)
        take_complement_of_half = half_edges * 2 < full_edges

        partial_graph_bank = set()

        # `partial_graphs` functions both as a collection and a queue.
        partial_graphs = [empty_graph]
        next_graph_idx = 0
        partial_graphs_back_half = []

        while next_graph_idx < len(partial_graphs):
            graph = partial_graphs[next_graph_idx]
            next_graph_idx += 1

            # Check to see if the complement of the graph should also be added.
            if graph.num_edges() < half_edges or take_complement_of_half:
                complement = empty_graph.copy()
                for i in range(0, partial_size):
                    for j in range((i + 1) * (1 - directed), partial_size):
                        # Skip i == j so no self-loop is added in the
                        #   directed case (the range then starts at 0).
                        if i == j:
                            continue
                        if not graph.has_edge(i, j):
                            complement.add_edge(i, j)
                partial_graphs_back_half.append(complement)

            # Cycle through possible additions to the graph, checking to see if
            #   they have yet to be found. If so, add them to the collection.
            if graph.num_edges() < half_edges:
                for i in range(0, partial_size):
                    for j in range((i + 1) * (1 - directed), partial_size):
                        if i == j:
                            continue
                        if not graph.has_edge(i, j):
                            copy = graph.copy()
                            copy.add_edge(i, j)
                            canonicalizer = SubgraphChangeLabeler(copy, \
                                subgraph_size=None, precompute=False)
                            (node_order, _) = canonicalizer.\
                                __canonical_node_order__(enumeration_nodes)
                            graph_hash = canonicalizer.\
                                __subgraph_representation__(node_order)
                            if graph_hash not in partial_graph_bank:
                                partial_graph_bank.add(graph_hash)
                                partial_graphs.append(copy)

        for i in range(0, len(partial_graphs_back_half)):
            # Reverse the order of the back half list so that graphs come paired
            #   by complement (first graph complement of last, second complement
            #   of second-to-last, etc.). This way the graphs are also in
            #   order of increasing number of edges.
            partial_graphs.append(partial_graphs_back_half[-1 * (i + 1)])

        # Now that we have acquired all the partial graphs, we add in the nodes
        #   being (primarily) modified and partition the subgraphs according to
        #   which kind(s) of change(s) they represent.
        # We still are not applying traits to the graphs. That happens last.
        edge_change_graphs_wo_traits = []
        node_add_graphs_wo_traits = []
        node_del_graphs_wo_traits = []

        # First we add one new node.
        new_node = partial_size

        possible_edges_first_node = [(new_node, i) for i in range(0, new_node)]
        if directed == 1:
            possible_edges_first_node += \
                [(b, a) for (a, b) in possible_edges_first_node]

        first_node_graph_bank = set()
        first_node_graphs = []
        enumeration_nodes = [i for i in range(0, partial_size + 1)]

        # Highlights are essentially initial labels that distinguish these nodes
        #   from the others in the graph.
        highlights = {new_node: 0}

        for partial_graph in partial_graphs:

            edge_combos_first_node = \
                a_utils.get_all_k_tuples(2, len(possible_edges_first_node))

            # Cycle through all the ways the first node could connect to the
            #   partial graphs.
            for edge_combo in edge_combos_first_node:
                copy = partial_graph.copy()
                copy.add_node(new_node)
                for edge_idx in range(0, len(edge_combo)):
                    if edge_combo[edge_idx]:
                        (a, b) = possible_edges_first_node[edge_idx]
                        copy.add_edge(a, b)

                # Check to see if the graph is even new.
                canonicalizer = SubgraphChangeLabeler(copy, subgraph_size=None,\
                    precompute=False)
                (node_order, _) = canonicalizer.__canonical_node_order__(\
                    enumeration_nodes, highlights=highlights)
                graph_hash = canonicalizer.__subgraph_representation__(\
                    node_order, highlights=highlights)
                if graph_hash not in first_node_graph_bank:
                    first_node_graph_bank.add(graph_hash)
                    first_node_graphs.append(copy)

                    # Check to see if copy is a valid node deletion graph by
                    #   checking to see that the graph is connected.
                    if len(g_utils.connected_components(copy)) == 1:
                        node_del_graphs_wo_traits.append(copy)

        del first_node_graph_bank

        # We now commence adding in a second node to the graph.
        new_node += 1
        highlights[new_node] = 0

        # If an ordering is imposed on the nodes, give them different highlights.
        if directed == 1 or self.__diff_ends__:
            highlights[new_node] = 1

        possible_edges_second_node = [(new_node, i)
                                      for i in range(0, new_node)]
        if directed == 1:
            possible_edges_second_node += \
                [(b, a) for (a, b) in possible_edges_second_node]

        second_node_graph_bank = set()
        enumeration_nodes = [i for i in range(0, partial_size + 2)]

        for first_node_graph in first_node_graphs:

            edge_combos_second_node = \
                a_utils.get_all_k_tuples(2, len(possible_edges_second_node))

            # Cycle through all the ways the second node could connect to the
            #   graphs built so far.
            for edge_combo in edge_combos_second_node:
                copy = first_node_graph.copy()
                copy.add_node(new_node)
                for edge_idx in range(0, len(edge_combo)):
                    if edge_combo[edge_idx]:
                        (a, b) = possible_edges_second_node[edge_idx]
                        copy.add_edge(a, b)

                # Check to see if the graph is even new.
                canonicalizer = SubgraphChangeLabeler(copy, subgraph_size=None,\
                    precompute=False)
                (node_order, _) = canonicalizer.__canonical_node_order__(\
                    enumeration_nodes, highlights=highlights)
                graph_hash = canonicalizer.__subgraph_representation__(\
                    node_order, highlights=highlights)
                if graph_hash not in second_node_graph_bank:
                    second_node_graph_bank.add(graph_hash)

                    connected_components = g_utils.connected_components(copy)
                    num_connected_components = len(connected_components)

                    # Check to see if copy is a valid node addition graph by
                    #   checking to see that the graph is connected and that
                    #   one of the highlighted nodes connects only to the other
                    #   highlighted node.
                    #
                    # If the graph is directed, ensure that the edge points from
                    #   (new_node - 1) to new_node so that the highlights follow
                    #   the source->target convention.
                    if num_connected_components == 1:

                        if directed == 1 and \
                           ((copy.in_neighbors(new_node)==set([new_node-1]) and\
                                len(copy.out_neighbors(new_node)) == 0) or \
                           (copy.out_neighbors(new_node-1)==set([new_node]) and\
                                len(copy.in_neighbors(new_node - 1)) == 0)):
                            node_add_graphs_wo_traits.append(copy)

                        elif directed == 0 and self.__diff_ends__ and \
                                copy.neighbors(new_node) == set([new_node-1]):
                            node_add_graphs_wo_traits.append(copy)

                        elif directed == 0 and (not self.__diff_ends__) and \
                               (copy.neighbors(new_node) == set([new_node-1]) or
                               copy.neighbors(new_node - 1) == set([new_node])):
                            node_add_graphs_wo_traits.append(copy)

                    # Check to see if copy is a valid edge modification graph.
                    #
                    # Note that if the graph is directed, we enforce that the
                    #   direction be from (new_node - 1) to new_node, but if it
                    #   is undirected, the has_edge() function looks for an edge
                    #   in either direction.
                    if num_connected_components == 1 and \
                            (copy.has_edge(new_node - 1, new_node)):
                        edge_change_graphs_wo_traits.append(copy.copy())

        del second_node_graph_bank

        print("Number of Node Deletion Cases (W/O Traits): \t%d" %
              len(node_del_graphs_wo_traits))
        print("Number of Node Addition Cases (W/O Traits): \t%d" %
              len(node_add_graphs_wo_traits))
        print("Number of Edge Modification Cases (W/O Traits): \t%d" %
              len(edge_change_graphs_wo_traits))

        ###### Now for the node and edge traits. #######

        # Note that `highlights` acquires its value above.

        graph_lists_wo_traits = \
            [node_del_graphs_wo_traits, node_add_graphs_wo_traits, \
             edge_change_graphs_wo_traits]
        graph_lists_with_traits = [[], [], []]

        if len(self.__node_traits__) == 0 and len(self.__edge_traits__) == 0:
            # No traits to handle. Moving on.
            graph_lists_with_traits = graph_lists_wo_traits

        # Apply all possible trait combos.
        else:
            for change_type_idx in range(0, 3):
                # Reset the graph hash bank for each type of change.
                graph_trait_bank = set()

                graph_list_wo_traits = graph_lists_wo_traits[change_type_idx]
                graph_list_with_traits = graph_lists_with_traits[
                    change_type_idx]

                for graph in graph_list_wo_traits:

                    nodes = [i for i in range(0, graph.num_nodes())]
                    edges = []
                    for i in range(0, self.__edge_sst_size__):
                        for j in range((i + 1) * (1 - directed), \
                                self.__edge_sst_size__):
                            if graph.has_edge(i, j):
                                edges.append((i, j))

                    entity_sets = [nodes, edges]
                    trait_sets = [self.__node_traits__, self.__edge_traits__]
                    trait_values_sets = [[], []]
                    for trait_set_idx in range(0, 2):
                        traits = trait_sets[trait_set_idx]
                        entities = entity_sets[trait_set_idx]
                        trait_values = trait_values_sets[trait_set_idx]
                        n_ent = len(entities)

                        for trait in traits:
                            graph.add_trait(trait[1])

                            if trait[0] == SubgraphChangeLabeler.RANK_TRAIT:
                                # It's a rank trait.
                                # If guaranteed unique, only need permutations
                                #   for rankings.
                                if trait[2]:
                                    trait_values.append(a_utils.\
                                        get_all_k_permutations(n_ent, n_ent))
                                # Otherwise, allow for ties.
                                else:
                                    trait_values.append(\
                                        a_utils.get_all_n_rankings(n_ent))
                            else:
                                # It's a class trait.
                                options = trait[2]
                                combos = a_utils.get_all_k_tuples(\
                                        len(options), n_ent)
                                trait_values.append(\
                                    [[options[idx] for idx in combo] \
                                        for combo in combos])

                    trait_names = [t[1] for t in self.__node_traits__] + \
                        [t[1] for t in self.__edge_traits__]
                    entities_by_trait = [nodes for t in self.__node_traits__] +\
                        [edges for t in self.__edge_traits__]
                    values_by_trait = trait_values_sets[0] + trait_values_sets[
                        1]

                    trait_combo_counter = \
                        [0 for i in range(0, len(entities_by_trait))]
                    trait_combo_counter[-1] = -1

                    digit_limits_inclusive = \
                        [len(values) - 1 for values in values_by_trait]

                    while a_utils.increment_counter(trait_combo_counter, \
                            digit_limits_inclusive):

                        copy = graph.copy()
                        for trait_idx in range(0, len(trait_names)):
                            entities = entities_by_trait[trait_idx]
                            values_selection = trait_combo_counter[trait_idx]
                            values = values_by_trait[trait_idx][
                                values_selection]
                            for ent_idx in range(0, len(entities)):
                                ent = entities[ent_idx]
                                copy[trait_names[trait_idx]][ent] = \
                                    values[ent_idx]

                        # Trait values have been assigned to copy. Now check to
                        #   see if this assignment is isomorphically unique.
                        canonicalizer = SubgraphChangeLabeler(copy, \
                            node_traits=self.__node_traits__, \
                            edge_traits=self.__edge_traits__, \
                            subgraph_size=None, precompute=False)
                        (node_order, _) = canonicalizer.\
                            __canonical_node_order__(nodes, highlights)
                        graph_hash = canonicalizer.\
                            __subgraph_representation__(node_order, highlights)
                        if graph_hash not in graph_trait_bank:
                            graph_trait_bank.add(graph_hash)
                            graph_list_with_traits.append(copy)

            del graph_trait_bank

        print("Number of Node Deletion Cases (With Traits): \t%d" %
              len(graph_lists_with_traits[0]))
        print("Number of Node Addition Cases (With Traits): \t%d" %
              len(graph_lists_with_traits[1]))
        print("Number of Edge Modification Cases (With Traits): \t%d" %
              len(graph_lists_with_traits[2]))

        ####### Lastly, store possible inputs. #######

        self.__DEL_NODE__ = 0
        self.__ADD_NODE__ = 1
        self.__DEL_EDGE__ = 2
        self.__ADD_EDGE__ = 3
        self.__num_labels__ = {0: 0, 1: 0, 2: 0}
        # Since we're performing a full enumeration and edge additions have all
        #   the same cases as edge deletions, the dicts can be duplicated.
        self.__repr_to_label__ = {0: dict(), 1: dict(), 2: dict()}
        self.__repr_to_label__[3] = self.__repr_to_label__[2]
        self.__label_to_canonical_repr__ = {0: dict(), 1: dict(), 2: dict()}
        self.__label_to_canonical_repr__[3] = self.__label_to_canonical_repr__[
            2]

        for change_type in range(0, 3):
            num_nodes = graph_lists_with_traits[change_type][0].num_nodes()
            nodes = [i for i in range(0, num_nodes)]

            num_highlight_nodes = 1 + int(num_nodes == self.__edge_sst_size__)
            num_other_nodes = num_nodes - num_highlight_nodes

            other_nodes_orders = \
                a_utils.get_all_k_permutations(num_other_nodes, num_other_nodes)
            other_nodes_orders = [[n + num_highlight_nodes for n in order] for \
                order in other_nodes_orders]

            if directed == 1 or self.__diff_ends__:
                highlight_nodes_orders = \
                    [tuple([i for i in range(0, num_highlight_nodes)])]
                highlights = {i: i for i in range(0, num_highlight_nodes)}
            else:
                highlight_nodes_orders = a_utils.get_all_k_permutations(\
                    num_highlight_nodes, num_highlight_nodes)
                highlights = {i: 0 for i in range(0, num_highlight_nodes)}

            for graph in graph_lists_with_traits[change_type]:

                label = self.__num_labels__[change_type]
                self.__num_labels__[change_type] += 1

                # First create the canonical label.
                canonicalizer = SubgraphChangeLabeler(graph, \
                    node_traits=self.__node_traits__, \
                    edge_traits=self.__edge_traits__, \
                    subgraph_size=None, precompute=False)
                (order, _) = \
                    canonicalizer.__canonical_node_order__(nodes, highlights)
                graph_hash = \
                    canonicalizer.__subgraph_representation__(order, highlights)
                self.__label_to_canonical_repr__[change_type][
                    label] = graph_hash

                # Then enumerate possible input orders.
                for highlight_order in highlight_nodes_orders:
                    highlight_order_list = list(highlight_order)
                    for other_order in other_nodes_orders:
                        node_order = highlight_order_list + other_order
                        graph_hash = canonicalizer.__subgraph_representation__(\
                            node_order, highlights)
                        self.__repr_to_label__[change_type][graph_hash] = label

        self.__num_labels__[3] = self.__num_labels__[2]

        change_type_names = ["node deletion", "node addition", \
                             "edge deletion", "edge addition"]
        for change_type in range(0, 4):
            print("Number %s cases with possible input orders: %d" % \
                (change_type_names[change_type], \
                 len(self.__repr_to_label__[change_type])))
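The enumeration above leans on small combinatorial helpers from a_utils whose implementations are not shown. Judging only from how they are called, minimal stand-ins might look like the following sketch (not the project's actual code):

import itertools

def get_all_k_tuples(k, n):
    # Every length-n tuple over {0, ..., k-1}. With k=2 the tuples act as
    # inclusion masks over a list of candidate edges.
    return list(itertools.product(range(k), repeat=n))

def get_all_k_permutations(k, n):
    # Every ordering of k items drawn from {0, ..., n-1}.
    return list(itertools.permutations(range(n), k))

def increment_counter(counter, digit_limits_inclusive):
    # Odometer-style, in-place increment of a mixed-radix counter.
    # Returns False once every digit has rolled over past its limit.
    for i in range(len(counter) - 1, -1, -1):
        if counter[i] < digit_limits_inclusive[i]:
            counter[i] += 1
            return True
        counter[i] = 0
    return False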
Example #4
                    if self.__graph_data__.has_edge(nodes[i], nodes[j]):
                        edges.append((nodes[i], nodes[j]))
                        local_idx_edges.append((i, j))
        else:
            for i in range(0, len(nodes)):
                for j in range(i + 1, len(nodes)):
                    if self.__graph_data__.has_edge(nodes[i], nodes[j]):
                        edges.append((nodes[i], nodes[j]))
                        local_idx_edges.append((i, j))
        return (edges, local_idx_edges)


if __name__ == "__main__":
    from graph_change_modeler import GraphChangeModeler

    GD = GraphData()
    # GD = DirectedGraphData()
    # node_traits = [GraphChangeModeler.ClassTrait(X, ["A", "B", "C", "D"]) for X in ["t0", "t1"]]
    # node_traits = [GraphChangeModeler.RankTrait("Pagerank", guaranteed_unique=False)]
    # edge_traits = [GraphChangeModeler.RankTrait("Edgerank", guaranteed_unique=False)]
    node_traits = [
        GraphChangeModeler.ClassTrait(
            "Change", possible_class_values=["Added", "None", "Deleted"])
    ]  #, GraphChangeModeler.RankTrait("Rank", guaranteed_unique=False)]
    edge_traits = [
        GraphChangeModeler.ClassTrait(
            "Change", possible_class_values=["Added", "None", "Deleted"])
    ]
    # node_traits = []
    # edge_traits = []
    SCL = SubgraphChangeLabeler(GD, 4, node_traits=node_traits, edge_traits=edge_traits, \
Example #5
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            num_processes=8, base_frac=1.0, scale_data=False):

        self.__scale_data__ = scale_data

        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()

        for node in graph_nodes:
            self.__graph_data__.add_node(node)

        traits = TemporalLinkPredictionTraits
        # NonUpdater included for reasons explained in
        #   TemporalLinkPredictionTraitUpdater's file and the
        #   GraphChangeFeatureCounter file.
        #
        # In short, GCFC needs two updaters, but both the temporal traits used
        #   here operate with a single updater.
        trait_updaters = [\
          TemporalLinkPredictionTraitUpdater(self.__graph_data__), \
            NonUpdater(None)]

        # Remove weight value (this class ignores it) and sort by time.
        sorted_edges = [(a, b, t) for (t, a, b) in \
            sorted([(t, a, b) for (a, b, t, w) in graph_edges])]

        # Pick the timestamp from `base_frac` of the way through the data,
        #   then add all edges with a timestamp <= to it and allow the traits
        #   to update accordingly. This will be the base graph. But first,
        #   ensure that the base graph does not include the last timestamp.
        last_timestamp = sorted_edges[-1][2]
        base_graph_timestamp_idx = min(len(sorted_edges) - 1, \
                                       int(len(sorted_edges)*base_frac))
        base_graph_timestamp = sorted_edges[base_graph_timestamp_idx][2]

        while base_graph_timestamp == last_timestamp:
            base_graph_timestamp_idx -= 1
            base_graph_timestamp = sorted_edges[base_graph_timestamp_idx][2]

        changes = []

        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            edge_traits=traits, edge_trait_updaters=trait_updaters, \
            use_counts=True)

        for (a, b, t) in sorted_edges:
            if t > base_graph_timestamp:
                break
            changes.append(EdgeAddition(self.__graph_data__, a, b,
                                        timestamp=t))
        self.__GCFC__.run_changes_forward(changes)

        # Create fake edges for remaining timestamps in graph.
        curr_idx = 0
        while sorted_edges[curr_idx][2] <= base_graph_timestamp:
            curr_idx += 1

        print(("Used first %d edges for base graph. " % curr_idx) + \
            "Using remaining %d for change model." % \
                (len(sorted_edges) - curr_idx))

        start_idx = curr_idx
        curr_time = sorted_edges[curr_idx][2]

        num_nodes = len(graph_nodes)

        self.__true_dicts__ = []
        self.__non_dicts__ = []

        edges_at_curr_time = []
        end = False
        while not end:
            if curr_idx < len(sorted_edges):
                (a, b, t) = sorted_edges[curr_idx]
            else:
                end = True
            if end or t > curr_time:
                num_edges = len(edges_at_curr_time)

                num_non_edges = int((num_nodes * (num_nodes - 1)) / \
                    (2 - int(directed))) - num_edges
                target_non_edge_size = min(num_non_edges, \
                    len(edges_at_curr_time) * non_edge_multiplier)

                non_edges = non_edges_sample(graph_nodes, \
                    [(u, v) for (u, v, t) in edges_at_curr_time], \
                    directed, target_non_edge_size, with_replacement=False)

                fake_edges = [(u, v, curr_time) for (u, v) in non_edges]

                true_changes = self.__edges_to_changes__(edges_at_curr_time)
                non_changes = self.__edges_to_changes__(fake_edges)
                # Pass true changes as null changes so they don't accumulate
                #   during this timestep.
                _, true_dicts, non_dicts = \
                    self.__GCFC__.get_change_counts([], true_changes, \
                        non_changes, \
                        permanently_apply_changes=False)

                # Then run changes forward.
                self.__GCFC__.run_changes_forward(true_changes)

                # Get the edge additions specifically.
                self.__true_dicts__ += true_dicts[1]
                self.__non_dicts__ += non_dicts[1]

                curr_time = t
                edges_at_curr_time = []

            edges_at_curr_time.append((a, b, t))
            curr_idx += 1

        print("Finished training data counting.")
Example #6
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            prediction_dist_cap=None, \
            num_processes=8, scale_data=False):

        self.__scale_data__ = scale_data

        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()
        for node in graph_nodes:
            self.__graph_data__.add_node(node)

        for (a, b) in graph_edges:  # remaining_edges
            self.__graph_data__.add_edge(a, b)

        num_nodes = len(graph_nodes)
        num_edges = len(graph_edges)
        num_non_edges = int((num_nodes * (num_nodes - 1)) / \
            (2 - int(directed))) - num_edges

        if prediction_dist_cap is None:
            true_edges = graph_edges

            target_non_edge_size = min(num_non_edges, \
                                       len(true_edges) * non_edge_multiplier)
            non_edges = non_edges_sample(graph_nodes,
                                         graph_edges,
                                         directed,
                                         target_non_edge_size,
                                         with_replacement=False)
        else:
            k = prediction_dist_cap
            true_edges = \
                all_connected_node_pairs_that_would_be_within_k_if_disconnected(\
                    self.__graph_data__, k)

            possible_edges = \
                all_disconnected_node_pairs_within_k(self.__graph_data__, k)

            target_non_edge_size = min(len(possible_edges), \
                                       len(true_edges) * non_edge_multiplier)

            non_edges = set(random.sample(possible_edges, \
                target_non_edge_size))

        print("Training on %d true edges (%f percent of all graph edges)" % \
            (len(true_edges), (100.0 * len(true_edges)) / len(graph_edges)))

        print("Training on %d non model edges (%f percent of all non edges.)" % \
            (len(non_edges), (100.0 * len(non_edges)) / num_non_edges))

        true_changes = self.__edges_to_changes__(true_edges)
        non_changes = self.__edges_to_changes__(non_edges)

        if self.__graph_data__.is_directed():
            node_traits = []
            node_trait_updaters = []
        else:
            node_traits = [InvolvedNodeDegreeTrait()]
            node_trait_updaters = \
                [InvolvedNodeDegreeTraitUpdater(self.__graph_data__)]

        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            node_traits=node_traits, node_trait_updaters=node_trait_updaters, \
            use_counts=True)

        self.__true_dicts__, _, self.__non_dicts__ = \
            self.__GCFC__.get_change_counts(true_changes, [], non_changes)

        # Get the edge additions specifically.
        self.__true_dicts__ = self.__true_dicts__[1]
        self.__non_dicts__ = self.__non_dicts__[1]

        print("Finished training data counting.")
Example #7
class SST_SVMTemporalLinkPredictor(TemporalLinkPredictor):

    # `non_edge_multiplier` - for every true edge, sample this many false edges
    #
    # `base_frac` - have at least this fraction of edges in the graph before
    #   computing vectors for the subsequent edges. Will make split at a
    #   timestamp change, and thus will always have at least the first full
    #   timestamp in the base_graph, even if base_frac=0.0. Also, will always
    #   have at least the last full timestamp outside the base graph, even if
    #   base_frac=1.0
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            num_processes=8, base_frac=1.0, scale_data=False):

        self.__scale_data__ = scale_data

        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()

        for node in graph_nodes:
            self.__graph_data__.add_node(node)

        traits = TemporalLinkPredictionTraits
        # NonUpdater included for reasons explained in
        #   TemporalLinkPredictionTraitUpdater's file and the
        #   GraphChangeFeatureCounter file.
        #
        # In short, GCFC needs two updaters, but both the temporal traits used
        #   here operate with a single updater.
        trait_updaters = [\
          TemporalLinkPredictionTraitUpdater(self.__graph_data__), \
            NonUpdater(None)]

        # Remove weight value (this class ignores it) and sort by time.
        sorted_edges = [(a, b, t) for (t, a, b) in \
            sorted([(t, a, b) for (a, b, t, w) in graph_edges])]

        # Pick the timestamp from `base_frac` of the way through the data,
        #   then add all edges with a timestamp <= to it and allow the traits
        #   to update accordingly. This will be the base graph. But first,
        #   ensure that the base graph does not include the last timestamp.
        last_timestamp = sorted_edges[-1][2]
        base_graph_timestamp_idx = min(len(sorted_edges) - 1, \
                                       int(len(sorted_edges)*base_frac))
        base_graph_timestamp = sorted_edges[base_graph_timestamp_idx][2]

        while base_graph_timestamp == last_timestamp:
            base_graph_timestamp_idx -= 1
            base_graph_timestamp = sorted_edges[base_graph_timestamp_idx][2]

        changes = []

        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            edge_traits=traits, edge_trait_updaters=trait_updaters, \
            use_counts=True)

        for (a, b, t) in sorted_edges:
            if t > base_graph_timestamp:
                break
            changes.append(EdgeAddition(self.__graph_data__, a, b,
                                        timestamp=t))
        self.__GCFC__.run_changes_forward(changes)

        # Create fake edges for remaining timestamps in graph.
        curr_idx = 0
        while sorted_edges[curr_idx][2] <= base_graph_timestamp:
            curr_idx += 1

        print(("Used first %d edges for base graph. " % curr_idx) + \
            "Using remaining %d for change model." % \
                (len(sorted_edges) - curr_idx))

        start_idx = curr_idx
        curr_time = sorted_edges[curr_idx][2]

        num_nodes = len(graph_nodes)

        self.__true_dicts__ = []
        self.__non_dicts__ = []

        edges_at_curr_time = []
        end = False
        while not end:
            if curr_idx < len(sorted_edges):
                (a, b, t) = sorted_edges[curr_idx]
            else:
                end = True
            if end or t > curr_time:
                num_edges = len(edges_at_curr_time)

                num_non_edges = int((num_nodes * (num_nodes - 1)) / \
                    (2 - int(directed))) - num_edges
                target_non_edge_size = min(num_non_edges, \
                    len(edges_at_curr_time) * non_edge_multiplier)

                non_edges = non_edges_sample(graph_nodes, \
                    [(u, v) for (u, v, t) in edges_at_curr_time], \
                    directed, target_non_edge_size, with_replacement=False)

                fake_edges = [(u, v, curr_time) for (u, v) in non_edges]

                true_changes = self.__edges_to_changes__(edges_at_curr_time)
                non_changes = self.__edges_to_changes__(fake_edges)
                # Pass true changes as null changes so they don't accumulate
                #   during this timestep.
                _, true_dicts, non_dicts = \
                    self.__GCFC__.get_change_counts([], true_changes, \
                        non_changes, \
                        permanently_apply_changes=False)

                # Then run changes forward.
                self.__GCFC__.run_changes_forward(true_changes)

                # Get the edge additions specifically.
                self.__true_dicts__ += true_dicts[1]
                self.__non_dicts__ += non_dicts[1]

                curr_time = t
                edges_at_curr_time = []

            edges_at_curr_time.append((a, b, t))
            curr_idx += 1

        print("Finished training data counting.")

    def score_edges(self, edges):
        changes = self.__edges_to_changes__(edges)
        # Perform scoring in chunks to save memory.
        scores = []
        chunk = 0
        chunk_size = 12000
        done = False
        stop = 0
        while not done:
            chunk += 1
            start = stop
            if chunk * chunk_size >= len(changes):
                stop = len(changes)
                done = True
            else:
                stop = chunk * chunk_size
            changes_to_score = changes[start:stop]
            scores += self.score_changes(changes_to_score)

            gc.collect()
            print("  Scored chunk %d." % chunk)
        return scores

    def score_changes(self, changes):
        # Pass as null_changes so that self's graph_data doesn't change.
        print("  Getting Changes' SST Vectors...")
        _, counts_dicts, _ = self.__GCFC__.get_change_counts([], changes, [], \
            permanently_apply_changes=False, allow_new_SSTs=False)
        print("  Scoring...")
        # Convert in place to save space.
        if self.__scale_data__:
            self.__scale_dicts__(counts_dicts[1])
        counts_vectors = self.__dicts_to_sparse_matrix__(counts_dicts[1])
        return self.score_vectors(counts_vectors)

    def score_vectors(self, count_vectors):
        return list(self.__linear_svm__.decision_function(count_vectors))

    # Returns the unit direction vector with components sorted in order of
    #   largest magnitude to least, coupled with a representation of the
    #   subgraph changes associated with each component.
    #
    # Format: List of (vector component, representative subgraph change) tuples
    def get_interpretable_model(self):
        # Extract interpretable features.
        direction_vector = self.__linear_svm__.coef_[0]
        norm = math.sqrt(sum([c * c for c in direction_vector]))
        direction_vector = [c / norm for c in direction_vector]
        sst_labeler = self.__GCFC__.get_subgraph_change_labeler()
        ssts = [sst_labeler.get_representative_subgraph_change_from_label(i, \
            GraphChange.EDGE_ADDITION) for i in range(0, len(direction_vector))]

        dv_sorted = [(abs(direction_vector[i]), direction_vector[i], i) \
            for i in range(0, len(direction_vector))]
        dv_sorted.sort(reverse=True)

        return [(dv_sorted[i][1], ssts[dv_sorted[i][2]]) \
            for i in range(0, len(ssts))]

    # Allows python to pickle the predictor.
    #
    # Once the predictor is used to make a prediction, this method will need to
    #   be called again in order for pickling to work.
    def become_serializeable(self):
        self.__GCFC__.del_worker_pool()

    def fit(self):

        self.__num_labels__ = self.__GCFC__.get_max_seen_labels()[1] + 1

        # Save space with sparse row matrix.
        # Construct while deleting dicts so it's effectively in place.
        num_true = len(self.__true_dicts__)
        num_non = len(self.__non_dicts__)
        all_dicts = self.__true_dicts__
        for i in range(0, num_non):
            all_dicts.append(self.__non_dicts__.pop())

        if self.__scale_data__:
            self.__feature_maxs__ = [
                1.0 for i in range(0, self.__num_labels__)
            ]
            for d in all_dicts:
                for label, count in d.items():
                    if float(count) > self.__feature_maxs__[label]:
                        self.__feature_maxs__[label] = float(count)
            self.__scale_dicts__(all_dicts)

        data_matrix = self.__dicts_to_sparse_matrix__(all_dicts)

        self.__true_dicts__ = None
        self.__non_dicts__ = None

        self.__linear_svm__ = LinearSVC(class_weight='balanced',
                                        max_iter=400000)
        # non labels come first because __dicts_to_sparse_matrix__ reverses
        #   row order.
        labels = [0 for i in range(0, num_non)] + \
            [1 for i in range(0, num_true)]
        print("  Now fitting SVM...")

        self.__linear_svm__.fit(data_matrix, labels)

        data_matrix = None
        gc.collect()

        print("  SVM fit successfully.")

    def __del__(self):
        del self.__GCFC__

    # Destroys dicts in the process.
    def __dicts_to_sparse_matrix__(self, dicts):
        data = []
        row_idxs = []
        col_idxs = []
        size = len(dicts)
        for row in range(0, size):
            row_dict = dicts.pop()
            for col, value in row_dict.items():
                if col >= self.__num_labels__:
                    continue
                data.append(value)
                row_idxs.append(row)
                col_idxs.append(col)
        return csr_matrix((data, (row_idxs, col_idxs)), \
            shape=(size, self.__num_labels__))

    def __edges_to_changes__(self, edges):
        changes = []
        for (a, b, t) in edges:
            changes.append(EdgeAddition(self.__graph_data__, a, b,
                                        timestamp=t))
        return changes

    def __scale_dicts__(self, dicts):
        for d in dicts:
            vals = [(label, float(count)) for (label, count) in d.items()]
            for (label, count) in vals:
                if label < self.__num_labels__:
                    d[label] = count / self.__feature_maxs__[label]
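A minimal end-to-end usage sketch for the class above; the toy graph and parameter values are made up. The constructor expects (node, node, timestamp, weight) quadruples, while score_edges expects (node, node, timestamp) triples:

if __name__ == "__main__":
    nodes = [0, 1, 2, 3]
    edges = [(0, 1, 1, 1.0), (1, 2, 1, 1.0), (2, 3, 2, 1.0), (0, 3, 3, 1.0)]
    predictor = SST_SVMTemporalLinkPredictor(nodes, edges, directed=False,
                                             subgraph_size=4, num_processes=1,
                                             base_frac=0.5)
    predictor.fit()
    scores = predictor.score_edges([(0, 2, 4), (1, 3, 4)])
    print(scores)
    predictor.become_serializeable()  # required before pickling the predictor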
Example #8
class SST_SVMLinkPredictor(StaticLinkPredictor):

    # `prediction_dist_cap` -- used to indicate that the predictor will only
    #   be used to make predictions about connecting pairs of nodes at most
    #   a distance of `prediction_dist_cap` away. A value of None indicates no
    #   limit.
    def __init__(self, graph_nodes, graph_edges, directed=False, \
            subgraph_size=4, non_edge_multiplier=10, \
            prediction_dist_cap=None, \
            num_processes=8, scale_data=False):

        self.__scale_data__ = scale_data

        if directed:
            self.__graph_data__ = DirectedGraphData()
        else:
            self.__graph_data__ = GraphData()
        for node in graph_nodes:
            self.__graph_data__.add_node(node)

        for (a, b) in graph_edges:  # remaining_edges
            self.__graph_data__.add_edge(a, b)

        num_nodes = len(graph_nodes)
        num_edges = len(graph_edges)
        num_non_edges = int((num_nodes * (num_nodes - 1)) / \
            (2 - int(directed))) - num_edges

        if prediction_dist_cap is None:
            true_edges = graph_edges

            target_non_edge_size = min(num_non_edges, \
                                       len(true_edges) * non_edge_multiplier)
            non_edges = non_edges_sample(graph_nodes,
                                         graph_edges,
                                         directed,
                                         target_non_edge_size,
                                         with_replacement=False)
        else:
            k = prediction_dist_cap
            true_edges = \
                all_connected_node_pairs_that_would_be_within_k_if_disconnected(\
                    self.__graph_data__, k)

            possible_edges = \
                all_disconnected_node_pairs_within_k(self.__graph_data__, k)

            target_non_edge_size = min(len(possible_edges), \
                                       len(true_edges) * non_edge_multiplier)

            non_edges = set(random.sample(possible_edges, \
                target_non_edge_size))

        print("Training on %d true edges (%f percent of all graph edges)" % \
            (len(true_edges), (100.0 * len(true_edges)) / len(graph_edges)))

        print("Training on %d non model edges (%f percent of all non edges.)" % \
            (len(non_edges), (100.0 * len(non_edges)) / num_non_edges))

        true_changes = self.__edges_to_changes__(true_edges)
        non_changes = self.__edges_to_changes__(non_edges)

        if self.__graph_data__.is_directed():
            node_traits = []
            node_trait_updaters = []
        else:
            node_traits = [InvolvedNodeDegreeTrait()]
            node_trait_updaters = \
                [InvolvedNodeDegreeTraitUpdater(self.__graph_data__)]

        self.__GCFC__ = GraphChangeFeatureCounter(self.__graph_data__, \
            num_processes=num_processes, subgraph_size=subgraph_size, \
            node_traits=node_traits, node_trait_updaters=node_trait_updaters, \
            use_counts=True)

        self.__true_dicts__, _, self.__non_dicts__ = \
            self.__GCFC__.get_change_counts(true_changes, [], non_changes)

        # Get the edge additions specifically.
        self.__true_dicts__ = self.__true_dicts__[1]
        self.__non_dicts__ = self.__non_dicts__[1]

        print("Finished training data counting.")

    def score_edges(self, edges):
        changes = self.__edges_to_changes__(edges)
        # Perform scoring in chunks to save memory.
        scores = []
        chunk = 0
        chunk_size = 6000
        done = False
        stop = 0
        while not done:
            chunk += 1
            start = stop
            if chunk * chunk_size >= len(changes):
                stop = len(changes)
                done = True
            else:
                stop = chunk * chunk_size
            changes_to_score = changes[start:stop]
            scores += self.score_changes(changes_to_score)

            gc.collect()
            print("  Scored chunk %d." % chunk)
        return scores

    def score_changes(self, changes):
        # Pass as null_changes so that self's graph_data doesn't change.
        print("  Getting Changes' SST Vectors...")
        _, counts_dicts, _ = self.__GCFC__.get_change_counts([], changes, [], \
            permanently_apply_changes=False, allow_new_SSTs=False)
        print("  Scoring...")
        # Convert in place to save space.
        if self.__scale_data__:
            self.__scale_dicts__(counts_dicts[1])
        counts_vectors = self.__dicts_to_sparse_matrix__(counts_dicts[1])
        return self.score_vectors(counts_vectors)

    def score_vectors(self, count_vectors):
        return list(self.__linear_svm__.decision_function(count_vectors))

    # def graph(self):
    #     return self.__graph_data__

    # Returns the unit direction vector with components sorted in order of
    #   largest magnitude to least, coupled with a representation of the
    #   subgraph changes associated with each component.
    #
    # Format: List of (vector component, representative subgraph change) tuples
    def get_interpretable_model(self):
        # Extract interpretable features.
        direction_vector = self.__linear_svm__.coef_[0]
        norm = math.sqrt(sum([c * c for c in direction_vector]))
        direction_vector = [c / norm for c in direction_vector]
        sst_labeler = self.__GCFC__.get_subgraph_change_labeler()
        ssts = [sst_labeler.get_representative_subgraph_change_from_label(i, \
            GraphChange.EDGE_ADDITION) for i in range(0, len(direction_vector))]

        dv_sorted = [(abs(direction_vector[i]), direction_vector[i], i) \
            for i in range(0, len(direction_vector))]
        dv_sorted.sort(reverse=True)

        return [(dv_sorted[i][1], ssts[dv_sorted[i][2]]) \
            for i in range(0, len(ssts))]

    # Allows python to pickle the predictor.
    #
    # Once the predictor is used to make a prediction, this method will need to
    #   be called again in order for pickling to work.
    def become_serializeable(self):
        self.__GCFC__.del_worker_pool()

    def __del__(self):
        del self.__GCFC__

    def fit(self):

        self.__num_labels__ = self.__GCFC__.get_max_seen_labels()[1] + 1

        # Save space with sparse row matrix.
        # Construct while deleting dicts so it's effectively in place.
        num_true = len(self.__true_dicts__)
        num_non = len(self.__non_dicts__)
        all_dicts = self.__true_dicts__
        for i in range(0, num_non):
            all_dicts.append(self.__non_dicts__.pop())

        if self.__scale_data__:
            self.__feature_maxs__ = [
                1.0 for i in range(0, self.__num_labels__)
            ]
            for d in all_dicts:
                for label, count in d.items():
                    if float(count) > self.__feature_maxs__[label]:
                        self.__feature_maxs__[label] = float(count)
            self.__scale_dicts__(all_dicts)

        data_matrix = self.__dicts_to_sparse_matrix__(all_dicts)

        self.__true_dicts__ = None
        self.__non_dicts__ = None

        self.__linear_svm__ = LinearSVC(class_weight='balanced',
                                        max_iter=400000)
        # non labels come first because __dicts_to_sparse_matrix__ reverses
        #   row order.
        labels = [0 for i in range(0, num_non)] + \
            [1 for i in range(0, num_true)]

        print("  Now fitting SVM...")

        self.__linear_svm__.fit(data_matrix, labels)

        data_matrix = None
        gc.collect()

        print("  SVM fit successfully.")

    # Destroys dicts in the process.
    def __dicts_to_sparse_matrix__(self, dicts):
        data = []
        row_idxs = []
        col_idxs = []
        size = len(dicts)
        for row in range(0, size):
            row_dict = dicts.pop()
            for col, value in row_dict.items():
                if col >= self.__num_labels__:
                    continue
                data.append(value)
                row_idxs.append(row)
                col_idxs.append(col)
        return csr_matrix((data, (row_idxs, col_idxs)), \
            shape=(size, self.__num_labels__))

    def __edges_to_changes__(self, edges):
        changes = []
        for (a, b) in edges:
            changes.append(EdgeAddition(self.__graph_data__, a, b))
        return changes

    def __scale_dicts__(self, dicts):
        for d in dicts:
            vals = [(label, float(count)) for (label, count) in d.items()]
            for (label, count) in vals:
                if label < self.__num_labels__:
                    d[label] = count / self.__feature_maxs__[label]
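As with the temporal variant, a brief usage sketch for SST_SVMLinkPredictor; the toy graph and parameters are illustrative only, and here edges are plain (node, node) pairs with no timestamps:

if __name__ == "__main__":
    nodes = [0, 1, 2, 3, 4]
    edges = [(0, 1), (1, 2), (2, 3), (3, 4), (0, 4)]
    predictor = SST_SVMLinkPredictor(nodes, edges, directed=False,
                                     subgraph_size=4, num_processes=1)
    predictor.fit()
    scores = predictor.score_edges([(0, 2), (1, 3)])
    print(scores)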
Example #9
    max_seq_len_placeholder = tf.placeholder(tf.int32)
    inputs_placeholder = tf.placeholder(tf.float32, shape=(args.batch_size, None, 2 * args.max_seq_len + 1))
    outputs_placeholder = tf.placeholder(tf.float32, shape=(args.batch_size, None, args.max_seq_len))
    model = BuildTModel(max_seq_len_placeholder, inputs_placeholder, outputs_placeholder)
    initializer = tf.global_variables_initializer()

# training

convergence_on_target_task = None
convergence_on_multi_task = None
performance_on_target_task = None
performance_on_multi_task = None
generalization_from_target_task = None
generalization_from_multi_task = None

data_generator = GraphData()
target_point = args.max_seq_len
curriculum_point = 1 if args.curriculum not in ('prediction_gain', 'none') else target_point
progress_error = 1.0
convergence_error = 0.1

sess = tf.Session()
sess.run(initializer)

pickle.dump({target_point: []}, open(HEAD_LOG_FILE, "wb"))
pickle.dump({}, open(GENERALIZATION_HEAD_LOG_FILE, "wb"))


def run_eval(batches, store_heat_maps=False, generalization_num=None):
    task_loss = 0
    task_error = 0
Example #10
File: main.py  Project: bdlddn/gspan
    # Read the data from the file into the data_array list
    with open(input_data_path) as file_object:
        for line in file_object:
            data_array.append(line.replace('\n', '').split(' '))

    # Record the start time
    start_time = datetime.datetime.now()

    # Count the occurrences of each edge and node label, and add the data
    # from data_array into the graph data
    i = 0
    while i < label_max:
        freq_edge_label.append(0)
        freq_node_label.append(0)
        i += 1

    gd = GraphData()

    for array in data_array:
        if array[0] == input_new_graph:
            # If gd already has data, append it to total_graph_data
            if gd.get_node_labels():
                total_graph_data.append(gd)
            gd = GraphData()
        elif array[0] == input_vertice:
            if not array[2] in gd.get_node_labels():
                freq_node_label[int(array[2])] += 1
            gd.get_node_labels().append(array[2])
            gd.get_node_visibles().append(True)

        elif array[0] == input_edge:
            if not array[3] in gd.get_edge_labels():
Example #11
def add_empty(x_complex, available_classes):
    for clazz in available_classes:
        x_complex.append(GraphData(clazz))
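A trivial call for completeness; the class labels are hypothetical:

x_complex = []
add_empty(x_complex, ["classA", "classB"])
# x_complex now holds one empty GraphData instance per class label.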