示例#1
0
def test_main():
    """Smoke-test GraphFeatures on the largest connected component of citeseer.

    Loads the pickled graph, restricts it to its biggest (undirected)
    connected component, builds all TEST_FEATURES and converts them to a
    float32 matrix.
    """
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    # Context manager closes the pickle file (the original leaked the handle).
    with open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb') as f:
        gnx = pickle.load(f)

    # nx.connected_component_subgraphs was removed in networkx 2.4.
    # Taking the largest node set via connected_components and then
    # subgraph() is equivalent and works on every networkx version.
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)

    features = GraphFeatures(gnx,
                             TEST_FEATURES,  # module-level spec defined elsewhere in this file
                             dir_path="./%s_features_sub" % dataset,
                             logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False,
                                     dtype=np.float32,
                                     mtype=np.matrix)
    logger.info("Finished")
示例#2
0
def create_features(data_name, time_range):
    """Build topological feature matrices for each yearly graph snapshot.

    For every i in [0, time_range) this loads
    ./dataset/<data_name>/pkl/gcn_input/graph_<i>.pkl, computes the
    configured graph features, and dumps the resulting matrix to
    ./dataset/<data_name>/pkl/gcn_input/mx_<i>.pkl.

    :param data_name: dataset directory name under ./dataset
    :param time_range: number of snapshots to process
    """
    # Feature spec and logger are loop-invariant; build them once.
    features_meta = {
        "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
        "general": FeatureMeta(GeneralCalculator, {"gen"}),
        "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
        "k_core": FeatureMeta(KCoreCalculator, {"kc"})}
    logger = PrintLogger("MyLogger")

    for i in range(time_range):
        # Context managers close the pickle files (originals leaked handles).
        with open("./dataset/"+data_name+"/pkl/gcn_input/"+"graph_"+str(i)+".pkl", "rb") as f:
            gnx = pickle.load(f)

        features = GraphFeatures(gnx, features_meta, "./dataset/"+str(data_name)+"/pkl/feature", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)

        with open("./dataset/"+data_name+"/pkl/gcn_input/"+"mx_"+str(i)+".pkl", "wb") as f:
            pickle.dump(mx, f)

# with open(os.path.join('data',str(data_name),'pkl', 'mx_1.pkl'), 'rb') as f:
#     l = pickle.load(f)
#
# print (l[0])
示例#3
0
def create_features(num_years=21):
    """Build and dump a topological feature matrix for each yearly graph.

    Loads graphs_by_years/graph_<i>.pkl for i in [0, num_years), computes
    the configured features, and writes graphs_by_years/mx_<i>.pkl.

    :param num_years: number of yearly snapshots to process
        (default 21, preserving the original hard-coded behavior)
    """
    # Loop-invariant setup hoisted out of the loop.
    logger = PrintLogger("MyLogger")
    features_meta = {
        "page_rank":
        FeatureMeta(PageRankCalculator, {"pr"}),
        "general":
        FeatureMeta(GeneralCalculator, {"gen"}),
        "Average_Neighbor_Degree":
        FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
        "k_core":
        FeatureMeta(KCoreCalculator, {"kc"}),
    }

    for i in range(num_years):
        with open(os.path.join('graphs_by_years', 'graph_' + str(i) + '.pkl'),
                  'rb') as f:
            gnx = pickle.load(f)

        features = GraphFeatures(gnx,
                                 features_meta,
                                 "/home/dsi/racheli/graph_calculations",
                                 logger=logger)
        features.build()

        mx = features.to_matrix(mtype=np.matrix)

        with open(os.path.join('graphs_by_years', 'mx_' + str(i) + '.pkl'),
                  'wb') as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
示例#4
0
    def __get_features(self):
        """Load the dataset pickle and build the topological feature matrix.

        Returns a scipy CSR matrix of z-scored neighbor-histogram features,
        zero-padded on the right so the result is (n_nodes x n_nodes).
        Also stores the raw adjacency matrices on
        ``self._original_adj_matrices`` as a side effect.
        """
        with open(self.dirname + '/data/' + self.DATASET + '.pickle',
                  'rb') as f:
            data = pkl.load(f)

        # 'A': adjacency matrices, 'y': sparse label matrix
        # (assumed schema -- verify against whatever produces this pickle).
        self._original_adj_matrices = data['A']
        y = data['y']

        # argmax of each label row is the class id; all-zero rows
        # are unlabeled and marked -1.
        node_labels = []
        for a in y.todense():
            if a.max() != 0:
                node_labels.append(a.argmax())
            else:
                node_labels.append(-1)

        # Collapse the per-relation adjacencies into a single graph.
        sum_adj = AsymmetricRGCNWithNeighborHistograms.__sum_sparse(self.A)
        gnx = nx.from_scipy_sparse_matrix(sum_adj, parallel_edges=True)
        gnx = nx.DiGraph(gnx, labels=node_labels)

        # Attach a per-node 'label' attribute.
        # NOTE(review): gnx.node is the pre-networkx-2.4 API (removed later).
        for n, label in zip(gnx.nodes, node_labels):
            gnx.node[n]['label'] = label

        # Distinct real class ids, excluding the -1 "unlabeled" marker.
        real_labels = list(set(node_labels) - {-1})

        # Get the features for the graph
        NEIGHBOR_FEATURES = {
            "first_neighbor_histogram":
            FeatureMeta(
                nth_neighbor_calculator(1, labels_to_consider=real_labels),
                {"fnh", "first_neighbor"}),
            "second_neighbor_histogram":
            FeatureMeta(
                nth_neighbor_calculator(2, labels_to_consider=real_labels),
                {"snh", "second_neighbor"}),
        }
        features_path = os.path.join(os.path.abspath('../features'),
                                     self.DATASET)
        features = GraphFeatures(gnx,
                                 NEIGHBOR_FEATURES,
                                 dir_path=features_path)
        # Only training-set nodes participate in feature construction.
        features.build(include=set(self.idx_train), should_dump=True)

        # NOTE(review): both histogram keys are always present in
        # NEIGHBOR_FEATURES, so add_ones is always True here.
        add_ones = bool(
            {"first_neighbor_histogram",
             "second_neighbor_histogram"}.intersection(NEIGHBOR_FEATURES))
        _topo_mx = features.to_matrix(add_ones=add_ones,
                                      dtype=np.float64,
                                      mtype=np.matrix,
                                      should_zscore=True)

        del data
        # Pad with zero columns so the feature matrix is square
        # (n_nodes x n_nodes) before converting to sparse CSR.
        return sp.csr_matrix(
            np.hstack([
                _topo_mx,
                np.zeros(
                    (_topo_mx.shape[0], _topo_mx.shape[0] - _topo_mx.shape[1]))
            ]))
示例#5
0
    def set_train(self, train_set, features_meta):
        """Build topological features over *train_set* and cache the scaled matrix.

        Stores the z-scored feature matrix on ``self._topo_mx``, rescaled to
        the same order of magnitude as ``self._bow_mx``.
        """
        # The graph was already reduced to its max connected component in init,
        # so is_max_connected is disabled here.
        features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"], logger=self._logger,
                                 is_max_connected=False)
        features.build(include=set(train_set), should_dump=True)

        histogram_keys = {"first_neighbor_histogram", "second_neighbor_histogram"}
        needs_ones = bool(histogram_keys.intersection(features_meta))
        self._topo_mx = features.to_matrix(add_ones=needs_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)

        # Rescale so the topological matrix matches the BoW matrix's magnitude.
        scale = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= scale
    def split_train(self, train_p, features_meta):
        """Split the base training data into train/validation parts and rebuild features.

        Updates ``self._train_idx`` / ``self._val_idx`` with the shuffled split and
        stores the rescaled topological feature matrix on ``self._topo_mx``.
        """
        split = train_test_split(self._train_set, self._base_train_idx,
                                 test_size=1 - train_p, shuffle=True)
        train_set, val_set, self._train_idx, self._val_idx = split

        features = GraphFeatures(self._gnx, features_meta, dir_path=self._features_path(),
                                 logger=self._logger, is_max_connected=self._is_max_connected)
        features.build(include=set(train_set), should_dump=False)

        histogram_keys = {"first_neighbor_histogram", "second_neighbor_histogram"}
        self._topo_mx = features.to_matrix(add_ones=bool(histogram_keys.intersection(features_meta)),
                                           dtype=np.float64, mtype=np.matrix, should_zscore=True)

        # Rescale so the topological matrix matches the BoW matrix's magnitude.
        scale = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
        self._topo_mx /= scale
    def _prepare_data(self):
        """Build per-snapshot feature and label tensors and stack them.

        For every snapshot path this loads the pickled graph, restricts it to
        the shared node order, builds topological features, and dstacks the
        resulting matrices into ``self._inputs`` / ``self._targets``.
        Intermediate matrices are also dumped to data.pkl / labels.pkl in each
        snapshot's feature directory.
        """
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            # NOTE(review): the file handle from open() is never closed here.
            gnx = pickle.load(open(os.path.join(feat_path, "gnx.pkl"), "rb"))
            # Restrict every snapshot to the common node ordering.
            gnx = gnx.subgraph(self._nodes_order)

            features = GraphFeatures(gnx,
                                     self._features_meta,
                                     dir_path=feat_path,
                                     logger=self._logger)
            # Only training-set nodes participate in feature construction.
            features.build(include=self._train_set)

            # Histogram features already carry a bias-like column, hence add_ones.
            add_ones = bool(
                set(self._features_meta).intersection(
                    ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones,
                                          dtype=np.float32,
                                          mtype=np.array,
                                          should_zscore=True)
            # Stack snapshots along a new third axis.
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            pickle.dump(cur_data,
                        open(os.path.join(feat_path, "data.pkl"), "wb"))

            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
            pickle.dump(cur_labels,
                        open(os.path.join(feat_path, "labels.pkl"), "wb"))

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")