Exemplo n.º 1
0
def create_features(data_name, time_range):
    """Compute and pickle a feature matrix for each graph snapshot of a dataset.

    For every index ``i`` in ``range(time_range)`` the graph pickled at
    ``./dataset/<data_name>/pkl/gcn_input/graph_<i>.pkl`` is loaded, the
    features listed in ``features_meta`` are built with ``GraphFeatures``
    and the resulting matrix is pickled next to the graph as ``mx_<i>.pkl``.

    Args:
        data_name: Name of the dataset directory under ``./dataset``.
        time_range: Number of consecutive graph indices to process.
    """
    pkl_dir = "./dataset/" + str(data_name) + "/pkl"
    gcn_input_dir = pkl_dir + "/gcn_input/"

    for i in range(time_range):
        # Context managers guarantee the handles are closed (and, for the
        # output file, flushed) even if feature calculation raises.
        # NOTE: pickle must only be used on trusted files.
        with open(gcn_input_dir + "graph_" + str(i) + ".pkl", "rb") as graph_file:
            gnx = pickle.load(graph_file)

        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"})}

        features = GraphFeatures(gnx, features_meta, pkl_dir + "/feature", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)

        with open(gcn_input_dir + "mx_" + str(i) + ".pkl", "wb") as matrix_file:
            pickle.dump(mx, matrix_file)

# with open(os.path.join('data',str(data_name),'pkl', 'mx_1.pkl'), 'rb') as f:
#     l = pickle.load(f)
#
# print (l[0])
    def _calc_tg_feature_matrix(self):
        """Return a dict mapping each temporal-graph name to its feature matrix.

        If a pickled result already exists under
        ``<pkl_path>/gt_feature_matrix`` it is loaded and returned as is.
        Otherwise the features in ``ANOMALY_DETECTION_FEATURES`` are built
        for every graph of ``self._temporal_graph``, converted to a matrix
        through ``FeaturesProcessor`` (log-normalised when
        ``params['features']['log']`` is set) and, when
        ``params['general']['dump_pkl']`` is set, the whole mapping is
        pickled for later runs.

        Returns:
            dict: graph name -> matrix returned by ``FeaturesProcessor.as_matrix``.
        """
        use_log = self._params['features']['log']
        log_ext = "log_" if use_log else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")

        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            # Close the cache file deterministically instead of leaking the handle.
            with open(mat_pkl, "rb") as cache_file:
                return pickle.load(cache_file)

        dump_pkl = self._params['general']["dump_pkl"]
        norm_func = log_norm if use_log else None

        gnx_to_vec = {}
        # root directory holding the per-graph feature pickles of this dataset
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # per-graph directory; non-alphanumeric characters of the name become "_"
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if dump_pkl:
                os.makedirs(gnx_path, exist_ok=True)

            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            gnx_ftr.build(should_dump=dump_pkl,
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])  # build features
            # convert the built features of this graph into a single matrix
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=norm_func)

        if dump_pkl:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            # The context manager flushes and closes the cache file before returning.
            with open(mat_pkl, "wb") as cache_file:
                pickle.dump(gnx_to_vec, cache_file)
        return gnx_to_vec
Exemplo n.º 3
0
def test_main():
    """Build ``TEST_FEATURES`` on the largest connected component of a pickled graph.

    Loads ``<base_dir>/<dataset>/gnx.pkl``, restricts the graph to the
    largest component of its undirected version, builds the features
    (dumping them to ``./<dataset>_features_sub``) and converts them to a
    float32 matrix.
    """
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    # Close the pickle file deterministically instead of leaking the handle.
    with open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb') as gnx_file:
        gnx = pickle.load(gnx_file)

    # connected_component_subgraphs() was removed in NetworkX 2.4;
    # connected_components() yields the node sets directly, which is all
    # that subgraph() needs.
    largest_component = max(nx.connected_components(gnx.to_undirected()),
                            key=len)
    gnx = gnx.subgraph(largest_component)

    features = GraphFeatures(gnx,
                             TEST_FEATURES,
                             dir_path="./%s_features_sub" % dataset,
                             logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False,
                                     dtype=np.float32,
                                     mtype=np.matrix)
    logger.info("Finished")
Exemplo n.º 4
0
def create_features():
    """Compute and pickle a feature matrix for graphs 0..20 in ``graphs_by_years``.

    Each ``graph_<i>.pkl`` is loaded, the features declared in
    ``features_meta`` are built through ``GraphFeatures`` and the resulting
    matrix is written to ``mx_<i>.pkl`` in the same directory.
    """
    graphs_dir = 'graphs_by_years'
    calculations_dir = "/home/dsi/racheli/graph_calculations"

    for idx in range(21):
        graph_path = os.path.join(graphs_dir, f'graph_{idx}.pkl')
        matrix_path = os.path.join(graphs_dir, f'mx_{idx}.pkl')

        with open(graph_path, 'rb') as graph_file:
            graph = pickle.load(graph_file)

        print_logger = PrintLogger("MyLogger")

        # feature name -> (calculator, abbreviations)
        meta = dict(
            page_rank=FeatureMeta(PageRankCalculator, {"pr"}),
            general=FeatureMeta(GeneralCalculator, {"gen"}),
            Average_Neighbor_Degree=FeatureMeta(
                AverageNeighborDegreeCalculator, {"avg_nd"}),
            k_core=FeatureMeta(KCoreCalculator, {"kc"}),
        )

        graph_features = GraphFeatures(graph,
                                       meta,
                                       calculations_dir,
                                       logger=print_logger)
        graph_features.build()
        matrix = graph_features.to_matrix(mtype=np.matrix)

        with open(matrix_path, 'wb') as matrix_file:
            pickle.dump(matrix, matrix_file, protocol=pickle.HIGHEST_PROTOCOL)
Exemplo n.º 5
0
    def __get_features(self):
        """Build the neighbor-histogram feature matrix for the dataset graph.

        Reads ``<dirname>/data/<DATASET>.pickle``, stores its ``'A'`` entry
        in ``self._original_adj_matrices``, derives one label per node from
        the ``'y'`` entry (``-1`` when the row is all zeros), builds a
        directed graph from the summed adjacency matrices and computes the
        first/second neighbor histograms over the training nodes.

        Returns:
            scipy.sparse.csr_matrix: the z-scored feature matrix, padded
            with zero columns on the right so that it is square.
        """
        with open(self.dirname + '/data/' + self.DATASET + '.pickle',
                  'rb') as f:
            data = pkl.load(f)

        self._original_adj_matrices = data['A']
        y = data['y']

        # argmax of every row, or -1 for rows without any non-zero entry
        node_labels = [
            row.argmax() if row.max() != 0 else -1 for row in y.todense()
        ]

        sum_adj = AsymmetricRGCNWithNeighborHistograms.__sum_sparse(self.A)
        gnx = nx.from_scipy_sparse_matrix(sum_adj, parallel_edges=True)
        gnx = nx.DiGraph(gnx, labels=node_labels)

        for n, label in zip(gnx.nodes, node_labels):
            # ``Graph.node`` was removed in NetworkX 2.4; ``Graph.nodes`` is
            # the equivalent view and is already relied upon by this loop.
            gnx.nodes[n]['label'] = label

        real_labels = list(set(node_labels) - {-1})

        # Get the features for the graph
        NEIGHBOR_FEATURES = {
            "first_neighbor_histogram":
            FeatureMeta(
                nth_neighbor_calculator(1, labels_to_consider=real_labels),
                {"fnh", "first_neighbor"}),
            "second_neighbor_histogram":
            FeatureMeta(
                nth_neighbor_calculator(2, labels_to_consider=real_labels),
                {"snh", "second_neighbor"}),
        }
        features_path = os.path.join(os.path.abspath('../features'),
                                     self.DATASET)
        features = GraphFeatures(gnx,
                                 NEIGHBOR_FEATURES,
                                 dir_path=features_path)
        features.build(include=set(self.idx_train), should_dump=True)

        add_ones = bool(
            {"first_neighbor_histogram",
             "second_neighbor_histogram"}.intersection(NEIGHBOR_FEATURES))
        _topo_mx = features.to_matrix(add_ones=add_ones,
                                      dtype=np.float64,
                                      mtype=np.matrix,
                                      should_zscore=True)

        del data
        # Pad with zero columns up to (rows x rows).
        return sp.csr_matrix(
            np.hstack([
                _topo_mx,
                np.zeros(
                    (_topo_mx.shape[0], _topo_mx.shape[0] - _topo_mx.shape[1]))
            ]))
Exemplo n.º 6
0
    def set_train(self, train_set, features_meta):
        """Build the topological feature matrix for ``train_set`` and rescale it.

        The features in ``features_meta`` are built (and dumped) for the
        given training nodes, converted to a z-scored matrix stored in
        ``self._topo_mx``, and then divided in place by a power of ten
        derived from the ratio between the means of ``self._topo_mx`` and
        ``self._bow_mx``.
        """
        # Already taking the max sub_graph in init
        graph_features = GraphFeatures(
            self._gnx,
            features_meta,
            dir_path=self._paths["features"],
            logger=self._logger,
            is_max_connected=False,
        )
        graph_features.build(include=set(train_set), should_dump=True)

        histogram_names = {"first_neighbor_histogram", "second_neighbor_histogram"}
        has_histograms = bool(histogram_names.intersection(features_meta))
        self._topo_mx = graph_features.to_matrix(
            add_ones=has_histograms,
            dtype=np.float64,
            mtype=np.matrix,
            should_zscore=True,
        )

        # Smallest power of ten that is >= |mean(topo) / mean(bow)|.
        mean_ratio = np.mean(self._topo_mx) / np.mean(self._bow_mx)
        exponent = np.ceil(np.log10(abs(mean_ratio)))
        scale = 10 ** exponent
        self._topo_mx /= scale
Exemplo n.º 7
0
    def split_train(self, train_p, features_meta):
        """Split the training data and rebuild the topological feature matrix.

        ``self._train_set`` and ``self._base_train_idx`` are shuffled and
        split with ``train_test_split`` (validation share ``1 - train_p``);
        the resulting index arrays are stored in ``self._train_idx`` and
        ``self._val_idx``. The features in ``features_meta`` are then built
        (without dumping) for the new training subset, converted to a
        z-scored matrix stored in ``self._topo_mx`` and divided in place by
        a power of ten derived from the ratio between the means of
        ``self._topo_mx`` and ``self._bow_mx``.
        """
        split = train_test_split(
            self._train_set,
            self._base_train_idx,
            test_size=1 - train_p,
            shuffle=True,
        )
        train_set, val_set, self._train_idx, self._val_idx = split

        graph_features = GraphFeatures(
            self._gnx,
            features_meta,
            dir_path=self._features_path(),
            logger=self._logger,
            is_max_connected=self._is_max_connected,
        )
        graph_features.build(include=set(train_set), should_dump=False)

        histogram_names = {"first_neighbor_histogram", "second_neighbor_histogram"}
        has_histograms = bool(histogram_names.intersection(features_meta))
        self._topo_mx = graph_features.to_matrix(
            add_ones=has_histograms,
            dtype=np.float64,
            mtype=np.matrix,
            should_zscore=True,
        )

        # Smallest power of ten that is >= |mean(topo) / mean(bow)|.
        mean_ratio = np.mean(self._topo_mx) / np.mean(self._bow_mx)
        exponent = np.ceil(np.log10(abs(mean_ratio)))
        scale = 10 ** exponent
        self._topo_mx /= scale
Exemplo n.º 8
0
    def _prepare_data(self):
        """Build the stacked input and target arrays for all graphs.

        After splitting the data, every directory returned by
        ``self._get_gnx_paths()`` is processed: the graph pickled in
        ``features_0/gnx.pkl`` is restricted to ``self._nodes_order``, its
        features are built over ``self._train_set`` and converted to a
        z-scored float32 array, and its labels are one-hot encoded. Both
        arrays are pickled next to the graph (``data.pkl`` / ``labels.pkl``)
        and depth-stacked into ``self._inputs`` / ``self._targets``.
        """
        self._split_data()

        self._inputs = self._targets = None
        for path in self._get_gnx_paths():
            feat_path = os.path.join(path, "features_0")
            # Context managers close every handle deterministically; the
            # original leaked one read handle and two write handles per graph.
            with open(os.path.join(feat_path, "gnx.pkl"), "rb") as gnx_file:
                gnx = pickle.load(gnx_file)
            gnx = gnx.subgraph(self._nodes_order)

            features = GraphFeatures(gnx,
                                     self._features_meta,
                                     dir_path=feat_path,
                                     logger=self._logger)
            features.build(include=self._train_set)

            add_ones = bool(
                set(self._features_meta).intersection(
                    ["first_neighbor_histogram", "second_neighbor_histogram"]))
            cur_data = features.to_matrix(add_ones=add_ones,
                                          dtype=np.float32,
                                          mtype=np.array,
                                          should_zscore=True)
            self._inputs = cur_data if self._inputs is None else np.dstack(
                (self._inputs, cur_data))
            with open(os.path.join(feat_path, "data.pkl"), "wb") as data_file:
                pickle.dump(cur_data, data_file)

            cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
            self._targets = cur_labels if self._targets is None else np.dstack(
                (self._targets, cur_labels))
            with open(os.path.join(feat_path, "labels.pkl"), "wb") as labels_file:
                pickle.dump(cur_labels, labels_file)

        # Arranging data as <batch, seq, feature>
        if self._gnx_idx is None:
            self._inputs = self._inputs.transpose((0, 2, 1))
            self._targets = self._targets.transpose((0, 2, 1))
        self._logger.debug("Finished preparing the data")
def calculate_gpu_one(run, level, size, p, directed):
    """Compute the ``level``-node motif feature for one pickled graph run.

    The graph is read from
    ``size<size>_p<p>_directed<directed>_runs/run_<run>/gnx.pkl`` and the
    built feature is dumped to the ``motifs_gpu`` directory next to it,
    which is also where the calculation log is written.

    Args:
        run: Index of the run directory (``run_<run>``).
        level: Motif size passed to ``nth_nodes_motif``.
        size, p, directed: Values used to format the runs directory name.
    """
    from features_infra.graph_features import GraphFeatures
    from features_infra.feature_calculators import FeatureMeta
    from features_algorithms.accelerated_graph_features.motifs import nth_nodes_motif
    from loggers import FileLogger
    feature_meta = {
        "motif" + str(level):
        FeatureMeta(nth_nodes_motif(level, gpu=True, device=3),
                    {"m" + str(level)})
    }
    head_path = os.path.join(
        "size{}_p{}_directed{}_runs".format(size, p, directed),
        "run_" + str(run))
    dump_path = os.path.join(head_path, "motifs_gpu")
    # Close the pickle file deterministically instead of leaking the handle.
    with open(os.path.join(head_path, "gnx.pkl"), "rb") as gnx_file:
        graph = pickle.load(gnx_file)
    logger = FileLogger("CalculationLogger" + str(level),
                        path=dump_path,
                        level=logging.DEBUG)
    raw_feature = GraphFeatures(gnx=graph,
                                features=feature_meta,
                                dir_path=dump_path,
                                logger=logger)
    raw_feature.build(should_dump=True)