def test_main():
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    with open(os.path.join(base_dir, dataset, "gnx.pkl"), "rb") as f:
        gnx = pickle.load(f)
    # Restrict the graph to its largest connected component.
    # (nx.connected_component_subgraphs was removed in NetworkX 2.4, so take
    # the node set of the largest component and build the subgraph from it.)
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)
    # TEST_FEATURES is expected to be defined at module level.
    features = GraphFeatures(gnx, TEST_FEATURES, dir_path="./%s_features_sub" % dataset, logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False, dtype=np.float32, mtype=np.matrix)
    logger.info("Finished")

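# `TEST_FEATURES` above is assumed to be a module-level dict mapping feature
# names to FeatureMeta entries, like the `features_meta` dicts used below.
# A minimal sketch of what it might contain (this exact feature set is an
# assumption, not the original definition):
TEST_FEATURES = {
    "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
    "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
}
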
def create_features(data_name, time_range):
    for i in range(time_range):
        with open(os.path.join("./dataset", data_name, "pkl", "gcn_input", "graph_%s.pkl" % i), "rb") as f:
            gnx = pickle.load(f)
        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
        }
        features = GraphFeatures(gnx, features_meta, os.path.join("./dataset", data_name, "pkl", "feature"),
                                 logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        with open(os.path.join("./dataset", data_name, "pkl", "gcn_input", "mx_%s.pkl" % i), "wb") as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)

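# A minimal sanity check for the matrices dumped above, assuming
# create_features() has already been run for `data_name` (this helper is
# illustrative, not part of the original pipeline):
def inspect_dumped_matrix(data_name, i=0):
    with open(os.path.join("./dataset", data_name, "pkl", "gcn_input", "mx_%s.pkl" % i), "rb") as f:
        mx = pickle.load(f)
    print(mx.shape)  # rows are nodes, columns are the concatenated feature values
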
def create_features():
    for i in range(21):
        with open(os.path.join('graphs_by_years', 'graph_' + str(i) + '.pkl'), 'rb') as f:
            gnx = pickle.load(f)
        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
        }
        features = GraphFeatures(gnx, features_meta, "/home/dsi/racheli/graph_calculations", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        with open(os.path.join('graphs_by_years', 'mx_' + str(i) + '.pkl'), 'wb') as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)

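# The loop above hard-codes 21 yearly snapshots. A sketch of deriving the
# count from the files actually present instead (the glob pattern assumes
# the graph_<i>.pkl naming used above):
def count_year_graphs():
    import glob
    return len(glob.glob(os.path.join('graphs_by_years', 'graph_*.pkl')))
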
def __get_features(self):
    with open(self.dirname + '/data/' + self.DATASET + '.pickle', 'rb') as f:
        data = pkl.load(f)
    self._original_adj_matrices = data['A']
    y = data['y']
    # A node's label is the argmax of its (one-hot) label row; an all-zero
    # row means the node is unlabeled and gets -1.
    node_labels = []
    for a in y.todense():
        if a.max() != 0:
            node_labels.append(a.argmax())
        else:
            node_labels.append(-1)
    sum_adj = AsymmetricRGCNWithNeighborHistograms.__sum_sparse(self.A)
    gnx = nx.from_scipy_sparse_matrix(sum_adj, parallel_edges=True)
    gnx = nx.DiGraph(gnx, labels=node_labels)
    for n, label in zip(gnx.nodes, node_labels):
        gnx.nodes[n]['label'] = label  # gnx.node[...] was removed in NetworkX 2.4
    real_labels = list(set(node_labels) - {-1})

    # Get the features for the graph
    NEIGHBOR_FEATURES = {
        "first_neighbor_histogram": FeatureMeta(
            nth_neighbor_calculator(1, labels_to_consider=real_labels), {"fnh", "first_neighbor"}),
        "second_neighbor_histogram": FeatureMeta(
            nth_neighbor_calculator(2, labels_to_consider=real_labels), {"snh", "second_neighbor"}),
    }
    features_path = os.path.join(os.path.abspath('../features'), self.DATASET)
    features = GraphFeatures(gnx, NEIGHBOR_FEATURES, dir_path=features_path)
    features.build(include=set(self.idx_train), should_dump=True)
    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(NEIGHBOR_FEATURES))
    _topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
    del data
    # Zero-pad the feature matrix on the right into a square (n x n) block.
    return sp.csr_matrix(np.hstack([
        _topo_mx,
        np.zeros((_topo_mx.shape[0], _topo_mx.shape[0] - _topo_mx.shape[1])),
    ]))

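# The return value above right-pads the (n x d) topological feature matrix
# with zeros into a square (n x n) block before converting to CSR (this only
# works when n >= d). A toy sketch of that shape arithmetic, with
# illustrative names and sizes only:
def pad_to_square_example():
    topo = np.arange(12, dtype=np.float64).reshape(4, 3)  # n=4 nodes, d=3 features
    padded = np.hstack([topo, np.zeros((topo.shape[0], topo.shape[0] - topo.shape[1]))])
    assert padded.shape == (4, 4)
    return sp.csr_matrix(padded)
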
def set_train(self, train_set, features_meta):
    features = GraphFeatures(self._gnx, features_meta, dir_path=self._paths["features"],
                             logger=self._logger,
                             is_max_connected=False)  # Already taking the max sub_graph in init
    features.build(include=set(train_set), should_dump=True)
    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
    self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
    # Rescale the topological features to the order of magnitude of the BoW matrix.
    ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
    self._topo_mx /= ratio

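# The `ratio` above brings the topological features to the same order of
# magnitude as the bag-of-words matrix. A worked toy example of the
# power-of-ten computation (the means are made up):
def ratio_example():
    topo_mean, bow_mean = 250.0, 0.05
    ratio = 10 ** np.ceil(np.log10(abs(topo_mean / bow_mean)))
    # |250 / 0.05| = 5000, log10(5000) ~= 3.7, ceil -> 4, so ratio = 10**4
    assert ratio == 10 ** 4
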
def split_train(self, train_p, features_meta):
    train_set, val_set, self._train_idx, self._val_idx = train_test_split(
        self._train_set, self._base_train_idx, test_size=1 - train_p, shuffle=True)
    features_path = self._features_path()
    features = GraphFeatures(self._gnx, features_meta, dir_path=features_path,
                             logger=self._logger, is_max_connected=self._is_max_connected)
    features.build(include=set(train_set), should_dump=False)
    add_ones = bool({"first_neighbor_histogram", "second_neighbor_histogram"}.intersection(features_meta))
    self._topo_mx = features.to_matrix(add_ones=add_ones, dtype=np.float64, mtype=np.matrix, should_zscore=True)
    ratio = 10 ** np.ceil(np.log10(abs(np.mean(self._topo_mx) / np.mean(self._bow_mx))))
    self._topo_mx /= ratio

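# split_train() relies on sklearn's train_test_split keeping several arrays
# aligned when they are split together. A minimal sketch with toy node ids
# and indices (illustrative data only):
def split_example():
    from sklearn.model_selection import train_test_split
    nodes = ["n0", "n1", "n2", "n3", "n4", "n5"]
    idx = [0, 1, 2, 3, 4, 5]
    train_nodes, val_nodes, train_idx, val_idx = train_test_split(nodes, idx, test_size=1 - 0.7, shuffle=True)
    # Each returned pair stays aligned: train_nodes[k] corresponds to train_idx[k].
    assert len(train_nodes) == len(train_idx) and len(val_nodes) == len(val_idx)
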
def _prepare_data(self):
    self._split_data()
    self._inputs = self._targets = None
    for path in self._get_gnx_paths():
        feat_path = os.path.join(path, "features_0")
        with open(os.path.join(feat_path, "gnx.pkl"), "rb") as f:
            gnx = pickle.load(f)
        gnx = gnx.subgraph(self._nodes_order)
        features = GraphFeatures(gnx, self._features_meta, dir_path=feat_path, logger=self._logger)
        features.build(include=self._train_set)
        add_ones = bool(set(self._features_meta).intersection(
            ["first_neighbor_histogram", "second_neighbor_histogram"]))
        cur_data = features.to_matrix(add_ones=add_ones, dtype=np.float32, mtype=np.array, should_zscore=True)
        self._inputs = cur_data if self._inputs is None else np.dstack((self._inputs, cur_data))
        with open(os.path.join(feat_path, "data.pkl"), "wb") as f:
            pickle.dump(cur_data, f)
        cur_labels = self._encode_onehot_gnx(gnx, self._nodes_order)
        self._targets = cur_labels if self._targets is None else np.dstack((self._targets, cur_labels))
        with open(os.path.join(feat_path, "labels.pkl"), "wb") as f:
            pickle.dump(cur_labels, f)
    # Arranging data as <batch, seq, feature>
    if self._gnx_idx is None:
        self._inputs = self._inputs.transpose((0, 2, 1))
        self._targets = self._targets.transpose((0, 2, 1))
    self._logger.debug("Finished preparing the data")

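# _prepare_data() stacks one (nodes x features) matrix per graph snapshot
# along a third axis and then reorders the axes to <batch, seq, feature>.
# A toy sketch of the resulting shapes (sizes are made up):
def stacking_example():
    t0 = np.zeros((5, 3))                    # 5 nodes, 3 features at time 0
    t1 = np.ones((5, 3))                     # the same nodes at time 1
    stacked = np.dstack((t0, t1))            # (5, 3, 2): <node, feature, time>
    arranged = stacked.transpose((0, 2, 1))  # (5, 2, 3): <node, time, feature>
    assert arranged.shape == (5, 2, 3)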