Exemplo n.º 1
0
    def _gnx_vec(self, gnx_id, gnx: nx.Graph, node_order):
        """Assemble the feature matrix of one graph from the enabled sources.

        Blocks are appended in a fixed order (degree, in-degree, out-degree,
        external value features, computed graph features) and concatenated
        horizontally.

        :param gnx_id: graph identifier; passed to the external-data lookup
            and, as ``str(gnx_id)``, used as the feature dump directory name.
        :param gnx: graph whose nodes are described.
        :param node_order: re-iterable collection of nodes that fixes the row
            order of the degree and external-data blocks.
        :return: ``np.hstack`` of every enabled block.
        """
        def log_degree_column(degree_view):
            # Column of log(degree + 1e-3); the offset keeps the logarithm
            # finite for nodes whose degree is zero.
            return np.matrix(
                [np.log(degree_view[node] + 1e-3) for node in node_order]).T

        final_vec = []
        if self._deg:
            final_vec.append(log_degree_column(gnx.degree(gnx.nodes)))
        if self._in_deg:
            final_vec.append(log_degree_column(gnx.in_degree(gnx.nodes)))
        if self._out_deg:
            final_vec.append(log_degree_column(gnx.out_degree(gnx.nodes)))
        if self._is_external_data and self._external_data.is_value:
            # NOTE(review): this block is not transposed, unlike the degree
            # columns - presumably value_feature returns one row per node;
            # confirm the shapes line up for np.hstack.
            final_vec.append(
                np.matrix([
                    self._external_data.value_feature(gnx_id, node)
                    for node in node_order
                ]))
        if self._is_ftr:
            gnx_dir_path = os.path.join(self._ftr_path, str(gnx_id))
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(gnx_dir_path, exist_ok=True)
            raw_ftr = GraphFeatures(gnx,
                                    self._ftr_meta,
                                    dir_path=gnx_dir_path,
                                    is_max_connected=False,
                                    logger=PrintLogger("logger"))
            raw_ftr.build(should_dump=True)  # compute and dump the features
            final_vec.append(
                FeaturesProcessor(raw_ftr).as_matrix(norm_func=log_norm))

        return np.hstack(final_vec)
 def _calc_motif3(self, gpu, device):
     """Return the 3-node motif features of ``self._graph`` keyed by vertex.

     If ``self._dir_path`` is non-empty and contains ``motif3.pkl`` the
     cached pickle is returned (converted to a dict when it is a list or an
     object exposing ``_features``). Otherwise the motifs are computed with
     ``GraphFeatures`` and dumped only when ``self._dir_path`` is non-empty.

     :param gpu: forwarded to ``nth_nodes_motif``.
     :param device: forwarded to ``nth_nodes_motif``.
     :return: dict mapping vertex to its motif-3 feature entry.
     """
     if self._dir_path != "":
         pkl_path = os.path.join(self._dir_path, "motif3.pkl")
         if os.path.exists(pkl_path):
             # NOTE: pickle.load must only be used on trusted cache files.
             with open(pkl_path, "rb") as pkl_file:
                 pkl3 = pickle.load(pkl_file)
             if isinstance(pkl3, dict):
                 return pkl3
             if not isinstance(pkl3, list):
                 # Anything else is expected to carry the data in _features.
                 pkl3 = pkl3._features
             return {v: pkl3[v] for v in range(len(pkl3))}

     # The graph is used as-is when its largest node label equals
     # (number of nodes - 1); otherwise it is relabeled first.
     if sorted(self._graph.nodes())[-1] == len(self._graph) - 1:
         graph = self._graph
         vertices_dict = {v: v for v in self._graph.nodes()}
     else:
         graph, vertices_dict = self._relabel_graph()

     raw_ftr = GraphFeatures(
         graph,
         {
             "motif3":
             FeatureMeta(nth_nodes_motif(3, gpu=gpu, device=device), {"m3"})
         },
         dir_path=self._dir_path)
     raw_ftr.build(should_dump=self._dir_path != "")
     motif3 = raw_ftr['motif3']._features
     # Map the (possibly relabeled) indices back through vertices_dict.
     return {
         vertices_dict[v]: motif3[v]
         for v in range(len(vertices_dict))
     }
Exemplo n.º 3
0
 def _execute_for_3(self, motifs_picked):
     """Fill ``self._motif_mat`` with the chosen 3-node motif columns.

     When ``self._params["load_motifs"]`` is truthy or ``motif3.pkl`` exists
     in ``self._dir_path`` the motifs are read from that pickle; otherwise
     they are computed with ``GraphFeatures`` and dumped.

     :param motifs_picked: column indexer applied to the motif matrix.
     """
     pkl_path = os.path.join(self._dir_path, 'motif3.pkl')
     if self._params["load_motifs"] or os.path.exists(pkl_path):
         # Use a context manager so the pickle file handle is always closed.
         with open(pkl_path, "rb") as pkl_file:
             pkl3 = pickle.load(pkl_file)
         try:
             m3 = pkl3._features
             if isinstance(m3, dict):
                 # NOTE(review): this branch calls _to_matrix_ while the
                 # fallback calls _to_matrix; neither is visible here -
                 # confirm both helpers exist. An AttributeError raised
                 # inside this call is also routed to the fallback below.
                 motif3 = self._to_matrix_(m3)
             else:
                 motif3 = np.array(m3)
         except AttributeError:
             # The pickle has no _features attribute: treat it as raw data.
             if isinstance(pkl3, dict):
                 motif3 = self._to_matrix(pkl3)
             else:
                 motif3 = np.array(pkl3)
         self._motif_mat = motif3
         self._motif_mat = self._motif_mat[:, motifs_picked]
         print(str(datetime.datetime.now()) + " , Calculated motifs")
         return
     motif_features = {"motif3": self._motif_features["motif3"]}
     g_ftrs = GraphFeatures(self._graph,
                            motif_features,
                            dir_path=self._dir_path)
     g_ftrs.build(should_dump=True)
     print(str(datetime.datetime.now()) + " , Calculated motifs")
     self._motif_mat = np.asarray(g_ftrs['motif3']._features)
     self._motif_mat = self._motif_mat[:, motifs_picked]
Exemplo n.º 4
0
    def _set_index_to_ftr(self):
        """Build (once) and return the ordered ``(feature, index)`` list.

        Features are built for the first graph of ``self._temporal_graph``.
        If ``self._index_ftr`` is still empty it is filled, in sorted
        feature-name order, with one entry per component of every relevant
        feature; motif features get their entries from
        ``self._get_motif_type``.

        :return: ``self._index_ftr``.
        """
        first_name = next(self._temporal_graph.graph_names())
        first_graph = next(self._temporal_graph.graphs())
        database_name = "_".join((self._data_name,
                                  str(self._params.max_connected),
                                  str(self._params.directed)))
        features_dir = os.path.join(self._base_dir, "pkl", "features",
                                    database_name, first_name)
        gnx_ftr = GraphFeatures(first_graph,
                                self._params.features,
                                dir_path=features_dir,
                                logger=self._logger,
                                is_max_connected=self._params.max_connected)
        # Features are built in memory only (nothing is dumped).
        gnx_ftr.build(should_dump=False,
                      force_build=self._params.FORCE_REBUILD_FEATURES)

        if self._index_ftr:
            return self._index_ftr

        # Sorting the names fixes the feature order.
        relevant_names = [
            name for name in sorted(gnx_ftr) if gnx_ftr[name].is_relevant()
        ]
        self._index_ftr = []
        for ftr in relevant_names:
            ftr_size = len(gnx_ftr[ftr])
            if ftr in ('motif3', 'motif4'):
                self._index_ftr += self._get_motif_type(ftr, ftr_size)
            else:
                self._index_ftr += [(ftr, i) for i in range(ftr_size)]
        return self._index_ftr
Exemplo n.º 5
0
    def _calc_vec(self):
        """Return ``{graph_name: motif-ratio vector}`` for every graph.

        The result is cached under ``<base_dir>/pkl/vectors``; when the
        cache file exists it is loaded and returned. Otherwise features are
        built (and dumped) per graph under ``<base_dir>/pkl/features``, the
        vectors are computed, pickled to the cache file and returned.

        :return: dict mapping graph name to its motif ratio vector.
        """
        params = self._params
        database_name = params.database.DATABASE_NAME + "_" + \
            str(params.max_connected) + "_" + str(params.directed)
        vec_pkl_path = os.path.join(
            self._base_dir, "pkl", "vectors",
            database_name + "_vectors_log_" + str(params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            # Context manager closes the handle that was previously leaked.
            with open(vec_pkl_path, "rb") as pkl_file:
                return pickle.load(pkl_file)

        # Directory for this database; exist_ok replaces the race-prone
        # "name not in os.listdir(...)" check.
        database_pkl_dir = os.path.join(self._base_dir, "pkl", "features",
                                        database_name)
        os.makedirs(database_pkl_dir, exist_ok=True)

        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # Directory for the features of this specific graph.
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            os.makedirs(gnx_path, exist_ok=True)

            gnx_ftr = GraphFeatures(gnx, params.features, dir_path=gnx_path,
                                    logger=self._logger,
                                    is_max_connected=params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=params.FORCE_REBUILD_FEATURES)
            # Motif ratio vector, log-normalized only when requested.
            processor = FeaturesProcessor(gnx_ftr)
            if params.log:
                gnx_to_vec[gnx_name] = processor.activate_motif_ratio_vec(
                    norm_func=log_norm)
            else:
                gnx_to_vec[gnx_name] = processor.activate_motif_ratio_vec()

        os.makedirs(os.path.dirname(vec_pkl_path), exist_ok=True)
        with open(vec_pkl_path, "wb") as pkl_file:
            pickle.dump(gnx_to_vec, pkl_file)
        return gnx_to_vec
Exemplo n.º 6
0
 def build_features(self):
     """Build the chosen features of ``self._gnx`` and store them as a matrix.

     Features are dumped under ``<self._data_dir>/features``; the resulting
     float32 ``np.matrix`` is stored in ``self._features_mx`` and its shape
     is printed.
     """
     features_dir = os.path.join(self._data_dir, "features")
     gnx_ftr = GraphFeatures(self._gnx, CHOSEN_FEATURES,
                             dir_path=features_dir, logger=self._logger)
     gnx_ftr.build(should_dump=True)
     self._features_mx = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix)
     print(self._features_mx.shape)
 def _calc_betweenness(self):
     """Return the log-normalized betweenness feature as a one-column array.

     The betweenness feature of ``self._graph`` is built (and dumped), laid
     out in an ``(n, 1)`` array indexed by the feature's keys, and passed
     through ``self._log_norm``.
     """
     feature_meta = {
         "betweenness":
         FeatureMeta(BetweennessCentralityCalculator, {"betweenness"})
     }
     raw_ftr = GraphFeatures(self._graph, feature_meta,
                             dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     scores = raw_ftr["betweenness"]._features
     column = np.zeros((len(scores), 1))
     # Keys are used directly as row indices of the output array.
     for vertex, score in scores.items():
         column[vertex] = score
     return self._log_norm(column)
 def build_features_problem_ab(self, force_rebuild=False, largest_cc=False):
     """Build and store the feature matrix of the '20-Apr-2001' subgraph.

     Does nothing when ``self._features_matrix_dict`` already has entries,
     unless ``force_rebuild`` is set.

     :param force_rebuild: rebuild even if features are already stored.
     :param largest_cc: forwarded to ``GraphFeatures`` as
         ``is_max_connected``.
     """
     if self._features_matrix_dict and not force_rebuild:
         return
     gnx_name = '20-Apr-2001'
     self._logger.debug("calculating features for " + gnx_name)
     gnx_path = os.path.join(self._pkl_dir, gnx_name)
     # exist_ok replaces the "name not in os.listdir(...)" check, which
     # listed the whole directory and raced with concurrent creation.
     os.makedirs(gnx_path, exist_ok=True)
     gnx = self.subgraph_by_name(gnx_name)
     gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                             logger=self._logger, is_max_connected=largest_cc)
     gnx_ftr.build(should_dump=True)
     self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(
         dtype=np.float32, mtype=np.matrix)
Exemplo n.º 9
0
 def build_features(self, largest_cc=False, should_zscore=True):
     """Rebuild the features of every changed community.

     For each name in ``self._changed_communities`` the matching subgraph's
     features are force-built (not dumped) and the pair
     ``(subgraph, GraphFeatures)`` is stored in
     ``self._features_matrix_dict``. The changed list is reset afterwards.

     :param largest_cc: forwarded to ``GraphFeatures`` as
         ``is_max_connected``.
     :param should_zscore: accepted for interface compatibility; not used
         by this method.
     """
     for community in self._changed_communities:
         self._logger.debug("calculating features for " + community)
         gnx_path = os.path.join(self._pkl_dir, community)
         # exist_ok replaces the race-prone os.listdir membership check.
         os.makedirs(gnx_path, exist_ok=True)
         gnx = self.subgraph_by_name(community)
         gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                                 logger=self._logger,
                                 is_max_connected=largest_cc)
         gnx_ftr.build(should_dump=False, force_build=True)
         self._features_matrix_dict[community] = (gnx, gnx_ftr)
     self._changed_communities = []
 def _calc_bfs(self):
     """Return the log-normalized BFS-moments feature matrix.

     The ``bfs_moments`` feature of ``self._graph`` is built (and dumped).
     For every key, the values found in the first element of its entry are
     written to the row indexed by that key; the column count is taken from
     the first entry.
     """
     feature_meta = {"bfs_moments": FeatureMeta(BfsMomentsCalculator, {"bfs"})}
     raw_ftr = GraphFeatures(self._graph, feature_meta,
                             dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     moments = raw_ftr["bfs_moments"]._features
     n_rows = len(moments)
     n_cols = len(list(moments.values())[0][0])
     feature_mx = np.zeros((n_rows, n_cols))
     for vertex in moments.keys():
         for col, value in enumerate(moments[vertex][0]):
             feature_mx[vertex, col] = value
     return self._log_norm(feature_mx)
    def collect_train_and_test_data_graph_features():
        """Collect graph features for the train and test datasets.

        Reads both datasets through ``GraphFeatures``, prints the shape of
        each feature matrix and wraps each in a DataFrame with the columns
        ``out_degree``, ``in_degree``, ``avg_neig_deg`` plus an int64
        ``Article`` id column.

        :return: tuple ``(X_train_df, X_test_df)``.
        """
        feature_columns = ['out_degree', 'in_degree', 'avg_neig_deg']

        def as_frame(matrix, article_ids):
            # Feature columns plus the article id cast to int64.
            frame = pd.DataFrame(data=matrix, columns=feature_columns)
            frame['Article'] = article_ids
            frame['Article'] = frame['Article'].astype('int64')
            return frame

        extractor = GraphFeatures()
        extractor.read_train_data()
        extractor.read_test_data()

        X_train = extractor.create_features_matrix("train")
        X_test = extractor.create_features_matrix("test")

        print("\nTrain matrix dimensionality: ", X_train.shape)
        print("Test matrix dimensionality: ", X_test.shape)

        X_train_df = as_frame(X_train, extractor.train_ids)
        X_test_df = as_frame(X_test, extractor.test_ids)
        return X_train_df, X_test_df
Exemplo n.º 12
0
 def _calc_motif3(self):
     """Return the log-normalized 3-node motif matrix of ``self._graph``.

     The feature is dumped according to ``self._dump``. A dict result is
     converted with ``self._to_matrix``; anything else is used as-is.
     """
     motif_calculator = nth_nodes_motif(3, gpu=self._gpu,
                                        device=self._device)
     feature_meta = {"motif3": FeatureMeta(motif_calculator, {"m3"})}
     raw_ftr = GraphFeatures(self._graph, feature_meta,
                             dir_path=self._dir_path)
     raw_ftr.build(should_dump=self._dump)
     feature = raw_ftr['motif3']._features
     motif_matrix = (self._to_matrix(feature)
                     if type(feature) is dict else feature)
     return self._log_norm(motif_matrix)
 def build_features(self, pick_ftr=False, force_rebuild=False, largest_cc=False, should_zscore=True):
     """Build and store the feature matrix of every graph in ``self._list_id``.

     Returns immediately when ``self._features_matrix_dict`` already has
     entries and neither ``force_rebuild`` nor ``pick_ftr`` is set.

     :param pick_ftr: when truthy, rebuild even if matrices are stored.
     :param force_rebuild: rebuild even if matrices are stored; also
         forwarded to ``GraphFeatures.build`` as ``force_build``.
     :param largest_cc: forwarded to ``GraphFeatures`` as
         ``is_max_connected``.
     :param should_zscore: forwarded to ``to_matrix``.
     """
     if self._features_matrix_dict and not force_rebuild and not pick_ftr:
         return
     for gnx_name in self._list_id:
         self._logger.debug("calculating features for " + gnx_name)
         gnx_path = os.path.join(self._pkl_dir, gnx_name)
         # exist_ok replaces the race-prone os.listdir membership check.
         os.makedirs(gnx_path, exist_ok=True)
         gnx = self.subgraph_by_name(gnx_name)
         gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                                 logger=self._logger,
                                 is_max_connected=largest_cc)
         gnx_ftr.build(should_dump=True, force_build=force_rebuild)
         self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(
             dtype=np.float32, mtype=np.matrix, should_zscore=should_zscore)
 def _calc_motif4(self):
     """Return the 4-node motif matrix, from cache or freshly computed.

     If ``motif4.pkl`` exists in ``self._dir_path`` it is loaded and
     converted to a matrix; otherwise the motifs are computed with
     ``GraphFeatures``, dumped and passed through ``self._log_norm``.
     When ``self._motif_choice`` is ``"All_Motifs"`` only the columns
     derived from ``MotifProbability.get_3_clique_motifs`` are returned.
     """
     def select_columns(matrix):
         # Restrict to the clique motif columns only for "All_Motifs".
         if self._motif_choice != "All_Motifs":
             return matrix
         mp = MotifProbability(self._params['vertices'],
                               self._params['probability'],
                               self._params['clique_size'],
                               self._params['directed'])
         # The full 3 clique is the last motif 3.
         motif3_count = 1 + mp.get_3_clique_motifs(3)[-1]
         # Shift the 4-motif indices down by motif3_count - presumably they
         # are numbered after the 3-motifs; confirm with MotifProbability.
         clique_motifs = [
             m - motif3_count for m in mp.get_3_clique_motifs(4)
         ]
         return matrix[:, clique_motifs]

     # FOR NOW, NO GPU FOR US
     pkl_path = os.path.join(self._dir_path, "motif4.pkl")
     if os.path.exists(pkl_path):
         # Context manager closes the handle that was previously leaked.
         with open(pkl_path, "rb") as pkl_file:
             pkl4 = pickle.load(pkl_file)
         if isinstance(pkl4, dict):
             motif4 = self._to_matrix(pkl4)
         elif isinstance(pkl4, MotifsNodeCalculator):
             motif4 = np.array(pkl4._features)
         else:
             motif4 = np.array(pkl4)
         # NOTE(review): the cached matrix is returned without _log_norm,
         # unlike the freshly computed one below - confirm this is intended.
         return select_columns(motif4)

     raw_ftr = GraphFeatures(
         self._graph,
         {
             "motif4":
             FeatureMeta(
                 nth_nodes_motif(4, gpu=self._gpu, device=self._device),
                 {"m4"})
         },
         dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     feature = raw_ftr['motif4']._features
     if isinstance(feature, dict):
         motif_matrix = self._to_matrix(feature)
     else:
         motif_matrix = feature
     return select_columns(self._log_norm(motif_matrix))
Exemplo n.º 15
0
 def _calc_bfs(self):
     """Return the log-normalized BFS-moments feature matrix (no dumping).

     A list result is converted directly with ``np.array``. Otherwise, for
     every key, the values in the first element of its entry are written to
     the row indexed by that key.
     """
     feature_meta = {"bfs_moments": FeatureMeta(BfsMomentsCalculator, {"bfs"})}
     raw_ftr = GraphFeatures(self._graph, feature_meta, dir_path="")
     raw_ftr.build(should_dump=False)
     feat = raw_ftr["bfs_moments"]._features
     if type(feat) is list:
         return self._log_norm(np.array(feat))

     n_rows = len(feat)
     n_cols = len(list(feat.values())[0][0])
     feature_mx = np.zeros((n_rows, n_cols))
     for vertex in feat.keys():
         for col, value in enumerate(feat[vertex][0]):
             feature_mx[vertex, col] = value
     return self._log_norm(feature_mx)
Exemplo n.º 16
0
 def _calc_motif4(self):
     """Return the log-normalized 4-node motif matrix (nothing is dumped).

     A dict result is converted with ``self._to_matrix``; anything else is
     used as-is before ``self._log_norm`` is applied.
     """
     motif_calculator = nth_nodes_motif(4, gpu=self._gpu,
                                        device=self._device)
     feature_meta = {"motif4": FeatureMeta(motif_calculator, {"m4"})}
     raw_ftr = GraphFeatures(self._graph, feature_meta, dir_path="")
     raw_ftr.build(should_dump=False)
     feature = raw_ftr['motif4']._features
     motif_matrix = (self._to_matrix(feature)
                     if type(feature) is dict else feature)
     return self._log_norm(motif_matrix)
    def _calc_features(self, pkl=True):
        """Fill ``_features_by_time`` and ``_multi_graphs_by_time``.

        With ``pkl`` set and a cache file present under
        ``<base_dir>/pkl/ftr_by_time_dictionaries`` both attributes are
        loaded from it. Otherwise the database is loaded, one
        ``{graph_name: (vector, label)}`` dict is appended per time window,
        and the pair of lists is pickled to the cache file.

        :param pkl: whether an existing cache file may be used.
        """
        ftr_dict_dir = os.path.join(self._base_dir, 'pkl',
                                    'ftr_by_time_dictionaries')
        # Load the cached dictionaries if they exist.
        if pkl:
            cached_path = os.path.join(ftr_dict_dir, self._ftr_pkl_name())
            if os.path.isfile(cached_path):
                # Context manager closes the handle that was leaked before.
                with open(cached_path, "rb") as pkl_file:
                    self._features_by_time, self._multi_graphs_by_time = \
                        pickle.load(pkl_file)
                return

        self._load_database()
        labels = self._database.labels
        # Directory for this database (created if missing).
        dir_path = os.path.join(self._base_dir, 'pkl', 'graph_measures',
                                self._params['database_full_name'])
        os.makedirs(dir_path, exist_ok=True)

        # Calculate the features window by window.
        for multi_graph in self._database.multi_graph_by_window(
                self._params['window_size'], self._params['start_time']):
            ftr_tmp_dict = {}
            for name in multi_graph.graph_names():
                raw_ftr = GraphFeatures(
                    multi_graph.get_gnx(name), NODE_FEATURES_ML, dir_path,
                    is_max_connected=self._params['max_connected'],
                    logger=PrintLogger(self._params['database_full_name']))
                # Node and edge counts are appended to the ratio vector.
                nodes_and_edges = [multi_graph.node_count(graph_id=name),
                                   multi_graph.edge_count(graph_id=name)]
                ftr_tmp_dict[name] = (
                    FeaturesProcessor(raw_ftr).activate_motif_ratio_vec(
                        to_add=nodes_and_edges),
                    labels[name])
            self._features_by_time.append(ftr_tmp_dict)

            multi_graph.suspend_logger()
            self._multi_graphs_by_time.append(multi_graph)

        os.makedirs(ftr_dict_dir, exist_ok=True)
        dump_path = os.path.join(ftr_dict_dir, self._ftr_pkl_name())
        with open(dump_path, "wb") as pkl_file:
            pickle.dump((self._features_by_time, self._multi_graphs_by_time),
                        pkl_file)
 def _calc_motif3(self):
     """Return the log-normalized 3-node motif matrix of ``self._graph``.

     The feature is built and dumped. When ``self._motif_choice`` is
     ``"All_Motifs"`` only the columns given by
     ``MotifProbability.get_3_clique_motifs(3)`` are returned.
     """
     motif_calculator = nth_nodes_motif(3, gpu=self._gpu,
                                        device=self._device)
     raw_ftr = GraphFeatures(self._graph,
                             {"motif3": FeatureMeta(motif_calculator, {"m3"})},
                             dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     feature = raw_ftr['motif3']._features
     # Dict features are converted to a matrix; others are used as-is.
     if type(feature) is dict:
         feature = self._to_matrix(feature)
     normed_matrix = self._log_norm(feature)
     if self._motif_choice != "All_Motifs":
         return normed_matrix
     mp = MotifProbability(self._params['vertices'],
                           self._params['probability'],
                           self._params['subgraph_size'],
                           self._params['directed'])
     clique_motifs = mp.get_3_clique_motifs(3)
     return normed_matrix[:, clique_motifs]
Exemplo n.º 19
0
    def _execute_for_4(self, motifs_picked):
        """Fill ``self._motif_mat`` with 3- and 4-node motif columns.

        When ``self._params["load_motifs"]`` is truthy or ``motif4.pkl``
        exists in ``self._dir_path``, both ``motif3.pkl`` and ``motif4.pkl``
        are loaded from that directory; otherwise the motifs are computed
        with ``GraphFeatures`` and dumped. The two matrices are stacked
        horizontally (motif3 first).

        :param motifs_picked: column indexer applied to the stacked matrix,
            or ``None`` to keep every column.
        """
        def load_pickle(file_name):
            # Context manager closes the handle that was leaked before.
            with open(os.path.join(self._dir_path, file_name), "rb") as fh:
                return pickle.load(fh)

        def as_matrix(pkl):
            # Objects exposing _features are unwrapped; raw data is used
            # directly. NOTE(review): the wrapped branch calls _to_matrix_
            # and the raw branch _to_matrix - confirm both helpers exist.
            # An AttributeError raised inside the try also lands in the
            # raw-data fallback.
            try:
                features = pkl._features
                if isinstance(features, dict):
                    return self._to_matrix_(features)
                return np.array(features)
            except AttributeError:
                if isinstance(pkl, dict):
                    return self._to_matrix(pkl)
                return np.array(pkl)

        if self._params["load_motifs"] or os.path.exists(
                os.path.join(self._dir_path, 'motif4.pkl')):
            # Only motif4.pkl is checked above; motif3.pkl must exist too.
            pkl3 = load_pickle("motif3.pkl")
            pkl4 = load_pickle("motif4.pkl")
            motif3 = as_matrix(pkl3)
            motif4 = as_matrix(pkl4)
            self._motif_mat = np.hstack((motif3, motif4))
            if motifs_picked is not None:
                self._motif_mat = self._motif_mat[:, motifs_picked]
            print(str(datetime.datetime.now()) + " , Calculated motifs")
            return
        g_ftrs = GraphFeatures(self._graph,
                               self._motif_features,
                               dir_path=self._dir_path)
        g_ftrs.build(should_dump=True)
        print(str(datetime.datetime.now()) + " , Calculated motifs")
        self._motif_mat = np.hstack((np.asarray(g_ftrs['motif3']._features),
                                     np.asarray(g_ftrs['motif4']._features)))
        if motifs_picked is not None:
            self._motif_mat = self._motif_mat[:, motifs_picked]
 def _calc_motif4(self):
     """Return the log-normalized 4-node motif matrix of ``self._graph``.

     The feature is built and dumped. When ``self._motif_choice`` is
     ``"All_Motifs"`` only the columns derived from
     ``MotifProbability.get_3_clique_motifs`` are returned.
     """
     motif_calculator = nth_nodes_motif(4, gpu=self._gpu,
                                        device=self._device)
     raw_ftr = GraphFeatures(self._graph,
                             {"motif4": FeatureMeta(motif_calculator, {"m4"})},
                             dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     feature = raw_ftr['motif4']._features
     # Dict features are converted to a matrix; others are used as-is.
     if type(feature) is dict:
         feature = self._to_matrix(feature)
     normed_matrix = self._log_norm(feature)
     if self._motif_choice != "All_Motifs":
         return normed_matrix
     mp = MotifProbability(self._params['vertices'],
                           self._params['probability'],
                           self._params['subgraph_size'],
                           self._params['directed'])
     # The full 3 clique is the last motif 3.
     motif3_count = 1 + mp.get_3_clique_motifs(3)[-1]
     clique_motifs = [m - motif3_count for m in mp.get_3_clique_motifs(4)]
     return normed_matrix[:, clique_motifs]
 def _calc_motif3(self):
     """Return the 3-node motif matrix, from cache or freshly computed.

     If ``motif3.pkl`` exists in ``self._dir_path`` it is loaded and
     converted to a matrix; otherwise the motifs are computed with
     ``GraphFeatures``, dumped and passed through ``self._log_norm``.
     When ``self._motif_choice`` is ``"All_Motifs"`` only the columns given
     by ``MotifProbability.get_3_clique_motifs(3)`` are returned.
     """
     def select_columns(matrix):
         # Restrict to the clique motif columns only for "All_Motifs".
         if self._motif_choice != "All_Motifs":
             return matrix
         mp = MotifProbability(self._params['vertices'],
                               self._params['probability'],
                               self._params['clique_size'],
                               self._params['directed'])
         clique_motifs = mp.get_3_clique_motifs(3)
         return matrix[:, clique_motifs]

     # FOR NOW, NO GPU FOR US
     pkl_path = os.path.join(self._dir_path, "motif3.pkl")
     if os.path.exists(pkl_path):
         # Context manager closes the handle that was previously leaked.
         with open(pkl_path, "rb") as pkl_file:
             pkl3 = pickle.load(pkl_file)
         if isinstance(pkl3, dict):
             motif3 = self._to_matrix(pkl3)
         elif isinstance(pkl3, MotifsNodeCalculator):
             motif3 = np.array(pkl3._features)
         else:
             motif3 = np.array(pkl3)
         # NOTE(review): the cached matrix is returned without _log_norm,
         # unlike the freshly computed one below - confirm this is intended.
         return select_columns(motif3)

     raw_ftr = GraphFeatures(
         self._graph,
         {
             "motif3":
             FeatureMeta(
                 nth_nodes_motif(3, gpu=self._gpu, device=self._device),
                 {"m3"})
         },
         dir_path=self._dir_path)
     raw_ftr.build(should_dump=True)
     feature = raw_ftr['motif3']._features
     if isinstance(feature, dict):
         motif_matrix = self._to_matrix(feature)
     else:
         motif_matrix = feature
     return select_columns(self._log_norm(motif_matrix))
Exemplo n.º 22
0
 def calculate_features(self, dumping_specs=None):
     """
     Build and dump the regular and the special features of the graph.

     If no feature was chosen and ``self._verbose`` is set, a message is
     printed and nothing is calculated.

     :param dumping_specs: A dictionary of specifications how to dump the non-special features.
                           The default is saving the class only (as a pickle file).
                           The dictionary passed by the caller is not modified.
                           'object': What to save - either 'class' (save the calculator with the features inside),
                                     'feature' (the feature itself only, saved as name + '_ftr') or 'both'.
                                     Note that if only the feature is saved, when one calls the calculator again,
                                     the class will not load the feature and instead calculate it again.
                           'file_type': If the feature itself is saved, one can choose between two formats:
                                        either 'pkl' (save the feature as a pickle file, as is) or 'csv' (save a
                                        csv file of the feature values).
                           'vertex_names': If the features are saved as a csv file, there is an option of saving
                                           the name of each vertex in each row, before the feature values.
                                           The value here is a boolean indicating whether to put the original
                                           names of the vertices in the beginning of each row.
     """
     if not len(self._features) + len(
             self._special_features) and self._verbose:
         print("No features were chosen!")
         return

     self._adj_matrix = nx.adjacency_matrix(self._graph)
     self._raw_features = GraphFeatures(gnx=self._graph,
                                        features=self._features,
                                        dir_path=self._dir_path,
                                        logger=self._logger)
     if dumping_specs is not None:
         # Work on a shallow copy so the caller's dictionary is not
         # rewritten or stripped of its 'vertex_names' key.
         dumping_specs = dict(dumping_specs)
         if 'vertex_names' in dumping_specs:
             if dumping_specs['vertex_names']:
                 # A truthy flag is replaced by the vertex mapping itself.
                 dumping_specs['vertex_names'] = self._mapping
             else:
                 del dumping_specs['vertex_names']
     self._raw_features.build(should_dump=True,
                              dumping_specs=dumping_specs)
     self._other_features = OtherFeatures(self._graph,
                                          self._special_features,
                                          self._dir_path, self._params,
                                          self._logger)
     self._other_features.build(should_dump=True)
     self._logger.info(
         str(datetime.datetime.now()) + " , Calculated features")
Exemplo n.º 23
0
 def calculate_features(self):
     """Build and dump the regular and the special features of the graph.

     If no feature was chosen and ``self._verbose`` is set, a message is
     printed and nothing is calculated.
     """
     nothing_chosen = len(self._features) + len(self._special_features) == 0
     if nothing_chosen and self._verbose:
         print("No features were chosen!")
         return

     self._adj_matrix = nx.adjacency_matrix(self._graph)
     self._raw_features = GraphFeatures(gnx=self._graph,
                                        features=self._features,
                                        dir_path=self._dir_path,
                                        logger=self._logger)
     # The option of multiple workers in this function exists.
     self._raw_features.build(should_dump=True)
     self._other_features = OtherFeatures(self._graph,
                                          self._special_features,
                                          self._dir_path, self._params,
                                          self._logger)
     self._other_features.build(should_dump=True)
     timestamp = str(datetime.datetime.now())
     self._logger.info(timestamp + " , Calculated features")
Exemplo n.º 24
0
    def _calc_matrix(self):
        """Return ``{graph_name: feature matrix}`` for every graph.

        The result is cached under ``<base_dir>/pkl/vectors``; when the
        cache file exists it is loaded and returned. Otherwise features are
        built (and dumped) per graph under ``<base_dir>/pkl/features``, the
        matrices are computed, pickled to the cache file and returned.

        :return: dict mapping graph name to its feature matrix.
        """
        database_name = self._data_name + "_" + str(
            self._params.max_connected) + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix.pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            # Context manager closes the handle that was previously leaked.
            with open(mat_pkl_path, "rb") as pkl_file:
                return pickle.load(pkl_file)

        # Directory for this database; exist_ok replaces the race-prone
        # "name not in os.listdir(...)" check.
        database_pkl_dir = os.path.join(self._base_dir, "pkl", "features",
                                        database_name)
        os.makedirs(database_pkl_dir, exist_ok=True)

        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # Directory for the features of this specific graph.
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            os.makedirs(gnx_path, exist_ok=True)

            gnx_ftr = GraphFeatures(
                gnx,
                self._params.features,
                dir_path=gnx_path,
                logger=self._logger,
                is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=self._params.FORCE_REBUILD_FEATURES)
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix()

        os.makedirs(os.path.dirname(mat_pkl_path), exist_ok=True)
        with open(mat_pkl_path, "wb") as pkl_file:
            pickle.dump(gnx_to_vec, pkl_file)
        return gnx_to_vec
Exemplo n.º 25
0
 def _calculate_motif_matrix(self):
     """Fill ``self._motif_mat`` with the stacked 3- and 4-node motif matrices.

     When ``self._params["load_motifs"]`` is truthy or ``motif4.pkl`` exists
     in ``self._dir_path``, both ``motif3.pkl`` and ``motif4.pkl`` are
     loaded from that directory; otherwise the motifs are computed with
     ``GraphFeatures`` and dumped. motif3 columns come first.
     """
     def load_pickle(file_name):
         # Context manager closes the handle that was leaked before.
         with open(os.path.join(self._dir_path, file_name), "rb") as fh:
             return pickle.load(fh)

     def as_matrix(pkl):
         # A dict is converted directly; a MotifsNodeCalculator is
         # unwrapped through _features first; anything else goes to
         # np.array. NOTE(review): _to_matrix and _to_matrix_ are both
         # used - confirm both helpers exist.
         if isinstance(pkl, dict):
             return self._to_matrix(pkl)
         if isinstance(pkl, MotifsNodeCalculator):
             features = pkl._features
             if isinstance(features, list):
                 return np.array(features)
             return self._to_matrix_(features)
         return np.array(pkl)

     if self._params["load_motifs"] or os.path.exists(
             os.path.join(self._dir_path, 'motif4.pkl')):
         # Only motif4.pkl is checked above; motif3.pkl must exist too.
         pkl3 = load_pickle("motif3.pkl")
         pkl4 = load_pickle("motif4.pkl")
         motif3 = as_matrix(pkl3)
         motif4 = as_matrix(pkl4)
         self._motif_mat = np.hstack((motif3, motif4))
         print(str(datetime.datetime.now()) + " , Calculated motifs")
         return
     g_ftrs = GraphFeatures(self._graph,
                            self._motif_features,
                            dir_path=self._dir_path)
     g_ftrs.build(should_dump=True)
     print(str(datetime.datetime.now()) + " , Calculated motifs")
     self._motif_mat = np.hstack((np.asarray(g_ftrs['motif3']._features),
                                  np.asarray(g_ftrs['motif4']._features)))
Exemplo n.º 26
0
        if to_add:
            to_add = self._convert_dict_to_list(to_add, entries_order)
            as_matrix = np.hstack((as_matrix, np.matrix(to_add)))
        if norm_func:
            as_matrix = norm_func(as_matrix)
        return as_matrix


if __name__ == "__main__":
    import networkx as nx

    # Manual smoke run on a small hand-built undirected graph.
    gnx = nx.Graph()
    gnx.add_edges_from([
        (1, 2), (1, 3), (2, 3),
        (2, 7), (7, 8),
        (3, 6), (4, 6), (6, 8), (5, 6),
    ])
    gnx_ftr = GraphFeatures(
        gnx, FeaturesMeta().NODE_LEVEL, ".", is_max_connected=True)
    fp = FeaturesProcessor(gnx_ftr)
    fp.activate_motif_ratio_vec()
    # No-op assignment; presumably a convenient breakpoint line - confirm.
    e = 0
Exemplo n.º 27
0
        header = []
        for i in range(num_motifs):
            header.append((motif_type, i))
        return header

    @staticmethod
    def is_motif(ftr):
        """Return whether ``ftr`` is one of the motif feature names."""
        return ftr in ('motif4', "motif3")


if __name__ == "__main__":
    import networkx as nx
    from feature_meta import NODE_FEATURES

    # Manual smoke run on a small hand-built undirected graph.
    gnx = nx.Graph()
    gnx.add_edges_from([
        (1, 2), (1, 3), (2, 3),
        (2, 7), (7, 8),
        (3, 6), (4, 6), (6, 8), (5, 6),
    ])
    gnx_ftr = GraphFeatures(gnx, NODE_FEATURES, ".", is_max_connected=True)
    gnx_ftr.build()
    m = MotifRatio(gnx_ftr, False)
    # No-op assignment; presumably a convenient breakpoint line - confirm.
    e = 0
    def _calc_features(self, pkl=True):
        # load dictionary if exists
        if pkl and self._ftr_pkl_name() in os.listdir(
                os.path.join(self._base_dir, 'pkl',
                             'ftr_by_time_dictionaries')):
            self._features_by_time, self._multi_graphs_by_time = \
                pickle.load(open(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries',
                                              self._ftr_pkl_name()), "rb"))
            return

        self._load_database()
        labels = self._database.labels
        # make directory for database
        dir_path = os.path.join(self._base_dir, 'pkl', 'graph_measures')
        if self._params['database_full_name'] not in os.listdir(dir_path):
            os.mkdir(os.path.join(dir_path,
                                  self._params['database_full_name']))
        dir_path = os.path.join(dir_path, self._params['database_full_name'])

        # calculate features
        for i, multi_graph in enumerate(
                self._database.multi_graph_by_window(
                    self._params['window_size'], self._params['start_time'])):
            if "time_" + str(i) not in os.listdir(dir_path):
                os.mkdir(os.path.join(dir_path, "time_" + str(i)))
            mg_dir_path = os.path.join(dir_path, "time_" + str(i))

            ftr_tmp_dict = {}
            # nodes_and_edges = {}
            for name in multi_graph.graph_names():
                if name not in os.listdir(mg_dir_path):
                    os.mkdir(os.path.join(mg_dir_path, name))
                gnx_dir_path = os.path.join(mg_dir_path, name)

                raw_ftr = GraphFeatures(
                    multi_graph.get_gnx(name),
                    NODE_FEATURES_ML,
                    dir_path=gnx_dir_path,
                    is_max_connected=self._params['max_connected'],
                    logger=PrintLogger(self._params['database_full_name']))
                raw_ftr.build(should_dump=True)  # build features
                nodes_and_edges = [
                    np.log(1 + multi_graph.node_count(graph_id=name)),
                    np.log(1 + multi_graph.edge_count(graph_id=name))
                ]
                # nodes_and_edges = [multi_graph.node_count(graph_id=name), multi_graph.edge_count(graph_id=name)]
                # nodes_and_edges[name] = [multi_graph.node_count(graph_id=name), multi_graph.edge_count(graph_id=name)]

                # ====================== motif ratio ========================
                ftr_tmp_dict[name] = (
                    FeaturesProcessor(raw_ftr).activate_motif_ratio_vec(
                        to_add=nodes_and_edges), labels[name])

                # ==================== ftr correlation ======================
                # ftr_tmp_dict[name] = (FeaturesProcessor(raw_ftr).as_matrix(norm_func=log_norm))
                # ftr_tmp_dict[name] = (FeaturesProcessor(raw_ftr).as_matrix())

            # concat_mx = np.vstack([mx for name, mx in ftr_tmp_dict.items()])
            # pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params['ftr_pairs'],
            #                                       identical_bar=0.9)
            # best_pairs = pearson_picker.best_pairs()
            # beta = LinearContext(multi_graph, ftr_tmp_dict, best_pairs, window_size=len(ftr_tmp_dict))
            # beta_matrix = beta.beta_matrix()
            # node and edges can pe appended here
            # for j, name in enumerate(multi_graph.graph_names()):
            #     ftr_tmp_dict[name] = (np.hstack((beta_matrix[j], nodes_and_edges[name])), labels[name])

            self._features_by_time.append(ftr_tmp_dict)

            multi_graph.suspend_logger()
            self._multi_graphs_by_time.append(multi_graph)

        pickle.dump((self._features_by_time, self._multi_graphs_by_time),
                    open(
                        os.path.join(self._base_dir, 'pkl',
                                     'ftr_by_time_dictionaries',
                                     self._ftr_pkl_name()), "wb"))
Exemplo n.º 29
0
class FeatureCalculator:
    def __init__(self,
                 edge_path,
                 dir_path,
                 features,
                 acc=True,
                 directed=False,
                 gpu=False,
                 device=2,
                 verbose=True,
                 params=None):
        """
        Calculator of node features for a graph given as an edge-list file.

        :param edge_path: str
        Path to the edges file, read with networkx using ',' as the delimiter
        (one edge per row, no header). If the vertices are not
        [0, 1, ..., n-1] they are relabeled to that range and the mapping is
        saved in dir_path.
        :param dir_path: str
        Directory where the feature calculations are (or will be) stored.
        :param features: list of strings
        Names of the requested features. 'degree', 'in_degree', 'out_degree'
        and 'additional_features' are handled separately from the names found
        in the features meta.
        :param acc: bool
        Selects the accelerated features meta instead of the regular one.
        :param directed: bool
        Whether the graph is built as a directed graph.
        :param gpu: bool
        Passed to the accelerated features meta (whether to use a GPU).
        :param device: int
        Passed to the accelerated features meta (which GPU device to use).
        :param verbose: bool
        Whether to log the phases of the calculation. When False no logger is
        created at all.
        :param params: dict, or None
        Graph settings forwarded to the special ("other") features.
        """
        self._verbose = verbose
        self._dir_path = dir_path
        self._params = params
        self._gpu = gpu
        self._device = device
        self._features = features  # By their name as appears in accelerated_features_meta

        # A logger exists only in verbose mode; otherwise it stays None.
        if verbose:
            self._logger = multi_logger(
                [PrintLogger("Logger", level=logging.DEBUG),
                 FileLogger("FLogger", path=dir_path, level=logging.INFO)],
                name=None)
        else:
            self._logger = None

        # Both steps below rely on the attributes assigned above.
        self._load_graph(edge_path, directed)
        # acc determines whether to use the accelerated features
        self._get_feature_meta(features, acc)

        # Filled in later by calculate_features().
        self._adj_matrix = None
        self._raw_features = None
        self._other_features = None

    def _load_graph(self, edge_path, directed=False):
        """
        Read the graph from a comma-delimited edge list and normalize its labels.

        Sets ``self._graph`` and ``self._mapping`` (new integer index ->
        original vertex label, in the graph's node order). When the vertices
        are not exactly [0, 1, ..., n-1], the mapping is pickled to
        ``vertices_mapping.pkl`` inside ``self._dir_path`` and the graph is
        relabeled to consecutive integers.

        :param edge_path: str
        Path to the edges file.
        :param directed: bool
        Build an ``nx.DiGraph`` instead of an ``nx.Graph``.
        """
        self._graph = nx.read_edgelist(
            edge_path,
            delimiter=',',
            create_using=nx.DiGraph() if directed else nx.Graph())
        vertices = np.array(self._graph.nodes)
        should_be_vertices = np.arange(len(vertices))
        self._mapping = dict(enumerate(self._graph))
        # NOTE(review): read_edgelist is called without `nodetype`, so labels
        # are presumably strings and this comparison never matches - confirm.
        if not np.array_equal(vertices, should_be_vertices):
            if self._verbose:
                self._logger.debug("Relabeling vertices to [0, 1, ..., n-1]")
            mapping_path = os.path.join(self._dir_path, "vertices_mapping.pkl")
            # Context manager guarantees the file is flushed and closed.
            with open(mapping_path, "wb") as mapping_file:
                pickle.dump(self._mapping, mapping_file)
            self._graph = nx.convert_node_labels_to_integers(self._graph)
        if self._verbose:
            self._logger.info(str(datetime.datetime.now()) + " , Loaded graph")
            self._logger.debug("Graph Size: %d Nodes, %d Edges" %
                               (len(self._graph), len(self._graph.edges)))

    def _get_feature_meta(self, features, acc):
        if acc:
            from accelerated_features_meta import FeaturesMeta
            features_meta_kwargs = dict(gpu=self._gpu, device=self._device)
        else:
            from features_meta import FeaturesMeta
            features_meta_kwargs = dict()

        all_node_features = FeaturesMeta(**features_meta_kwargs).NODE_LEVEL
        self._features = {}
        self._special_features = []
        for key in features:
            if key in [
                    'degree', 'in_degree', 'out_degree', 'additional_features'
            ]:
                self._special_features.append(key)
            elif key not in all_node_features:
                if self._verbose:
                    self._logger.debug(
                        "Feature %s unknown, ignoring this feature" % key)
                features.remove(key)
                continue
            else:
                self._features[key] = all_node_features[key]

    def calculate_features(self, dumping_specs=None):
        """
        :param dumping_specs: A dictionary of specifications how to dump the non-special features.
                              The default is saving the class only (as a pickle file).
                              'object': What to save - either 'class' (save the calculator with the features inside),
                                        'feature' (the feature itself only, saved as name + '_ftr') or 'both'.
                                        Note that if only the feature is saved, when one calls the calculator again,
                                        the class will not load the feature and instead calculate it again.
                              'file_type': If the feature itself is saved, one can choose between two formats:
                                           either 'pkl' (save the feature as a pickle file, as is) or 'csv' (save a
                                           csv file of the feature values).
                              'vertex_names': If the features are saved as a csv file, there is an option of saving
                                              the name of each vertex in each row, before the feature values.
                                              The value here is a boolean indicating whether to put the original names
                                              the vertices in the beginning of each row.
        """
        if not len(self._features) + len(
                self._special_features) and self._verbose:
            print("No features were chosen!")
        else:
            self._adj_matrix = nx.adjacency_matrix(self._graph)
            # self._adj_matrix = self._adj_matrix.toarray()
            self._raw_features = GraphFeatures(gnx=self._graph,
                                               features=self._features,
                                               dir_path=self._dir_path,
                                               logger=self._logger)
            if dumping_specs is not None:
                if 'vertex_names' in dumping_specs:
                    if dumping_specs['vertex_names']:
                        dumping_specs['vertex_names'] = self._mapping
                    else:
                        del dumping_specs['vertex_names']
            self._raw_features.build(should_dump=True,
                                     dumping_specs=dumping_specs)
            self._other_features = OtherFeatures(self._graph,
                                                 self._special_features,
                                                 self._dir_path, self._params,
                                                 self._logger)
            self._other_features.build(should_dump=True)
            self._logger.info(
                str(datetime.datetime.now()) + " , Calculated features")

    @property
    def feature_matrix(self):
        return np.hstack((self._raw_features.to_matrix(mtype=np.array),
                          self._other_features.feature_matrix))

    @property
    def adjacency_matrix(self):
        """
        The stored adjacency matrix of the graph.

        It is None after construction and is assigned from
        nx.adjacency_matrix(self._graph) inside calculate_features().
        """
        return self._adj_matrix