예제 #1
0
 def build_features(self):
     """
     Compute the CHOSEN_FEATURES of ``self._gnx`` and keep them as a matrix.

     The features are calculated by GraphFeatures, dumped under the "features" sub-directory of
     ``self._data_dir``, and stored in ``self._features_mx`` as a float32 ``np.matrix``.
     The shape of the resulting matrix is printed to stdout.
     """
     features_dir = os.path.join(self._data_dir, "features")
     calculator = GraphFeatures(self._gnx,
                                CHOSEN_FEATURES,
                                dir_path=features_dir,
                                logger=self._logger)
     # should_dump=True asks GraphFeatures to persist what it calculates.
     calculator.build(should_dump=True)
     self._features_mx = calculator.to_matrix(dtype=np.float32, mtype=np.matrix)
     print(self._features_mx.shape)
 def build_features_problem_ab(self, force_rebuild=False, largest_cc=False):
     """
     Calculate and cache the feature matrix of the single, hard-coded graph '20-Apr-2001'.

     Returns immediately when ``self._features_matrix_dict`` already holds at least one entry,
     unless ``force_rebuild`` is set.

     :param force_rebuild: bool
     Whether to calculate even though the cache is not empty. Note that it only bypasses the cache
     check; it is not forwarded to ``GraphFeatures.build``.
     :param largest_cc: bool
     Forwarded to GraphFeatures as ``is_max_connected``.
     """
     if self._features_matrix_dict and not force_rebuild:
         return
     gnx_name = '20-Apr-2001'
     self._logger.debug("calculating features for " + gnx_name)
     gnx_path = os.path.join(self._pkl_dir, gnx_name)
     # Create the dump directory if needed; exist_ok avoids both the directory listing and the
     # check-then-create race of testing membership in os.listdir() before calling os.mkdir().
     os.makedirs(gnx_path, exist_ok=True)
     gnx = self.subgraph_by_name(gnx_name)
     gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path, logger=self._logger,
                             is_max_connected=largest_cc)
     gnx_ftr.build(should_dump=True)  # calculate the features and dump them under gnx_path
     self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix)
 def build_features(self, pick_ftr=False, force_rebuild=False, largest_cc=False, should_zscore=True):
     """
     Calculate and cache a feature matrix for every graph named in ``self._list_id``.

     Returns immediately when ``self._features_matrix_dict`` is not empty, unless either
     ``pick_ftr`` or ``force_rebuild`` is set.

     :param pick_ftr: bool
     When True, the calculation runs even if the cache is not empty (it has no other effect here).
     :param force_rebuild: bool
     When True, the calculation runs even if the cache is not empty, and the flag is forwarded to
     ``GraphFeatures.build`` as ``force_build``.
     :param largest_cc: bool
     Forwarded to GraphFeatures as ``is_max_connected``.
     :param should_zscore: bool
     Forwarded to ``GraphFeatures.to_matrix``.
     """
     if self._features_matrix_dict and not force_rebuild and not pick_ftr:
         return
     for gnx_name in self._list_id:
         self._logger.debug("calculating features for " + gnx_name)
         gnx_path = os.path.join(self._pkl_dir, gnx_name)
         # Create the dump directory if needed; exist_ok replaces a full directory listing per
         # graph and removes the check-then-create race of os.listdir() followed by os.mkdir().
         os.makedirs(gnx_path, exist_ok=True)
         gnx = self.subgraph_by_name(gnx_name)
         gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path, logger=self._logger,
                                 is_max_connected=largest_cc)
         # Calculate the features and dump them under gnx_path.
         gnx_ftr.build(should_dump=True, force_build=force_rebuild)
         self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix,
                                                                  should_zscore=should_zscore)
 def build_features(self, largest_cc=False, should_zscore=True):
     """
     Recalculate the feature matrix of every community in ``self._changed_communities``.

     The features are always rebuilt (``force_build=True``) and not dumped (``should_dump=False``);
     each resulting matrix replaces the community's entry in ``self._features_matrix_dict``.

     :param largest_cc: bool
     Forwarded to GraphFeatures as ``is_max_connected``.
     :param should_zscore: bool
     Forwarded to ``GraphFeatures.to_matrix``.
     """
     for community in self._changed_communities:
         self._logger.debug("calculating features for " + community)
         gnx_path = os.path.join(self._pkl_dir, community)
         # Create the community directory if needed; exist_ok replaces a full directory listing
         # per community and removes the check-then-create race of os.listdir() + os.mkdir().
         os.makedirs(gnx_path, exist_ok=True)
         gnx = self.subgraph_by_name(community)
         gnx_ftr = GraphFeatures(gnx,
                                 self._features_meta,
                                 dir_path=gnx_path,
                                 logger=self._logger,
                                 is_max_connected=largest_cc)
         gnx_ftr.build(should_dump=False, force_build=True)
         self._features_matrix_dict[community] = gnx_ftr.to_matrix(
             dtype=np.float32, mtype=np.matrix, should_zscore=should_zscore)
예제 #5
0
class FeatureCalculator:
    def __init__(self,
                 edge_path,
                 dir_path,
                 features,
                 acc=True,
                 directed=False,
                 gpu=False,
                 device=2,
                 verbose=True,
                 params=None):
        """
        A class used to calculate features for a given graph, input as a text-like file.

        :param edge_path: str
        Path to graph edges file (text-like file, e.g. txt or csv), from which the graph is built using networkx.
        The graph must be unweighted. If its vertices are not [0, 1, ..., n-1], they are mapped to become
        [0, 1, ..., n-1] and the mapping is saved.
        Every row in the edges file should include "source_id,destination_id", without a header row.
        :param dir_path: str
        Path to the directory in which the feature calculations will be (or already are) located.
        :param features: list of strings
        List of the names of each feature. Could be any name from features_meta.py or "additional_features".
        Unknown names are removed from this list in place.
        :param acc: bool
        Whether to run the accelerated features, assuming it is possible to do so.
        :param directed: bool
        Whether the built graph is directed.
        :param gpu: bool
        Whether to use GPUs, assuming it is possible to do so (i.e. the GPU exists and the CUDA matches).
        :param device: int
        If gpu is True, indicates on which GPU device to calculate. Will return error if the index doesn't match the
        available GPUs.
        :param verbose: bool
        Whether to print things indicating the phases of calculations. When False, no logger is created.
        :param params: dict, or None
        For clique detection uses, this is a dictionary of the graph settings
        (size, directed, clique size, edge probability). Ignored for any other use.
        """

        self._dir_path = dir_path
        # By their name as appears in accelerated_features_meta.
        # Replaced by a {name: meta} dictionary in _get_feature_meta.
        self._features = features
        self._gpu = gpu
        self._device = device
        self._verbose = verbose
        if verbose:
            self._logger = multi_logger([PrintLogger("Logger", level=logging.DEBUG),
                                         FileLogger("FLogger", path=dir_path, level=logging.INFO)], name=None)
        else:
            self._logger = None
        self._params = params
        self._load_graph(edge_path, directed)
        # acc determines whether to use the accelerated features
        self._get_feature_meta(features, acc)

        self._adj_matrix = None
        self._raw_features = None
        self._other_features = None

    def _load_graph(self, edge_path, directed=False):
        """
        Read the comma-delimited edge list into ``self._graph`` and make its vertices [0, 1, ..., n-1].

        ``self._mapping`` maps each position in the graph's node iteration order to the label read from
        the file. When the labels are not already [0, 1, ..., n-1], the mapping is pickled to
        "vertices_mapping.pkl" inside the calculator's directory and the graph is relabeled.

        :param edge_path: str
        Path to the edges file.
        :param directed: bool
        Whether to build a nx.DiGraph (True) or a nx.Graph (False).
        """
        self._graph = nx.read_edgelist(
            edge_path,
            delimiter=',',
            create_using=nx.DiGraph() if directed else nx.Graph())
        vertices = np.array(self._graph.nodes)
        should_be_vertices = np.arange(len(vertices))
        self._mapping = dict(enumerate(self._graph))
        if not np.array_equal(vertices, should_be_vertices):
            if self._verbose:
                self._logger.debug("Relabeling vertices to [0, 1, ..., n-1]")
            mapping_path = os.path.join(self._dir_path, "vertices_mapping.pkl")
            # The context manager closes the file even if pickling fails.
            with open(mapping_path, "wb") as mapping_file:
                pickle.dump(self._mapping, mapping_file)
            self._graph = nx.convert_node_labels_to_integers(self._graph)
        if self._verbose:
            self._logger.info(str(datetime.datetime.now()) + " , Loaded graph")
            self._logger.debug("Graph Size: %d Nodes, %d Edges" %
                               (len(self._graph), len(self._graph.edges)))

    def _get_feature_meta(self, features, acc):
        """
        Split the requested feature names into regular and special features.

        Regular features are stored in ``self._features`` as {name: meta}, where the meta objects are
        taken from the NODE_LEVEL dictionary of the (accelerated or plain) FeaturesMeta.
        'degree', 'in_degree', 'out_degree' and 'additional_features' are collected in
        ``self._special_features`` instead. Any other name is unknown and is removed from ``features``
        in place.

        :param features: list of strings
        The requested feature names.
        :param acc: bool
        Whether to take the feature metas from accelerated_features_meta (True) or features_meta (False).
        """
        if acc:
            from accelerated_features_meta import FeaturesMeta
            features_meta_kwargs = dict(gpu=self._gpu, device=self._device)
        else:
            from features_meta import FeaturesMeta
            features_meta_kwargs = dict()

        all_node_features = FeaturesMeta(**features_meta_kwargs).NODE_LEVEL
        special_names = ('degree', 'in_degree', 'out_degree', 'additional_features')
        self._features = {}
        self._special_features = []
        # Iterate over a snapshot: removing from the list being iterated would make the loop skip
        # the element that follows every unknown feature.
        for key in list(features):
            if key in special_names:
                self._special_features.append(key)
            elif key in all_node_features:
                self._features[key] = all_node_features[key]
            else:
                if self._verbose:
                    self._logger.debug(
                        "Feature %s unknown, ignoring this feature" % key)
                features.remove(key)

    def calculate_features(self, dumping_specs=None):
        """
        Calculate (and dump) all the chosen features. Does nothing if no valid feature was chosen.

        :param dumping_specs: A dictionary of specifications how to dump the non-special features.
                              The default is saving the class only (as a pickle file).
                              The given dictionary is not modified.
                              'object': What to save - either 'class' (save the calculator with the features inside),
                                        'feature' (the feature itself only, saved as name + '_ftr') or 'both'.
                                        Note that if only the feature is saved, when one calls the calculator again,
                                        the class will not load the feature and instead calculate it again.
                              'file_type': If the feature itself is saved, one can choose between two formats:
                                           either 'pkl' (save the feature as a pickle file, as is) or 'csv' (save a
                                           csv file of the feature values).
                              'vertex_names': If the features are saved as a csv file, there is an option of saving
                                              the name of each vertex in each row, before the feature values.
                                              The value here is a boolean indicating whether to put the original names
                                              of the vertices in the beginning of each row.
        """
        if not self._features and not self._special_features:
            # Nothing to calculate, whether or not we are allowed to say so.
            if self._verbose:
                print("No features were chosen!")
            return

        self._adj_matrix = nx.adjacency_matrix(self._graph)
        self._raw_features = GraphFeatures(gnx=self._graph,
                                           features=self._features,
                                           dir_path=self._dir_path,
                                           logger=self._logger)
        if dumping_specs is not None:
            # Work on a copy so the caller's dictionary is left untouched.
            dumping_specs = dict(dumping_specs)
            # A true 'vertex_names' flag is replaced by the actual index -> original name mapping;
            # a false one is dropped altogether.
            if dumping_specs.pop('vertex_names', False):
                dumping_specs['vertex_names'] = self._mapping
        self._raw_features.build(should_dump=True,
                                 dumping_specs=dumping_specs)
        self._other_features = OtherFeatures(self._graph,
                                             self._special_features,
                                             self._dir_path, self._params,
                                             self._logger)
        self._other_features.build(should_dump=True)
        # The logger exists only in verbose mode.
        if self._logger is not None:
            self._logger.info(
                str(datetime.datetime.now()) + " , Calculated features")

    @property
    def feature_matrix(self):
        """
        The regular feature matrix and the special ("other") feature matrix, stacked horizontally.
        Available only after calculate_features has run.
        """
        return np.hstack((self._raw_features.to_matrix(mtype=np.array),
                          self._other_features.feature_matrix))

    @property
    def adjacency_matrix(self):
        """
        The adjacency matrix built by calculate_features, or None if it has not been calculated.
        """
        return self._adj_matrix