def build_features(self):
    gnx_ftr = GraphFeatures(self._gnx, CHOSEN_FEATURES,
                            dir_path=os.path.join(self._data_dir, "features"),
                            logger=self._logger)
    gnx_ftr.build(should_dump=True)  # build ALL_FEATURES
    self._features_mx = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix)
    print(self._features_mx.shape)
def build_features_problem_ab(self, force_rebuild=False, largest_cc=False):
    if len(self._features_matrix_dict) != 0 and not force_rebuild:
        return
    gnx_name = '20-Apr-2001'
    self._logger.debug("calculating features for " + gnx_name)
    gnx_path = os.path.join(self._pkl_dir, gnx_name)
    if gnx_name not in os.listdir(self._pkl_dir):
        os.mkdir(gnx_path)
    gnx = self.subgraph_by_name(gnx_name)
    gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                            logger=self._logger, is_max_connected=largest_cc)
    gnx_ftr.build(should_dump=True)  # build ALL_FEATURES
    self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix)
def build_features(self, pick_ftr=False, force_rebuild=False, largest_cc=False, should_zscore=True):
    if len(self._features_matrix_dict) != 0 and not force_rebuild and not pick_ftr:
        return
    for gnx_name in self._list_id:
        self._logger.debug("calculating features for " + gnx_name)
        gnx_path = os.path.join(self._pkl_dir, gnx_name)
        if gnx_name not in os.listdir(self._pkl_dir):
            os.mkdir(gnx_path)
        gnx = self.subgraph_by_name(gnx_name)
        gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                                logger=self._logger, is_max_connected=largest_cc)
        gnx_ftr.build(should_dump=True, force_build=force_rebuild)  # build ALL_FEATURES
        self._features_matrix_dict[gnx_name] = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix,
                                                                 should_zscore=should_zscore)
def build_features(self, largest_cc=False, should_zscore=True):
    for community in self._changed_communities:
        self._logger.debug("calculating features for " + community)
        gnx_path = os.path.join(self._pkl_dir, community)
        if community not in os.listdir(self._pkl_dir):
            os.mkdir(gnx_path)
        gnx = self.subgraph_by_name(community)
        gnx_ftr = GraphFeatures(gnx, self._features_meta, dir_path=gnx_path,
                                logger=self._logger, is_max_connected=largest_cc)
        gnx_ftr.build(should_dump=False, force_build=True)  # build ALL_FEATURES
        self._features_matrix_dict[community] = gnx_ftr.to_matrix(dtype=np.float32, mtype=np.matrix,
                                                                  should_zscore=should_zscore)
class FeatureCalculator:
    def __init__(self, edge_path, dir_path, features, acc=True, directed=False,
                 gpu=False, device=2, verbose=True, params=None):
        """
        A class used to calculate features for a given graph, input as a text-like file.

        :param edge_path: str
            Path to the graph's edge list file (text-like file, e.g. txt or csv), from which the graph
            is built using networkx. The graph must be unweighted. If its vertices are not
            [0, 1, ..., n-1], they are mapped to become [0, 1, ..., n-1] and the mapping is saved.
            Every row in the edges file should include "source_id,destination_id", without a header row.
        :param dir_path: str
            Path to the directory in which the feature calculations will be (or already are) located.
        :param features: list of strings
            List of the names of each feature. Could be any name from features_meta.py or
            "additional_features".
        :param acc: bool
            Whether to run the accelerated features, assuming it is possible to do so.
        :param directed: bool
            Whether the built graph is directed.
        :param gpu: bool
            Whether to use GPUs, assuming it is possible to do so (i.e. the GPU exists and the CUDA
            version matches).
        :param device: int
            If gpu is True, indicates on which GPU device to calculate. Will raise an error if the
            index doesn't match the available GPUs.
        :param verbose: bool
            Whether to print messages indicating the phases of the calculations.
        :param params: dict, or None
            For clique detection uses, this is a dictionary of the graph settings
            (size, directed, clique size, edge probability). Ignored for any other use.
        """
        self._dir_path = dir_path
        self._features = features  # By their names as they appear in accelerated_features_meta
        self._gpu = gpu
        self._device = device
        self._verbose = verbose
        self._logger = multi_logger([PrintLogger("Logger", level=logging.DEBUG),
                                     FileLogger("FLogger", path=dir_path, level=logging.INFO)],
                                    name=None) if verbose else None
        self._params = params
        self._load_graph(edge_path, directed)
        self._get_feature_meta(features, acc)  # acc determines whether to use the accelerated features

        self._adj_matrix = None
        self._raw_features = None
        self._other_features = None

    def _load_graph(self, edge_path, directed=False):
        self._graph = nx.read_edgelist(edge_path, delimiter=',',
                                       create_using=nx.DiGraph() if directed else nx.Graph())
        vertices = np.array(self._graph.nodes)
        should_be_vertices = np.arange(len(vertices))
        self._mapping = {i: v for i, v in enumerate(self._graph)}
        if not np.array_equal(vertices, should_be_vertices):
            if self._verbose:
                self._logger.debug("Relabeling vertices to [0, 1, ..., n-1]")
            pickle.dump(self._mapping,
                        open(os.path.join(self._dir_path, "vertices_mapping.pkl"), "wb"))
            self._graph = nx.convert_node_labels_to_integers(self._graph)
        if self._verbose:
            self._logger.info(str(datetime.datetime.now()) + " , Loaded graph")
            self._logger.debug("Graph Size: %d Nodes, %d Edges" %
                               (len(self._graph), len(self._graph.edges)))

    def _get_feature_meta(self, features, acc):
        if acc:
            from accelerated_features_meta import FeaturesMeta
            features_meta_kwargs = dict(gpu=self._gpu, device=self._device)
        else:
            from features_meta import FeaturesMeta
            features_meta_kwargs = dict()
        all_node_features = FeaturesMeta(**features_meta_kwargs).NODE_LEVEL
        self._features = {}
        self._special_features = []
        # Iterate over a copy, so removing unknown feature names does not skip elements.
        for key in list(features):
            if key in ['degree', 'in_degree', 'out_degree', 'additional_features']:
                self._special_features.append(key)
            elif key not in all_node_features:
                if self._verbose:
                    self._logger.debug("Feature %s unknown, ignoring this feature" % key)
                features.remove(key)
                continue
            else:
                self._features[key] = all_node_features[key]
    def calculate_features(self, dumping_specs=None):
        """
        :param dumping_specs: A dictionary of specifications for how to dump the non-special features.
                              The default is saving the class only (as a pickle file).
               'object': What to save - either 'class' (save the calculator with the features inside),
                         'feature' (only the feature itself, saved as name + '_ftr') or 'both'.
                         Note that if only the feature is saved, when the calculator is called again,
                         the class will not load the feature and will calculate it again instead.
               'file_type': If the feature itself is saved, one can choose between two formats:
                            either 'pkl' (save the feature as a pickle file, as is) or 'csv'
                            (save a csv file of the feature values).
               'vertex_names': If the features are saved as a csv file, there is an option of saving
                               the name of each vertex in each row, before the feature values.
                               The value here is a boolean indicating whether to put the original
                               names of the vertices at the beginning of each row.
        """
        if not len(self._features) + len(self._special_features):
            # Nothing was chosen, so there is nothing to calculate.
            if self._verbose:
                print("No features were chosen!")
        else:
            self._adj_matrix = nx.adjacency_matrix(self._graph)
            # self._adj_matrix = self._adj_matrix.toarray()
            self._raw_features = GraphFeatures(gnx=self._graph, features=self._features,
                                               dir_path=self._dir_path, logger=self._logger)
            if dumping_specs is not None:
                if 'vertex_names' in dumping_specs:
                    if dumping_specs['vertex_names']:
                        # Replace the boolean with the actual {new_label: original_name} mapping.
                        dumping_specs['vertex_names'] = self._mapping
                    else:
                        del dumping_specs['vertex_names']
            self._raw_features.build(should_dump=True, dumping_specs=dumping_specs)
            self._other_features = OtherFeatures(self._graph, self._special_features,
                                                 self._dir_path, self._params, self._logger)
            self._other_features.build(should_dump=True)
            if self._verbose:
                self._logger.info(str(datetime.datetime.now()) + " , Calculated features")

    @property
    def feature_matrix(self):
        return np.hstack((self._raw_features.to_matrix(mtype=np.array),
                          self._other_features.feature_matrix))

    @property
    def adjacency_matrix(self):
        return self._adj_matrix
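# ----------------------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): drives FeatureCalculator
# end to end. The edge file path, output directory and feature names below are assumed
# examples - valid feature names are whatever keys features_meta.py defines, plus the
# special names 'degree', 'in_degree', 'out_degree' and 'additional_features'.
# ----------------------------------------------------------------------------------------
if __name__ == "__main__":
    ftr_calc = FeatureCalculator(
        edge_path="graph_edges.csv",      # rows of "source_id,destination_id", no header (hypothetical file)
        dir_path="feature_output",        # where the pickled/CSV feature dumps will go (hypothetical dir)
        features=["degree", "betweenness_centrality"],  # illustrative feature choice
        acc=False,                        # plain (non-accelerated) feature implementations
        directed=False,
        gpu=False,
        verbose=True)

    # dumping_specs follows the structure documented in calculate_features: dump each
    # non-special feature itself as a CSV, prefixing every row with the vertex's original name.
    ftr_calc.calculate_features(dumping_specs={"object": "feature",
                                               "file_type": "csv",
                                               "vertex_names": True})

    features_mx = ftr_calc.feature_matrix     # nodes x features matrix (raw + special features)
    adjacency = ftr_calc.adjacency_matrix     # sparse adjacency matrix of the loaded graph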