class BetaCalculator:
    def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger = None):
        self._logger = logger if logger else PrintLogger("default graphs logger")
        self._graphs = graphs
        self._ftr_pairs = feature_pairs
        num_features = graphs.features_matrix(0).shape[1]
        num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
        self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
        self._build()

    def _build(self):
        graph_index = 0
        for g_id in self._graphs.graph_names():
            self._logger.debug("calculating beta vec for:\t" + g_id)
            self._beta_matrix[graph_index, :] = self._calc_beta(g_id)
            graph_index += 1

    def _calc_beta(self, gid):
        raise NotImplementedError()

    def beta_matrix(self):
        return self._beta_matrix

    def to_file(self, file_name):
        # "rw" is not a valid mode for open(); the file is only written, so open it for writing.
        with open(file_name, "w") as out_file:
            for i in range(self._graphs.number_of_graphs()):
                out_file.write(self._graphs.index_to_name(i))  # graph_name
                # iterate over the matrix width rather than feature_pairs, which may be None
                for j in range(self._beta_matrix.shape[1]):
                    out_file.write("\t" + str(self._beta_matrix[i][j]))  # beta_vector
                out_file.write("\n")
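# Hedged usage sketch (not from the original code): BetaCalculator is abstract, so a concrete
# subclass must implement _calc_beta. The sketch below assumes the Graphs API exposes
# features_matrix(g_id) returning a nodes x features matrix and that each entry of feature_pairs
# is an (i, j) column-index pair; the fitting rule (least-squares slope between the two feature
# columns) is illustrative only.
class SlopeBetaCalculator(BetaCalculator):
    def _calc_beta(self, gid):
        mx = np.asarray(self._graphs.features_matrix(gid))
        beta_vec = np.zeros(self._beta_matrix.shape[1])
        for k, (i, j) in enumerate(self._ftr_pairs):
            x, y = mx[:, i], mx[:, j]
            denom = np.dot(x, x)
            beta_vec[k] = np.dot(x, y) / denom if denom else 0.0
        return beta_vec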
def print_log_data(train_results, valid_results, test_results=None, epoch=None):
    if test_results is None:
        PrintLogger().debug(
            'Epoch: {:04d} '.format(epoch + 1) +
            'loss_train: {:.4f} '.format(train_results['loss']) +
            'temp_loss_train: {:.4f} '.format(train_results['tempo_loss']) +
            'f1_macro_train: {} '.format(train_results['f1_score_macro']) +
            'f1_micro_train: {} '.format(train_results['f1_score_micro']) +
            'loss_valid: {:.4f} '.format(valid_results['loss']) +
            'temp_loss_valid: {:.4f} '.format(valid_results['tempo_loss']) +
            'f1_macro_valid: {} '.format(valid_results['f1_score_macro']) +
            'f1_micro_valid: {} '.format(valid_results['f1_score_micro']))
    else:
        PrintLogger().debug(
            'loss_train: {:.4f} '.format(train_results['loss'].item()) +
            'temp_loss_train: {:.4f} '.format(train_results['tempo_loss'].item()) +
            'f1_macro_train: {} '.format(train_results['f1_score_macro']) +
            'f1_micro_train: {} '.format(train_results['f1_score_micro']) +  # was mislabeled f1_macro_train
            'loss_valid: {:.4f} '.format(valid_results['loss'].item()) +
            'temp_loss_valid: {:.4f} '.format(valid_results['tempo_loss'].item()) +
            'f1_macro_valid: {} '.format(valid_results['f1_score_macro']) +
            'f1_micro_valid: {} '.format(valid_results['f1_score_micro']) +  # was mislabeled f1_macro_valid
            'reg_loss_test: {:.4f} '.format(test_results['loss'].item()) +
            'temp_loss_test: {:.4f} '.format(test_results['tempo_loss'].item()) +
            'f1_macro_test: {} '.format(test_results['f1_score_macro']) +
            'f1_micro_test: {} '.format(test_results['f1_score_micro']))
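# Hedged usage sketch (illustrative values, not from the original code): each results dict is
# assumed to carry 'loss', 'tempo_loss', 'f1_score_macro' and 'f1_score_micro'. With
# test_results=None the epoch branch is taken and plain floats are formatted directly.
train = {'loss': 0.42, 'tempo_loss': 0.03, 'f1_score_macro': 0.81, 'f1_score_micro': 0.84}
valid = {'loss': 0.55, 'tempo_loss': 0.04, 'f1_score_macro': 0.78, 'f1_score_micro': 0.80}
print_log_data(train, valid, epoch=0)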
def __init__(self, database_name, start_time=10, logger: BaseLogger = None, features_meta=None,
             directed=False, files_path=None, date_format=None, largest_cc=False):
    self._start_time = start_time
    self._features_meta = NODE_FEATURES if features_meta is None else features_meta
    self._largest_cc = largest_cc
    self._date_format = date_format
    self._directed = directed
    self._database_name = database_name + "_directed:" + str(directed) + "_lcc:" + str(largest_cc)
    self._path = os.path.join('data', self._database_name)
    self._logger = logger if logger else PrintLogger("default graphs logger")
    self._files_path = files_path  # location of graphs as files
    # make directories to save features data (as pickles)
    if "data" not in os.listdir("."):
        os.mkdir("data")
    if self._database_name not in os.listdir("data/"):
        os.mkdir(self._path)
    self._logger.debug("graphs initialized")
    self._initiation()
def test_main():
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    gnx = pickle.load(open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb'))
    # keep only the largest connected component
    # (nx.connected_component_subgraphs was removed in NetworkX 2.4; nx.connected_components is equivalent here)
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)
    features = GraphFeatures(gnx, TEST_FEATURES, dir_path="./%s_features_sub" % dataset, logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False, dtype=np.float32, mtype=np.matrix)
    logger.info("Finished")
def __init__(self, graphs: Graphs, feature_pairs, logger: BaseLogger = None):
    self._logger = logger if logger else PrintLogger("default graphs logger")
    self._graphs = graphs
    self._ftr_pairs = feature_pairs
    self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), len(feature_pairs)))
    self._build()
def __init__(self, params):
    self._params = params if type(params) is dict else json.load(open(params, "rt"))
    self._logger = PrintLogger("graph-ad")
    self._temporal_graph = self._build_temporal_graph()
    self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
    self._num_anomalies = len(self._ground_truth) * 2
    self._idx_to_graph = list(self._temporal_graph.graph_names())
    self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
    self._run_ad()
def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger = None):
    self._logger = logger if logger else PrintLogger("default graphs logger")
    self._graphs = graphs
    self._ftr_pairs = feature_pairs
    num_features = graphs.features_matrix(0).shape[1]
    num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
    self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
    self._build()
def __init__(self, graphs: Graphs, logger: BaseLogger = None, size=10, identical_bar=0.6):
    self._logger = logger if logger else PrintLogger("default logger")
    self._size = size  # number of pairs to pick
    self._graphs = graphs
    self._features_matrix = self._get_features_np_matrix()
    self._identical_bar = identical_bar  # a feature is dropped if more than bar*|V| vertices share the same value
    self._features_identicality = []  # percentage of the largest vertex group with the same value, per feature
    self._fill_features_identicality()
    self._best_pairs = self._pick()
def __init__(self, data_path, params):
    # parameters - the dictionary must contain { database: , logger_name: , date_format: , directed: ,
    #   max_connected: , ftr_pairs: , identical_bar: , context_beta: }
    self._params = params
    self._white = params['white_label']
    # number of days represented by one time interval
    self._time_split = self._params['days_split']
    self._all_beta_path = ALL_BETA_PATH + "_split_" + str(self._time_split) + ".pkl"
    self._start_interval = self._params['start_interval']
    # where to save the split graphs
    self._target_path = os.path.join(DATA_TARGET_FOLDER, params['database'], "split_" + str(self._time_split))
    self._logger = PrintLogger(self._params['logger_name'])
    self._params['files_path'] = self._target_path
    self._data_path = data_path
    # split to time intervals only
    self._partition_data()
    self._timed_graph = None
    self.calc_all_times()  # calc all features for all times and save as pickle
    self._time_idx = 0
    # total number of black-labeled samples in the final time interval
    self.num_blacks = sum([val for key, val in Counter(self._all_times_data[-1][0][3]).items()
                           if key != self._white])
def build_model(training_data, training_adj, training_labels, eval_data, eval_adj, eval_labels,
                test_data, test_adj, test_labels, learning_hyperparams, class_weights, graph_params,
                dumping_name, is_nni=False, device=1):
    activations = [learning_hyperparams.activation] * (len(learning_hyperparams.hidden_layers) + 1)
    conf = {"model": learning_hyperparams.model,
            "hidden_layers": learning_hyperparams.hidden_layers,
            "dropout": learning_hyperparams.dropout,
            "lr": learning_hyperparams.learning_rate,
            "weight_decay": learning_hyperparams.l2_regularization,
            "training_mat": training_data, "training_adj": training_adj, "training_labels": training_labels,
            "eval_mat": eval_data, "eval_adj": eval_adj, "eval_labels": eval_labels,
            "test_mat": test_data, "test_adj": test_adj, "test_labels": test_labels,
            "optimizer": learning_hyperparams.optimizer,
            "epochs": learning_hyperparams.epochs,
            "activations": activations,
            "loss_coeffs": learning_hyperparams.loss_coefficients,
            "unary": learning_hyperparams.unary_loss_type,
            "edge_normalization": learning_hyperparams.edge_normalization}
    products_path = os.path.join(os.getcwd(), "logs", *dumping_name, datetime.now().strftime("%Y%m%d_%H%M%S_%f"))
    check_make_dir(products_path)
    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_" + dumping_name[1], path=products_path, level=logging.INFO)], name=None)
    runner = ModelRunner(conf, logger=logger, weights=class_weights, graph_params=graph_params,
                         early_stop=learning_hyperparams.early_stop, is_nni=is_nni,
                         tmp_path=products_path, device=device)
    return runner
def create_features():
    for i in range(21):
        with open(os.path.join('graphs_by_years', 'graph_' + str(i) + '.pkl'), 'rb') as f:
            gnx = pickle.load(f)
        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"}),
        }
        features = GraphFeatures(gnx, features_meta, "/home/dsi/racheli/graph_calculations", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        with open(os.path.join('graphs_by_years', 'mx_' + str(i) + '.pkl'), 'wb') as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
def build_model(training_data, training_labels, test_data, test_labels, adjacency_matrices, hid_features,
                activation, optimizer, epochs, dropout, lr, l2_pen, temporal_pen, dumping_name,
                feature_matrices, is_nni=False):
    optim_name = "SGD"
    if optimizer == optim.Adam:
        optim_name = "Adam"
    conf = {"hid_features": hid_features, "dropout": dropout, "lr": lr, "weight_decay": l2_pen,
            "temporal_pen": temporal_pen, "training_mat": training_data, "training_labels": training_labels,
            "test_mat": test_data, "test_labels": test_labels, "adj_matrices": adjacency_matrices,
            "optimizer": optimizer, "epochs": epochs, "feature_matrices": feature_matrices,
            "activation": activation, "optim_name": optim_name}
    products_path = os.path.join(os.getcwd(), "logs", dumping_name, time.strftime("%Y%m%d_%H%M%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)
    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_%s" % dumping_name, path=products_path, level=logging.INFO)], name=None)
    data_logger = CSVLogger("results_%s" % dumping_name, path=products_path)
    data_logger.info("model_name", "loss", "acc")
    # logger.info('STARTING with lr= {:.4f} '.format(lr) + ' dropout= {:.4f} '.format(dropout) +
    #             ' regularization_l2_pen= {:.4f} '.format(l2_pen) + ' temporal_pen= {:.10f} '.format(temporal_pen) +
    #             ' optimizer= %s ' % optim_name)
    logger.debug('STARTING with lr= {:.4f} '.format(lr) + ' dropout= {:.4f} '.format(dropout) +
                 ' regularization_l2_pen= {:.4f} '.format(l2_pen) + ' temporal_pen= {:.10f} '.format(temporal_pen) +
                 ' optimizer= %s ' % optim_name)
    runner = ModelRunner(conf, logger=logger, data_logger=data_logger, is_nni=is_nni)
    return runner
def _gnx_vec(self, gnx_id, gnx: nx.Graph, node_order):
    final_vec = []
    if self._deg:
        degrees = gnx.degree(gnx.nodes)
        final_vec.append(np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
    if self._in_deg:
        degrees = gnx.in_degree(gnx.nodes)
        final_vec.append(np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
    if self._out_deg:
        degrees = gnx.out_degree(gnx.nodes)
        final_vec.append(np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
    if self._is_external_data and self._external_data.is_value:
        final_vec.append(np.matrix([self._external_data.value_feature(gnx_id, d) for d in node_order]))
    if self._is_ftr:
        name = str(gnx_id)
        gnx_dir_path = os.path.join(self._ftr_path, name)
        if not os.path.exists(gnx_dir_path):
            os.mkdir(gnx_dir_path)
        raw_ftr = GraphFeatures(gnx, self._ftr_meta, dir_path=gnx_dir_path, is_max_connected=False,
                                logger=PrintLogger("logger"))
        raw_ftr.build(should_dump=True)  # build features
        final_vec.append(FeaturesProcessor(raw_ftr).as_matrix(norm_func=log_norm))
    return np.hstack(final_vec)
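# Hedged sketch (assumption, not from the original code): log_norm is passed to
# FeaturesProcessor.as_matrix above and to Graphs.norm_features elsewhere in these snippets, but
# its definition is not shown. A typical definition clamps values away from zero and then takes
# an element-wise logarithm, e.g.:
def log_norm(mx):
    mx = np.array(mx, dtype=float)  # copy so the caller's matrix is not mutated
    mx[mx < 1e-10] = 1e-10          # avoid log(0) on zero or near-zero entries
    return np.log(mx)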
def __init__(self, database_name, csv_source, time_format, time_col, src_col, dst_col, weight_col=None,
             weeks=0, days=0, hours=0, minutes=0, seconds=0, logger=None, time_format_out=None, directed=False):
    self._csv_source = csv_source
    self._database_name = database_name
    self._directed = directed
    self._time_format = time_format
    self._time_col = time_col
    self._src_col = src_col
    self._dst_col = dst_col
    self._weight_col = weight_col
    self._format_out = time_format_out if time_format_out else time_format
    self._timedelta = timedelta(weeks=weeks, days=days, hours=hours, minutes=minutes, seconds=seconds)
    self._logger = logger if logger else PrintLogger()
    self._mg_dictionary = self._break_by_time()
def __init__(self, path, dist_type=DistType.Euclidian, eps=0.01, recall=0.7):
    self._params = {
        'database': 'Refael',
        'files_path': path,
        'date_format': None,  # Twitter
        'directed': True,
        'max_connected': False,
        'logger_name': "logger",
        'ftr_pairs': 300,
        'identical_bar': 0.9,
        'context_beta': 1,
    }
    # self._labels = []
    # self._beta_matrix = None
    self.eps = eps
    self.recall = recall
    self.dit_type = dist_type
    self._logger = PrintLogger(self._params['logger_name'])
    self._graphs = Graphs(self._params['database'], files_path=self._params['files_path'], logger=self._logger,
                          features_meta=ANOMALY_DETECTION_FEATURES, directed=self._params['directed'],
                          date_format=self._params['date_format'], largest_cc=self._params['max_connected'])
    self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES, pick_ftr=RE_PICK_FTR, should_zscore=False)
    self.labels = self._graphs.get_labels()
    # normalize features ---------------------------------
    self._graphs.norm_features(log_norm)
    pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'], logger=self._logger,
                                          identical_bar=self._params['identical_bar'])
    best_pairs = pearson_picker.best_pairs()
    beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'])
    self.beta_matrix = beta.beta_matrix()
def __init__(self, database_name, csv_source, time_format, time_col, src_col, dst_col, subgraph_name_col,
             weight_col=None, label_col=None, weeks=0, days=0, hours=0, minutes=0, seconds=0, logger=None,
             time_format_out=None, directed=False):
    self._labels = {}
    self._order = {}
    self._times = []
    self._times_index = {}
    self._graphs_for_time = {}
    self._subgraph_name_col = subgraph_name_col
    self._csv_source = csv_source
    self._database_name = database_name
    self._directed = directed
    self._time_format = time_format
    self._time_col = time_col
    self._src_col = src_col
    self._dst_col = dst_col
    self._weight_col = weight_col
    self._label_col = label_col
    self._format_out = self._time_format_out(time_format_out)
    self._timedelta = timedelta(weeks=weeks, days=days, hours=hours, minutes=minutes, seconds=seconds)
    self._logger = logger if logger else PrintLogger()
    self._edge_list_dict = self._break_by_time()
    self._mg_dict = self._build_multi_graphs()
    self._number_of_times = len(self._edge_list_dict)
def build_model(rand_test_indices, train_indices, traint, testt, labels, X, adj_tr, adj_te, in_features,
                hid_features, out_features, ds_name, activation, optimizer, epochs, dropout, lr, l2_pen,
                beta, gamma, dumping_name, GS, is_nni=False):
    optim_name = "SGD"
    if optimizer == optim.Adam:
        optim_name = "Adam"
    conf = {"in_features": in_features, "hid_features": hid_features, "out_features": out_features,
            "ds_name": ds_name, "dropout": dropout, "lr": lr, "weight_decay": l2_pen,
            "beta": beta, "gamma": gamma,
            # "training_mat": training_data, "training_labels": training_labels,
            # "test_mat": test_data, "test_labels": test_labels,
            "train_ind": train_indices, "test_ind": rand_test_indices,
            "traint": traint, "testt": testt, "labels": labels, "X": X,
            "adj_tr": adj_tr, "adj_te": adj_te,
            "optimizer": optimizer, "epochs": epochs, "activation": activation, "optim_name": optim_name}
    products_path = os.path.join(os.getcwd(), "logs", dumping_name, time.strftime("%Y%m%d_%H%M%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)
    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_%s" % dumping_name, path=products_path, level=logging.INFO)], name=None)
    data_logger = CSVLogger("results_%s" % dumping_name, path=products_path)
    data_logger.info("model_name", "loss", "acc")
    runner = ModelRunner(conf, GS, logger=logger, data_logger=data_logger, is_nni=is_nni)
    return runner
def create_features(data_name, time_range):
    for i in range(time_range):
        gnx = pickle.load(open("./dataset/" + data_name + "/pkl/gcn_input/" + "graph_" + str(i) + ".pkl", "rb"))
        # with open(os.path.join('data', str(data_name), 'gcn_input', 'graph_' + str(i) + '.pkl'), 'rb') as f:
        #     gnx = pickle.load(f)
        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"})}
        features = GraphFeatures(gnx, features_meta, "./dataset/" + str(data_name) + "/pkl/feature", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)
        pickle.dump(mx, open("./dataset/" + data_name + "/pkl/gcn_input/" + "mx_" + str(i) + ".pkl", "wb"))
        # with open(os.path.join('data', str(data_name), 'gcn_input', 'mx_' + str(i) + '.pkl'), 'wb') as f:
        #     pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
    return
    # with open(os.path.join('data', str(data_name), 'pkl', 'mx_1.pkl'), 'rb') as f:
    #     l = pickle.load(f)
    # print(l[0])
def _calc_features(self, pkl=True):
    # load dictionary if it exists
    if pkl and self._ftr_pkl_name() in os.listdir(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries')):
        self._features_by_time, self._multi_graphs_by_time = \
            pickle.load(open(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries',
                                          self._ftr_pkl_name()), "rb"))
        return
    self._load_database()
    labels = self._database.labels
    # make directory for database
    dir_path = os.path.join(self._base_dir, 'pkl', 'graph_measures', self._params['database_full_name'])
    if self._params['database_full_name'] not in os.listdir(os.path.join(self._base_dir, 'pkl', 'graph_measures')):
        os.mkdir(dir_path)
    # calculate features
    for multi_graph in self._database.multi_graph_by_window(self._params['window_size'],
                                                            self._params['start_time']):
        ftr_tmp_dict = {}
        for name in multi_graph.graph_names():
            raw_ftr = GraphFeatures(multi_graph.get_gnx(name), NODE_FEATURES_ML, dir_path,
                                    is_max_connected=self._params['max_connected'],
                                    logger=PrintLogger(self._params['database_full_name']))
            nodes_and_edges = [multi_graph.node_count(graph_id=name), multi_graph.edge_count(graph_id=name)]
            ftr_tmp_dict[name] = (FeaturesProcessor(raw_ftr).activate_motif_ratio_vec(to_add=nodes_and_edges),
                                  labels[name])
        self._features_by_time.append(ftr_tmp_dict)
        multi_graph.suspend_logger()
        self._multi_graphs_by_time.append(multi_graph)
    pickle.dump((self._features_by_time, self._multi_graphs_by_time),
                open(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries', self._ftr_pkl_name()), "wb"))
def test_feature():
    from loggers import PrintLogger
    from measure_tests.test_graph import get_graph

    gnx = get_graph()
    feat = MultiDimensionalScalingCalculator(gnx, logger=PrintLogger("Keren's Logger"))
    res = feat.build()
    print(res)
def __init__(self, source_file, num_prefix=120, num_suffix=200, delta=(0.2, 0.5, 0.3)):
    self._logger = PrintLogger("NLP-ass1")
    self._delta = delta
    self._source = source_file
    self._num_prefix = num_prefix
    self._num_suffix = num_suffix
    # counters
    self._emission_count, self._transition_count, self._suffix_count = self._get_data()
    self._pos_list = list(set(list(self._transition_count[0].keys()) + [START]))
    self._num_pos = len(self._pos_list)
    self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
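# Hedged sketch (assumption, not from the original code): delta=(d1, d2, d3) reads like the weights
# of deleted-interpolation smoothing over unigram/bigram/trigram tag counts, and self._transition_count
# is assumed here to be a (unigram, bigram, trigram) tuple of dictionaries. The method name and the
# exact smoothing rule below are illustrative only.
def _interp_transition_prob(self, t1, t2, t3):
    uni, bi, tri = self._transition_count
    total = sum(uni.values()) or 1
    p_uni = uni.get(t3, 0) / total
    p_bi = bi.get((t2, t3), 0) / uni.get(t2, 1) if uni.get(t2, 0) else 0.0
    p_tri = tri.get((t1, t2, t3), 0) / bi.get((t1, t2), 1) if bi.get((t1, t2), 0) else 0.0
    d1, d2, d3 = self._delta
    return d1 * p_uni + d2 * p_bi + d3 * p_tri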
def test_neighbor_histogram():
    gnx = sample_graph()
    logger = PrintLogger()
    calc = NthNeighborNodeEdgeHistogramCalculator(2, gnx, logger=logger)
    calc.build()
    n = calc.to_matrix()
    # (self, gnx, name, abbreviations, logger=None):
    # m = calculate_second_neighbor_vector(gnx, colors)
    print('bla')
class FeaturesPicker:
    def __init__(self, graphs: Graphs, logger: BaseLogger = None, size=10, identical_bar=0.6):
        self._logger = logger if logger else PrintLogger("default logger")
        self._size = size  # number of pairs to pick
        self._graphs = graphs
        self._features_matrix = self._get_features_np_matrix()
        self._identical_bar = identical_bar  # a feature is dropped if more than bar*|V| vertices share the same value
        self._features_identicality = []  # percentage of the largest vertex group with the same value, per feature
        self._fill_features_identicality()
        self._best_pairs = self._pick()

    def _get_features_np_matrix(self):
        return self._graphs.features_matrix_by_index(for_all=True)

    # fill best pairs with the most informative pairs of features
    def _pick(self):
        raise NotImplementedError()

    def best_pairs(self):
        return self._best_pairs

    def _fill_features_identicality(self):
        self._logger.debug("start features identicality")
        rows, cols = self._features_matrix.shape
        for i in range(cols):
            self._features_identicality.append(
                collections.Counter(self._features_matrix[:, i].T.tolist()[0]).most_common(1)[0][1] / rows)
        self._logger.debug("end features identicality")

    def _identicality_for(self, feature_index):
        return self._features_identicality[feature_index]

    def _is_feature_relevant(self, feature_index):
        return self._features_identicality[feature_index] < self._identical_bar
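# Hedged sketch (illustrative, not the project's actual PearsonFeaturePicker): FeaturesPicker is
# abstract, so a concrete subclass must implement _pick. One simple rule scores every pair of
# still-relevant feature columns by absolute Pearson correlation and keeps the top self._size pairs.
class AbsCorrelationPicker(FeaturesPicker):
    def _pick(self):
        mx = np.asarray(self._features_matrix, dtype=float)
        relevant = [i for i in range(mx.shape[1]) if self._is_feature_relevant(i)]
        scored = []
        for pos, i in enumerate(relevant):
            for j in relevant[pos + 1:]:
                corr = np.corrcoef(mx[:, i], mx[:, j])[0, 1]
                if not np.isnan(corr):
                    scored.append((abs(corr), i, j))
        scored.sort(reverse=True)
        return [(i, j) for _, i, j in scored[:self._size]]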
def main_clean():
    args = parse_args()
    dataset = "citeseer"
    seed = random.randint(1, 1000000000)
    # "feat_type": "neighbors",
    conf = {
        "kipf": {"hidden": args.hidden, "dropout": args.dropout, "lr": args.lr,
                 "weight_decay": args.weight_decay},
        "hidden_layers": [16],
        "multi_hidden_layers": [100, 35],
        "dropout": 0.6,
        "lr": 0.01,
        "weight_decay": 0.001,
        "dataset": dataset,
        "epochs": args.epochs,
        "cuda": args.cuda,
        "fastmode": args.fastmode,
        "seed": seed
    }
    init_seed(conf['seed'], conf['cuda'])
    dataset_path = os.path.join(PROJ_DIR, "data", dataset)
    products_path = os.path.join(CUR_DIR, "logs", args.prefix + dataset, time.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)
    logger = multi_logger([
        PrintLogger("IdansLogger", level=logging.DEBUG),
        FileLogger("results_%s" % conf["dataset"], path=products_path, level=logging.INFO),
        FileLogger("results_%s_all" % conf["dataset"], path=products_path, level=logging.DEBUG),
    ], name=None)
    data_logger = CSVLogger("results_%s" % conf["dataset"], path=products_path)
    data_logger.info("model_name", "loss", "acc", "train_p")
    runner = ModelRunner(dataset_path, conf, logger=logger, data_logger=data_logger)
    # execute_runner(runner, logger, 5, num_iter=30)
    for train_p in range(5, 90, 10):
        execute_runner(runner, logger, train_p, num_iter=10)
    logger.info("Finished")
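# Hedged sketch (assumption, not from the original code): init_seed is called above but not defined
# in this snippet. A typical implementation seeds Python, NumPy and PyTorch so runs are reproducible,
# optionally including the CUDA generators when a GPU is used.
import random
import numpy as np
import torch

def init_seed(seed, cuda=False):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)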
def calculate_test_feature(calculator, is_max_connected=False):
    from loggers import PrintLogger

    logger = PrintLogger("TestLogger")
    res = {}
    for g_type, gnx in [("directed", get_di_graph()), ("undirected", get_graph())]:
        gnx = filter_gnx(gnx, is_max_connected)
        feat = calculator(gnx, logger=logger)
        res[g_type] = feat.build()
    return res
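# Hedged usage sketch: calculate_test_feature takes a calculator class (not an instance) and builds
# it on both the directed and undirected test graphs. KCoreCalculator is used elsewhere in these
# snippets; any calculator accepting the (gnx, logger=...) constructor should work here.
results = calculate_test_feature(KCoreCalculator)
print(results["directed"], results["undirected"])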
def __init__(self, graphs, scores_list, database_name, logger: BaseLogger = None):
    self._database_name = database_name
    self._logger = logger if logger else PrintLogger("default anomaly picker logger")
    self._graphs = graphs
    self._scores_list = scores_list
    self._anomalies = []
    self._anomalies_calculated = False
def __init__(self, params: AdParams):
    self._base_dir = __file__.replace("/", os.sep)
    self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0])
    self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
    self._params = params
    self._data_name = params.database.DATABASE_NAME
    self._logger = PrintLogger("Anomaly logger")
    self._temporal_graph = self._build_temporal_graph()
    self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
    # self._temporal_graph.filter(
    #     lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
    #     func_input="graph_name")
    self._idx_to_name = list(self._temporal_graph.graph_names())
    self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
    if self._params.vec_type == "motif_ratio":
        self._build_second_method()
    elif self._params.vec_type == "regression":
        self._build_first_method()
def test_graph():
    logger = PrintLogger("Oved's logger")
    path = "test_graphs"
    graphs = Graphs("test - Debug", logger=logger, files_path=path)
    graphs.build()
    G_1 = graphs.get_subgraph("time_1")
    G_2 = graphs.get_subgraph("time_2")
    G_3 = graphs.get_subgraph("time_3")
    stop = 0  # no-op, apparently left as a debugger breakpoint anchor for inspecting the subgraphs
def __init__(self, edge_path, dir_path, features, acc=True, directed=False, gpu=False, device=2,
             verbose=True, params=None):
    """
    A class used to calculate features for a given graph, input as a text-like file.

    :param edge_path: str
        Path to the graph edges file (text-like file, e.g. txt or csv), from which the graph is built
        using networkx. The graph must be unweighted. If its vertices are not [0, 1, ..., n-1], they are
        mapped to become [0, 1, ..., n-1] and the mapping is saved.
        Every row in the edges file should include "source_id,destination_id", without a header row.
    :param dir_path: str
        Path to the directory in which the feature calculations will be (or already are) located.
    :param features: list of strings
        List of the names of each feature. Could be any name from features_meta.py or "additional_features".
    :param acc: bool
        Whether to run the accelerated features, assuming it is possible to do so.
    :param directed: bool
        Whether the built graph is directed.
    :param gpu: bool
        Whether to use GPUs, assuming it is possible to do so (i.e. the GPU exists and the CUDA version matches).
    :param device: int
        If gpu is True, indicates on which GPU device to calculate. Will raise an error if the index
        doesn't match the available GPUs.
    :param verbose: bool
        Whether to print messages indicating the phases of the calculations.
    :param params: dict, or None
        For clique detection uses, this is a dictionary of the graph settings
        (size, directed, clique size, edge probability). Ignored for any other use.
    """
    self._dir_path = dir_path
    self._features = features  # by their name as it appears in accelerated_features_meta
    self._gpu = gpu
    self._device = device
    self._verbose = verbose
    self._logger = multi_logger([PrintLogger("Logger", level=logging.DEBUG),
                                 FileLogger("FLogger", path=dir_path, level=logging.INFO)], name=None) \
        if verbose else None
    self._params = params
    self._load_graph(edge_path, directed)
    self._get_feature_meta(features, acc)  # acc determines whether to use the accelerated features
    self._adj_matrix = None
    self._raw_features = None
    self._other_features = None
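# Hedged usage sketch: the enclosing class name and its build/result methods are not shown in this
# snippet, so "FeatureCalculator", the edge-file path and the feature names below are hypothetical
# and only the constructor signature above is taken from the code.
calc = FeatureCalculator("edges.csv", "./measures_out", ["degree", "k_core"],
                         acc=False, directed=False, gpu=False, verbose=True)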
def __init__(self, path, eps=0.01, recall=0.7):
    self._params = {
        'database': 'Refael',
        'files_path': path,
        'date_format': None,  # Twitter
        'directed': True,
        'max_connected': False,
        'logger_name': "logger",
        'ftr_pairs': 300,
        'identical_bar': 0.95,
        'context_beta': 1,
    }
    self._logger = PrintLogger(self._params['logger_name'])
    self._graphs = Graphs(self._params['database'], files_path=self._params['files_path'], logger=self._logger,
                          features_meta=ANOMALY_DETECTION_FEATURES, directed=self._params['directed'],
                          date_format=self._params['date_format'], largest_cc=self._params['max_connected'])
    self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES, pick_ftr=RE_PICK_FTR, should_zscore=False)
    # normalize features ---------------------------------
    self._graphs.norm_features(log_norm)
    # labels
    self.labels = self._graphs.get_labels()
    pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'], logger=self._logger,
                                          identical_bar=self._params['identical_bar'])
    best_pairs = pearson_picker.best_pairs()
    self._pairs_header = best_pairs
    if os.path.exists(BETA_PKL_P):
        self._beta_matrix = pickle.load(open(BETA_PKL_P, "rb"))
    else:
        beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'])
        self._beta_matrix = beta.beta_matrix()
        pickle.dump(self._beta_matrix, open(BETA_PKL_P, "wb"))
    self._beta_df = self._beta_matrix_to_df(header=self._pairs_header)
    # self._best_beta_df = self._best_pairs_df()
    self._best_beta_df = self._beta_df
    res_df = self._learn_RF(self._pca_df(self._best_beta_df, graph_data=True, min_nodes=10))
    self.plot_learning_df(res_df)