class BetaCalculator:
    def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger=None):
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default graphs logger")
        self._graphs = graphs
        self._ftr_pairs = feature_pairs
        num_features = graphs.features_matrix(0).shape[1]
        num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
        self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
        self._build()

    def _build(self):
        for graph_index, g_id in enumerate(self._graphs.graph_names()):
            self._logger.debug("calculating beta vec for:\t" + g_id)
            self._beta_matrix[graph_index, :] = self._calc_beta(g_id)

    def _calc_beta(self, gid):
        raise NotImplementedError()

    def beta_matrix(self):
        return self._beta_matrix

    def to_file(self, file_name):
        # "rw" is not a valid open() mode; write mode is intended here
        with open(file_name, "w") as out_file:
            for i in range(self._graphs.number_of_graphs()):
                out_file.write(self._graphs.index_to_name(i))  # graph_name
                for j in range(self._beta_matrix.shape[1]):  # works even when feature_pairs is None
                    out_file.write("\t" + str(self._beta_matrix[i][j]))  # beta_vector, tab-separated
                out_file.write("\n")
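BetaCalculator above is abstract (_calc_beta raises NotImplementedError). Below is a minimal sketch of a concrete subclass, assuming a feature_pairs list was supplied and that features_matrix accepts a graph id; the per-pair correlation score is only illustrative, not the project's actual beta definition.

class PairCorrelationBetaCalculator(BetaCalculator):
    def _calc_beta(self, gid):
        # features_matrix(gid) is assumed to return a (num_nodes, num_features) matrix
        mx = np.asarray(self._graphs.features_matrix(gid), dtype=float)
        beta = np.zeros(self._beta_matrix.shape[1])
        for row, (i, j) in enumerate(self._ftr_pairs):
            # illustrative per-pair score: Pearson correlation between the two feature columns
            beta[row] = np.corrcoef(mx[:, i], mx[:, j])[0, 1]
        return beta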
Example #2
def print_log_data(train_results,
                   valid_results,
                   test_results=None,
                   epoch=None):
    if test_results is None:
        PrintLogger().debug(
            'Epoch: {:04d} '.format(epoch + 1) +
            'loss_train: {:.4f} '.format(train_results['loss']) +
            'temp_loss_train: {:.4f} '.format(train_results['tempo_loss']) +
            'f1_macro_train: {} '.format(train_results['f1_score_macro']) +
            'f1_micro_train {} '.format(train_results['f1_score_micro']) +
            'loss_valid: {:.4f} '.format(valid_results['loss']) +
            'temp_loss_valid: {:.4f} '.format(valid_results['tempo_loss']) +
            'f1_macro_valid: {} '.format(valid_results['f1_score_macro']) +
            'f1_micro_valid {} '.format(valid_results['f1_score_micro']))
    else:
        PrintLogger().debug(
            'loss_train: {:.4f} '.format(train_results['loss'].item()) +
            'temp_loss_train: {:.4f} '.format(
                train_results['tempo_loss'].item()) +
            'f1_macro_train: {} '.format(train_results['f1_score_macro']) +
            'f1_micro_train {} '.format(train_results['f1_score_micro']) +
            'loss_valid: {:.4f} '.format(valid_results['loss'].item()) +
            'temp_loss_valid: {:.4f} '.format(
                valid_results['tempo_loss'].item()) +
            'f1_macro_valid: {} '.format(valid_results['f1_score_macro']) +
            'f1_micro_valid {} '.format(valid_results['f1_score_micro']) +
            'reg_loss_test: {:.4f} '.format(test_results['loss'].item()) +
            'temp_loss_test: {:.4f} '.format(test_results['tempo_loss'].item())
            + 'f1_macro_test: {} '.format(test_results['f1_score_macro']) +
            'f1_micro_test {} '.format(test_results['f1_score_micro']))
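A hypothetical call to print_log_data, assuming the result dictionaries carry the keys formatted above ('loss', 'tempo_loss', 'f1_score_macro', 'f1_score_micro'); the numbers are placeholders.

train = {'loss': 0.52, 'tempo_loss': 0.03, 'f1_score_macro': 0.71, 'f1_score_micro': 0.74}
valid = {'loss': 0.61, 'tempo_loss': 0.04, 'f1_score_macro': 0.68, 'f1_score_micro': 0.70}
print_log_data(train, valid, epoch=0)  # test_results omitted, so the epoch-prefixed branch runs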
Example #3
    def __init__(self,
                 database_name,
                 start_time=10,
                 logger: BaseLogger = None,
                 features_meta=None,
                 directed=False,
                 files_path=None,
                 date_format=None,
                 largest_cc=False):
        self._start_time = start_time
        self._features_meta = NODE_FEATURES if features_meta is None else features_meta
        self._largest_cc = largest_cc
        self._date_format = date_format
        self._directed = directed
        self._database_name = database_name + "_directed:" + str(
            directed) + "_lcc:" + str(largest_cc)
        self._path = os.path.join('data', self._database_name)
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default graphs logger")
        self._files_path = files_path  # location of graphs as files

        # make directories to save features data (as pickles)
        if "data" not in os.listdir("."):
            os.mkdir("data")
        if self._database_name not in os.listdir("data/"):
            os.mkdir(self._path)
        self._logger.debug("graphs initialized")
        self._initiation()
Example #4
def test_main():
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    gnx = pickle.load(open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb'))

    # connected_component_subgraphs was removed in networkx 2.4; use connected_components instead
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)

    features = GraphFeatures(gnx,
                             TEST_FEATURES,
                             dir_path="./%s_features_sub" % dataset,
                             logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False,
                                     dtype=np.float32,
                                     mtype=np.matrix)
    logger.info("Finished")
Example #5
 def __init__(self, graphs: Graphs, feature_pairs, logger: BaseLogger=None):
     if logger:
         self._logger = logger
     else:
         self._logger = PrintLogger("default graphs logger")
     self._graphs = graphs
     self._ftr_pairs = feature_pairs
     self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), len(feature_pairs)))
     self._build()
 def __init__(self, params):
     self._params = params if type(params) is dict else json.load(open(params, "rt"))
     self._logger = PrintLogger("graph-ad")
     self._temporal_graph = self._build_temporal_graph()
     self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
     self._num_anomalies = len(self._ground_truth)*2
     self._idx_to_graph = list(self._temporal_graph.graph_names())
     self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
     self._run_ad()
 def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger=None):
     if logger:
         self._logger = logger
     else:
         self._logger = PrintLogger("default graphs logger")
     self._graphs = graphs
     self._ftr_pairs = feature_pairs
     num_features = graphs.features_matrix(0).shape[1]
     num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
     self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
     self._build()
 def __init__(self, graphs: Graphs, logger: BaseLogger = None, size=10, identical_bar=0.6):
     if logger:
         self._logger = logger
     else:
         self._logger = PrintLogger("default logger")
     self._size = size  # number of pairs to pick
     self._graphs = graphs
     self._features_matrix = self._get_features_np_matrix()
     self._identical_bar = identical_bar  # if a feature has identical values for more than bar*|V| vertices, it is dropped
     self._features_identicality = []  # percentage of the largest vertex group with the same value, per feature
     self._fill_features_identicality()
     self._best_pairs = self._pick()
Example #9
    def __init__(self, data_path, params):
        # the params dictionary must contain the keys: database, logger_name, date_format, directed,
        # max_connected, ftr_pairs, identical_bar, context_beta (see the example dict after this constructor)
        self._params = params
        self._white = params['white_label']
        # number of days represented by one time interval
        self._time_split = self._params['days_split']
        self._all_beta_path = ALL_BETA_PATH + "_split_" + str(
            self._time_split) + ".pkl"
        self._start_interval = self._params['start_interval']
        # where to save splitted graph
        self._target_path = os.path.join(DATA_TARGET_FOLDER,
                                         params['database'],
                                         "split_" + str(self._time_split))
        self._logger = PrintLogger(self._params['logger_name'])
        self._params['files_path'] = self._target_path
        self._data_path = data_path
        # split to time intervals - only
        self._partition_data()
        self._timed_graph = None

        self.calc_all_times()  # calc all features for all times and save as pickle
        self._time_idx = 0
        # TOTAL NUMBER OF BLACK IN FINAL TIME
        self.num_blacks = sum([
            val
            for key, val in Counter(self._all_times_data[-1][0][3]).items()
            if key != self._white
        ])
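A hypothetical params dictionary for the constructor above. The first eight keys are taken from the comment and from the other examples in this listing; white_label, days_split and start_interval are read by the code but their values here are assumed placeholders.

params = {
    'database': 'Refael',
    'logger_name': 'logger',
    'date_format': None,
    'directed': True,
    'max_connected': False,
    'ftr_pairs': 300,
    'identical_bar': 0.9,
    'context_beta': 1,
    'white_label': 0,       # assumed; the code reads params['white_label']
    'days_split': 7,        # assumed; number of days per time interval
    'start_interval': 0,    # assumed; the code reads params['start_interval']
}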
Example #10
def build_model(training_data, training_adj, training_labels, eval_data, eval_adj, eval_labels,
                test_data, test_adj, test_labels, learning_hyperparams, class_weights,
                graph_params, dumping_name, is_nni=False, device=1):
    activations = [learning_hyperparams.activation] * (len(learning_hyperparams.hidden_layers) + 1)
    conf = {"model": learning_hyperparams.model, "hidden_layers": learning_hyperparams.hidden_layers,
            "dropout": learning_hyperparams.dropout, "lr": learning_hyperparams.learning_rate,
            "weight_decay": learning_hyperparams.l2_regularization, "training_mat": training_data,
            "training_adj": training_adj, "training_labels": training_labels,
            "eval_mat": eval_data, "eval_adj": eval_adj, "eval_labels": eval_labels,
            "test_mat": test_data, "test_adj": test_adj, "test_labels": test_labels,
            "optimizer": learning_hyperparams.optimizer, "epochs": learning_hyperparams.epochs,
            "activations": activations, "loss_coeffs": learning_hyperparams.loss_coefficients,
            "unary": learning_hyperparams.unary_loss_type,
            "edge_normalization": learning_hyperparams.edge_normalization}

    products_path = os.path.join(os.getcwd(), "logs", *dumping_name, datetime.now().strftime("%Y%m%d_%H%M%S_%f"))
    check_make_dir(products_path)

    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_" + dumping_name[1], path=products_path, level=logging.INFO)], name=None)

    runner = ModelRunner(conf, logger=logger, weights=class_weights, graph_params=graph_params,
                         early_stop=learning_hyperparams.early_stop, is_nni=is_nni, tmp_path=products_path,
                         device=device)
    return runner
Example #11
def create_features():
    for i in range(21):
        with open(os.path.join('graphs_by_years', 'graph_' + str(i) + '.pkl'),
                  'rb') as f:
            gnx = pickle.load(f)

        logger = PrintLogger("MyLogger")

        features_meta = {
            "page_rank":
            FeatureMeta(PageRankCalculator, {"pr"}),
            "general":
            FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree":
            FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core":
            FeatureMeta(KCoreCalculator, {"kc"}),
        }

        features = GraphFeatures(gnx,
                                 features_meta,
                                 "/home/dsi/racheli/graph_calculations",
                                 logger=logger)
        features.build()

        mx = features.to_matrix(mtype=np.matrix)

        with open(os.path.join('graphs_by_years', 'mx_' + str(i) + '.pkl'),
                  'wb') as f:
            pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #12
def build_model(training_data, training_labels, test_data, test_labels, adjacency_matrices,
                hid_features, activation, optimizer, epochs, dropout, lr, l2_pen, temporal_pen,
                dumping_name, feature_matrices, is_nni=False):
    optim_name="SGD"
    if optimizer==optim.Adam:
        optim_name = "Adam"
    conf = {"hid_features": hid_features, "dropout": dropout, "lr": lr, "weight_decay": l2_pen,
            "temporal_pen": temporal_pen,
            "training_mat": training_data, "training_labels": training_labels,
            "test_mat": test_data, "test_labels": test_labels, "adj_matrices": adjacency_matrices,
            "optimizer": optimizer, "epochs": epochs, "feature_matrices": feature_matrices, "activation": activation,"optim_name":optim_name}

    products_path = os.path.join(os.getcwd(), "logs", dumping_name, time.strftime("%Y%m%d_%H%M%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)

    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_%s" % dumping_name, path=products_path, level=logging.INFO)], name=None)

    data_logger = CSVLogger("results_%s" % dumping_name, path=products_path)
    data_logger.info("model_name", "loss", "acc")

    ##
    logger.info('STARTING with lr= {:.4f} '.format(lr) + ' dropout= {:.4f} '.format(dropout) +
                ' regularization_l2_pen= {:.4f} '.format(l2_pen) +
                ' temporal_pen= {:.10f} '.format(temporal_pen) + ' optimizer= %s ' % optim_name)
    logger.debug('STARTING with lr= {:.4f} '.format(lr) + ' dropout= {:.4f} '.format(dropout) +
                 ' regularization_l2_pen= {:.4f} '.format(l2_pen) +
                 ' temporal_pen= {:.10f} '.format(temporal_pen) + ' optimizer= %s ' % optim_name)
    ##

    runner = ModelRunner(conf, logger=logger, data_logger=data_logger, is_nni=is_nni)
    return runner
Example #13
    def _gnx_vec(self, gnx_id, gnx: nx.Graph, node_order):
        final_vec = []
        if self._deg:
            degrees = gnx.degree(gnx.nodes)
            final_vec.append(
                np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
        if self._in_deg:
            degrees = gnx.in_degree(gnx.nodes)
            final_vec.append(
                np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
        if self._out_deg:
            degrees = gnx.out_degree(gnx.nodes)
            final_vec.append(
                np.matrix([np.log(degrees[d] + 1e-3) for d in node_order]).T)
        if self._is_external_data and self._external_data.is_value:
            final_vec.append(
                np.matrix([
                    self._external_data.value_feature(gnx_id, d)
                    for d in node_order
                ]))
        if self._is_ftr:
            name = str(gnx_id)
            gnx_dir_path = os.path.join(self._ftr_path, name)
            if not os.path.exists(gnx_dir_path):
                os.mkdir(gnx_dir_path)
            raw_ftr = GraphFeatures(gnx,
                                    self._ftr_meta,
                                    dir_path=gnx_dir_path,
                                    is_max_connected=False,
                                    logger=PrintLogger("logger"))
            raw_ftr.build(should_dump=True)  # build features
            final_vec.append(
                FeaturesProcessor(raw_ftr).as_matrix(norm_func=log_norm))

        return np.hstack(final_vec)
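log_norm is referenced above (FeaturesProcessor(raw_ftr).as_matrix(norm_func=log_norm)) but not defined in this snippet. A plausible sketch, consistent with the log(x + 1e-3) pattern used for the degree features; this is an assumption, not the project's actual definition.

def log_norm(mx):
    # element-wise log with a small epsilon, mirroring np.log(degree + 1e-3) above
    return np.log(np.asarray(mx, dtype=float) + 1e-3)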
Example #14
 def __init__(self,
              database_name,
              csv_source,
              time_format,
              time_col,
              src_col,
              dst_col,
              weight_col=None,
              weeks=0,
              days=0,
              hours=0,
              minutes=0,
              seconds=0,
              logger=None,
              time_format_out=None,
              directed=False):
     self._csv_source = csv_source
     self._database_name = database_name
     self._directed = directed
     self._time_format = time_format
     self._time_col = time_col
     self._src_col = src_col
     self._dst_col = dst_col
     self._weight_col = weight_col
     self._format_out = time_format_out if time_format_out else time_format
     self._timedelta = timedelta(weeks=weeks,
                                 days=days,
                                 hours=hours,
                                 minutes=minutes,
                                 seconds=seconds)
     self._time_col = time_col
     self._src_col = src_col
     self._logger = logger if logger else PrintLogger()
     self._mg_dictionary = self._break_by_time()
    def __init__(self, path, dist_type=DistType.Euclidian, eps=0.01, recall=0.7):

        self._params = {
            'database': 'Refael',
            'files_path': path,
            'date_format': None,  # Twitter
            'directed': True,
            'max_connected': False,
            'logger_name': "logger",
            'ftr_pairs': 300,
            'identical_bar': 0.9,
            'context_beta': 1,
        }

        # self._labels = []
        # self._beta_matrix = None
        self.eps = eps
        self.recall = recall
        self.dit_type = dist_type

        self._logger = PrintLogger(self._params['logger_name'])
        self._graphs = Graphs(self._params['database'], files_path=self._params['files_path'], logger=self._logger,
                              features_meta=ANOMALY_DETECTION_FEATURES, directed=self._params['directed'],
                              date_format=self._params['date_format'], largest_cc=self._params['max_connected'])
        self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES, pick_ftr=RE_PICK_FTR, should_zscore=False)
        self.labels = self._graphs.get_labels()

        # normalize features ---------------------------------
        self._graphs.norm_features(log_norm)

        pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'],
                                              logger=self._logger, identical_bar=self._params['identical_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'])
        self.beta_matrix = beta.beta_matrix()
Example #16
 def __init__(self, database_name, csv_source, time_format, time_col, src_col, dst_col, subgraph_name_col,
              weight_col=None, label_col=None, weeks=0, days=0, hours=0, minutes=0, seconds=0, logger=None,
              time_format_out=None, directed=False):
     self._labels = {}
     self._order = {}
     self._times = []
     self._times_index = {}
     self._graphs_for_time = {}
     self._subgraph_name_col = subgraph_name_col
     self._csv_source = csv_source
     self._database_name = database_name
     self._directed = directed
     self._time_format = time_format
     self._time_col = time_col
     self._src_col = src_col
     self._dst_col = dst_col
     self._weight_col = weight_col
     self._label_col = label_col
     self._format_out = self._time_format_out(time_format_out)
     self._timedelta = timedelta(weeks=weeks, days=days, hours=hours, minutes=minutes, seconds=seconds)
     self._time_col = time_col
     self._src_col = src_col
     self._logger = logger if logger else PrintLogger()
     self._edge_list_dict = self._break_by_time()
     self._mg_dict = self._build_multi_graphs()
     self._number_of_times = len(self._edge_list_dict)
def build_model(rand_test_indices, train_indices, traint, testt, labels, X, adj_tr, adj_te, in_features,
                hid_features, out_features, ds_name, activation, optimizer, epochs, dropout, lr, l2_pen,
                beta, gamma, dumping_name, GS, is_nni=False):
    optim_name = "SGD"
    if optimizer == optim.Adam:
        optim_name = "Adam"
    conf = {"in_features": in_features, "hid_features": hid_features, "out_features": out_features,
            "ds_name": ds_name, "dropout": dropout, "lr": lr, "weight_decay": l2_pen,
            "beta": beta, "gamma": gamma,
            # "training_mat": training_data, "training_labels": training_labels,
            # "test_mat": test_data, "test_labels": test_labels,
            "train_ind": train_indices, "test_ind": rand_test_indices, "traint": traint, "testt": testt,
            "labels": labels, "X": X, "adj_tr": adj_tr, "adj_te": adj_te,
            "optimizer": optimizer, "epochs": epochs, "activation": activation, "optim_name": optim_name}

    products_path = os.path.join(os.getcwd(), "logs", dumping_name, time.strftime("%Y%m%d_%H%M%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)

    logger = multi_logger([
        PrintLogger("MyLogger", level=logging.DEBUG),
        FileLogger("results_%s" % dumping_name, path=products_path, level=logging.INFO)], name=None)

    data_logger = CSVLogger("results_%s" % dumping_name, path=products_path)
    data_logger.info("model_name", "loss", "acc")



    runner = ModelRunner(conf, GS, logger=logger, data_logger=data_logger, is_nni=is_nni)
    return runner
Example #18
def create_features(data_name, time_range):
    for i in range(time_range):
        gnx = pickle.load(open("./dataset/"+data_name+"/pkl/gcn_input/"+"graph_"+str(i)+".pkl","rb"))

        # with open(os.path.join('data',str(data_name),'gcn_input', 'graph_'+str(i)+'.pkl'), 'rb') as f:
        #     gnx = pickle.load(f)
        logger = PrintLogger("MyLogger")
        features_meta = {
            "page_rank": FeatureMeta(PageRankCalculator, {"pr"}),
            "general": FeatureMeta(GeneralCalculator, {"gen"}),
            "Average_Neighbor_Degree": FeatureMeta(AverageNeighborDegreeCalculator, {"avg_nd"}),
            "k_core": FeatureMeta(KCoreCalculator, {"kc"})}
    
        features = GraphFeatures(gnx, features_meta, "./dataset/"+str(data_name)+"/pkl/feature", logger=logger)
        features.build()
        mx = features.to_matrix(mtype=np.matrix)

        pickle.dump(mx, open("./dataset/"+data_name+"/pkl/gcn_input/"+"mx_"+str(i)+".pkl", "wb"))
        # with open(os.path.join('data',str(data_name),'gcn_input','mx_'+str(i)+'.pkl'), 'wb') as f:
        #     pickle.dump(mx, f, protocol=pickle.HIGHEST_PROTOCOL)
            
    return

# with open(os.path.join('data',str(data_name),'pkl', 'mx_1.pkl'), 'rb') as f:
#     l = pickle.load(f)
#
# print (l[0])
    def _calc_features(self, pkl=True):
        # load dictionary if exists
        if pkl and self._ftr_pkl_name() in os.listdir(
                os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries')):
            self._features_by_time, self._multi_graphs_by_time = \
                pickle.load(open(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries',
                                              self._ftr_pkl_name()), "rb"))
            return

        self._load_database()
        labels = self._database.labels
        # make directory for database
        dir_path = os.path.join(self._base_dir, 'pkl', 'graph_measures', self._params['database_full_name'])
        if self._params['database_full_name'] not in os.listdir(os.path.join(self._base_dir, 'pkl', 'graph_measures')):
            os.mkdir(dir_path)

        # calculate features
        for multi_graph in self._database.multi_graph_by_window(self._params['window_size'],
                                                                self._params['start_time']):
            ftr_tmp_dict = {}
            for name in multi_graph.graph_names():
                raw_ftr = GraphFeatures(multi_graph.get_gnx(name), NODE_FEATURES_ML, dir_path,
                                        is_max_connected=self._params['max_connected'],
                                        logger=PrintLogger(self._params['database_full_name']))
                nodes_and_edges = [multi_graph.node_count(graph_id=name), multi_graph.edge_count(graph_id=name)]
                ftr_tmp_dict[name] = (FeaturesProcessor(raw_ftr).activate_motif_ratio_vec(to_add=nodes_and_edges),
                                      labels[name])
            self._features_by_time.append(ftr_tmp_dict)

            multi_graph.suspend_logger()
            self._multi_graphs_by_time.append(multi_graph)

        pickle.dump((self._features_by_time, self._multi_graphs_by_time),
                    open(os.path.join(self._base_dir, 'pkl', 'ftr_by_time_dictionaries', self._ftr_pkl_name()), "wb"))
Example #20
def test_feature():
    from loggers import PrintLogger
    from measure_tests.test_graph import get_graph
    gnx = get_graph()
    feat = MultiDimensionalScalingCalculator(
        gnx, logger=PrintLogger("Keren's Logger"))
    res = feat.build()
    print(res)
Example #21
 def __init__(self,
              source_file,
              num_prefix=120,
              num_suffix=200,
              delta=(0.2, 0.5, 0.3)):
     self._logger = PrintLogger("NLP-ass1")
     self._delta = delta
     self._source = source_file
     self._num_prefix = num_prefix
     self._num_suffix = num_suffix
     # counters
     self._emission_count, self._transition_count, self._suffix_count = self._get_data(
     )
     self._pos_list = list(
         set(list(self._transition_count[0].keys()) + [START]))
     self._num_pos = len(self._pos_list)
     self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
def test_neighbor_histogram():
    gnx = sample_graph()
    logger = PrintLogger()
    calc = NthNeighborNodeEdgeHistogramCalculator(2, gnx, logger=logger)
    calc.build()
    n = calc.to_matrix()
    # (self, gnx, name, abbreviations, logger=None):
    # m = calculate_second_neighbor_vector(gnx, colors)
    print('bla')
Example #23
class FeaturesPicker:
    def __init__(self,
                 graphs: Graphs,
                 logger: BaseLogger = None,
                 size=10,
                 identical_bar=0.6):
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default logger")
        self._size = size  # number of pairs to pick
        self._graphs = graphs
        self._features_matrix = self._get_features_np_matrix()
        self._identical_bar = identical_bar  # if a feature has identical values for more than bar*|V| vertices, it is dropped
        self._features_identicality = []  # percentage of the largest vertex group with the same value, per feature
        self._fill_features_identicality()
        self._best_pairs = self._pick()

    def _get_features_np_matrix(self):
        return self._graphs.features_matrix_by_index(for_all=True)

    # fill best pairs with the most informative pair of features
    def _pick(self):
        raise NotImplementedError()

    def best_pairs(self):
        return self._best_pairs

    def _fill_features_identicality(self):
        self._logger.debug("start features identicality")
        rows, cols = self._features_matrix.shape
        for i in range(cols):
            self._features_identicality.append(
                collections.Counter(self._features_matrix[:, i].T.tolist()
                                    [0]).most_common(1)[0][1] / rows)
        self._logger.debug("end_features identicality")

    def _identicality_for(self, feature_index):
        return self._features_identicality[feature_index]

    def _is_feature_relevant(self, feature_index):
        return self._features_identicality[feature_index] < self._identical_bar
Example #24
def main_clean():
    args = parse_args()
    dataset = "citeseer"

    seed = random.randint(1, 1000000000)
    # "feat_type": "neighbors",
    conf = {
        "kipf": {
            "hidden": args.hidden,
            "dropout": args.dropout,
            "lr": args.lr,
            "weight_decay": args.weight_decay
        },
        "hidden_layers": [16],
        "multi_hidden_layers": [100, 35],
        "dropout": 0.6,
        "lr": 0.01,
        "weight_decay": 0.001,
        "dataset": dataset,
        "epochs": args.epochs,
        "cuda": args.cuda,
        "fastmode": args.fastmode,
        "seed": seed
    }

    init_seed(conf['seed'], conf['cuda'])
    dataset_path = os.path.join(PROJ_DIR, "data", dataset)

    products_path = os.path.join(CUR_DIR, "logs", args.prefix + dataset,
                                 time.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(products_path):
        os.makedirs(products_path)

    logger = multi_logger([
        PrintLogger("IdansLogger", level=logging.DEBUG),
        FileLogger("results_%s" % conf["dataset"],
                   path=products_path,
                   level=logging.INFO),
        FileLogger("results_%s_all" % conf["dataset"],
                   path=products_path,
                   level=logging.DEBUG),
    ],
                          name=None)

    data_logger = CSVLogger("results_%s" % conf["dataset"], path=products_path)
    data_logger.info("model_name", "loss", "acc", "train_p")

    runner = ModelRunner(dataset_path,
                         conf,
                         logger=logger,
                         data_logger=data_logger)
    # execute_runner(runner, logger, 5, num_iter=30)

    for train_p in range(5, 90, 10):
        execute_runner(runner, logger, train_p, num_iter=10)
    logger.info("Finished")
def calculate_test_feature(calculator, is_max_connected=False):
    from loggers import PrintLogger
    logger = PrintLogger("TestLogger")
    res = {}
    for g_type, gnx in [("directed", get_di_graph()),
                        ("undirected", get_graph())]:
        gnx = filter_gnx(gnx, is_max_connected)
        feat = calculator(gnx, logger=logger)
        res[g_type] = feat.build()
    return res
Example #26
 def __init__(self, graphs, scores_list, database_name, logger: BaseLogger = None):
     self._database_name = database_name
     if logger:
         self._logger = logger
     else:
         self._logger = PrintLogger("default anomaly picker logger")
     self._graphs = graphs
     self._scores_list = scores_list
     self._anomalies = []
     self._anomalies_calculated = False
Example #27
    def __init__(self, params: AdParams):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0])
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        # self._temporal_graph.filter(
        #         lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #         func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}

        if self._params.vec_type == "motif_ratio":
            self._build_second_method()
        elif self._params.vec_type == "regression":
            self._build_first_method()
Example #28
def test_graph():
    logger = PrintLogger("Oved's logger")
    path = "test_graphs"
    graphs = Graphs("test - Debug", logger=logger, files_path=path)
    graphs.build()
    G_1 = graphs.get_subgraph("time_1")
    G_2 = graphs.get_subgraph("time_2")
    G_3 = graphs.get_subgraph("time_3")

    stop = 0
Example #29
    def __init__(self,
                 edge_path,
                 dir_path,
                 features,
                 acc=True,
                 directed=False,
                 gpu=False,
                 device=2,
                 verbose=True,
                 params=None):
        """
        A class used to calculate features for a given graph, input as a text-like file.

        :param edge_path: str
        Path to graph edges file (text-like file, e.g. txt or csv), from which the graph is built using networkx.
        The graph must be unweighted. If its vertices are not [0, 1, ..., n-1], they are mapped to become
        [0, 1, ..., n-1] and the mapping is saved.
        Every row in the edges file should include "source_id,destination_id", without a header row.
        :param dir_path: str
        Path to the directory in which the feature calculations will be (or already are) located.
        :param features: list of strings
        List of the names of each feature. Could be any name from features_meta.py or "additional_features".
        :param acc: bool
        Whether to run the accelerated features, assuming it is possible to do so.
        :param directed: bool
        Whether the built graph is directed.
        :param gpu: bool
        Whether to use GPUs, assuming it is possible to do so (i.e. the GPU exists and the CUDA matches).
        :param device: int
        If gpu is True, indicates on which GPU device to calculate. Will return error if the index doesn't match the
        available GPUs.
        :param verbose: bool
        Whether to print things indicating the phases of calculations.
        :param params: dict, or None
        For clique detection uses, this is a dictionary of the graph settings
        (size, directed, clique size, edge probability). Ignored for any other use.
        """

        self._dir_path = dir_path
        self._features = features  # By their name as appears in accelerated_features_meta
        self._gpu = gpu
        self._device = device
        self._verbose = verbose
        self._logger = multi_logger([PrintLogger("Logger", level=logging.DEBUG),
                                     FileLogger("FLogger", path=dir_path, level=logging.INFO)], name=None) \
            if verbose else None
        self._params = params
        self._load_graph(edge_path, directed)
        self._get_feature_meta(
            features,
            acc)  # acc determines whether to use the accelerated features

        self._adj_matrix = None
        self._raw_features = None
        self._other_features = None
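The docstring above describes the edge-file format ("source_id,destination_id" per row, no header). A small sketch of how such a file could be loaded into networkx; it mirrors the described input format, not the class's actual _load_graph implementation.

import networkx as nx

def read_edge_file(edge_path, directed=False):
    # build a directed or undirected graph from a comma-separated, header-less edge list
    graph_cls = nx.DiGraph if directed else nx.Graph
    return nx.read_edgelist(edge_path, delimiter=",", create_using=graph_cls())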
Example #30
    def __init__(self, path, eps=0.01, recall=0.7):

        self._params = {
            'database': 'Refael',
            'files_path': path,
            'date_format': None,  # Twitter
            'directed': True,
            'max_connected': False,
            'logger_name': "logger",
            'ftr_pairs': 300,
            'identical_bar': 0.95,
            'context_beta': 1,
        }

        self._logger = PrintLogger(self._params['logger_name'])
        self._graphs = Graphs(self._params['database'],
                              files_path=self._params['files_path'],
                              logger=self._logger,
                              features_meta=ANOMALY_DETECTION_FEATURES,
                              directed=self._params['directed'],
                              date_format=self._params['date_format'],
                              largest_cc=self._params['max_connected'])
        self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES,
                           pick_ftr=RE_PICK_FTR,
                           should_zscore=False)

        # normalize features ---------------------------------
        self._graphs.norm_features(log_norm)

        # labels
        self.labels = self._graphs.get_labels()

        pearson_picker = PearsonFeaturePicker(
            self._graphs,
            size=self._params['ftr_pairs'],
            logger=self._logger,
            identical_bar=self._params['identical_bar'])
        best_pairs = pearson_picker.best_pairs()
        self._pairs_header = best_pairs

        if os.path.exists(BETA_PKL_P):
            self._beta_matrix = pickle.load(open(BETA_PKL_P, "rb"))
        else:
            beta = LinearContext(self._graphs,
                                 best_pairs,
                                 split=self._params['context_beta'])
            self._beta_matrix = beta.beta_matrix()
            pickle.dump(self._beta_matrix, open(BETA_PKL_P, "wb"))

        self._beta_df = self._beta_matrix_to_df(header=self._pairs_header)
        # self._best_beta_df = self._best_pairs_df()
        self._best_beta_df = self._beta_df
        res_df = self._learn_RF(
            self._pca_df(self._best_beta_df, graph_data=True, min_nodes=10))
        self.plot_learning_df(res_df)