def test_main():
    import os
    import pickle

    import networkx as nx
    import numpy as np

    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    gnx = pickle.load(open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb'))

    # keep only the largest connected component
    # (nx.connected_component_subgraphs was removed in networkx 2.4; taking the
    #  largest node set works on both old and new networkx)
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)

    # TEST_FEATURES is assumed to be defined at module level
    features = GraphFeatures(gnx, TEST_FEATURES, dir_path="./%s_features_sub" % dataset, logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False, dtype=np.float32, mtype=np.matrix)
    logger.info("Finished")
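# Assumed entry point when this test module is run directly; TEST_FEATURES must
# be importable/defined at module level for test_main() to work.
if __name__ == "__main__":
    test_main()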
class TPGAD:
    def __init__(self, params):
        self._params = params if type(params) is dict else json.load(open(params, "rt"))
        self._logger = PrintLogger("graph-ad")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
        self._num_anomalies = len(self._ground_truth) * 2
        self._idx_to_graph = list(self._temporal_graph.graph_names())
        self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
        self._run_ad()

    def _load_ground_truth(self, gt_file):
        df = pd.read_csv(gt_file)
        return {self._temporal_graph.name_to_index(row.anomaly): row.get("score", 1)
                for _, row in df.iterrows()}

    def data_name(self):
        max_connected = "max_connected_" if self._params['features']['max_connected'] else ""
        directed = "directed" if self._params['dataset']['directed'] else "undirected"
        weighted = "weighted_" if self._params['dataset']['weight_col'] is not None else ""
        return f"{self._params['dataset']['name']}_{weighted}{max_connected}{directed}"

    def _build_temporal_graph(self):
        tg_pkl_dir = os.path.join(self._params['general']['pkl_path'], "temporal_graphs")
        tg_pkl_path = os.path.join(tg_pkl_dir, f"{self.data_name()}_tg.pkl")
        if os.path.exists(tg_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(tg_pkl_path, "rb"))
        else:
            tg = TemporalGraph(self.data_name(), self._params['dataset']['filename'],
                               self._params['dataset']['time_format'], self._params['dataset']['time_col'],
                               self._params['dataset']['src_col'], self._params['dataset']['dst_col'],
                               weight_col=self._params['dataset'].get('weight_col', None),
                               weeks=self._params['dataset'].get('week_split', None),
                               days=self._params['dataset'].get('day_split', None),
                               hours=self._params['dataset'].get('hour_split', None),
                               minutes=self._params['dataset'].get('min_split', None),
                               seconds=self._params['dataset'].get('sec_split', None),
                               directed=self._params['dataset']['directed'],
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            if self._params['general']["dump_pkl"]:
                os.makedirs(tg_pkl_dir, exist_ok=True)
                pickle.dump(tg, open(tg_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_tg_feature_matrix(self):
        log_ext = "log_" if self._params['features']['log'] else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")
        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl, "rb"))
        gnx_to_vec = {}
        # create dir for database
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if self._params['general']["dump_pkl"]:
                os.makedirs(gnx_path, exist_ok=True)
            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(
                norm_func=log_norm if self._params['features']['log'] else None)
        if self._params['general']['dump_pkl']:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            pickle.dump(gnx_to_vec, open(mat_pkl, "wb"))
        return gnx_to_vec

    def _get_beta_vec(self, mx_dict, best_pairs):
        self._logger.debug("calculating beta vectors")
        if self._params['beta_vectors']['type'] == "regression":
            beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                 window_size=self._params['beta_vectors']['window_size'])
        elif self._params['beta_vectors']['type'] == "mean_regression":
            beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                     window_size=self._params['beta_vectors']['window_size'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got "
                               f"{self._params['beta_vectors']['type']} while valid options are: "
                               f"regression/mean_regression")
        if self._params['general']['dump_pkl']:
            beta_pkl_dir = os.path.join(self._params['general']['pkl_path'], "beta_matrix")
            beta_pkl_path = os.path.join(beta_pkl_dir, f"{self.data_name()}_beta.pkl")
            os.makedirs(beta_pkl_dir, exist_ok=True)
            pickle.dump(beta.beta_matrix(), open(beta_pkl_path, "wb"))
        self._logger.debug("finished calculating beta vectors")
        return beta

    def _get_graphs_score(self, beta_matrix):
        score_type = self._params['score']['type']
        if score_type == "knn":
            return KnnScore(beta_matrix, self._params['score']['params']['knn']['k'], self.data_name(),
                            window_size=self._params['score']['window_size'])
        elif score_type == "gmm":
            return GmmScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                            n_components=self._params['score']['params']['gmm']['n_components'])
        elif score_type == "local_outlier":
            return LocalOutlierFactorScore(beta_matrix, self.data_name(),
                                           window_size=self._params['score']['window_size'],
                                           n_neighbors=self._params['score']['params']['local_outlier']['n_neighbors'])
        raise RuntimeError(f"invalid value for params[score][type], got {score_type}"
                           f" while valid options are: knn/gmm/local_outlier")

    def _run_ad(self):
        mx_dict = self._calc_tg_feature_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params['feature_pair_picker']['num_pairs'],
                                              logger=self._logger,
                                              identical_bar=self._params['feature_pair_picker']['overlap_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta_matrix = self._get_beta_vec(mx_dict, best_pairs).beta_matrix()
        scores = self._get_graphs_score(beta_matrix).score_list()
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, scores, self.data_name(),
                                             num_anomalies=self._num_anomalies)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh("", truth=self._ground_truth, info_text=str(self._params))
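# A hedged usage sketch for TPGAD. Every key below is read somewhere in the
# class above; the file names and parameter values themselves are hypothetical.
if __name__ == "__main__":
    example_params = {
        "dataset": {"name": "example", "filename": "example_edges.csv", "directed": False,
                    "weight_col": None, "time_format": "%Y-%m-%d", "time_col": "time",
                    "src_col": "src", "dst_col": "dst", "day_split": 1},
        "features": {"max_connected": True, "log": True},
        "general": {"pkl_path": "./pkl", "dump_pkl": True, "FORCE_REBUILD_FEATURES": False},
        "gt": {"filename": "ground_truth.csv"},  # CSV with an 'anomaly' column (optional 'score')
        "beta_vectors": {"type": "regression", "window_size": 25},
        "feature_pair_picker": {"num_pairs": 5, "overlap_bar": 0.9},
        "score": {"type": "knn", "window_size": 25, "params": {"knn": {"k": 10}}},
    }
    TPGAD(example_params)  # runs the full pipeline from __init__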
class MleEstimator:
    def __init__(self, source_file, num_prefix=120, num_suffix=200, delta=(0.2, 0.5, 0.3)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emission_count, self._transition_count, self._suffix_count = self._get_data()
        self._pos_list = list(set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        # self._emission, self._transition, self._prefix, self._suffix = self._calc_probabilities()

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emission = {}
        suffix = {}
        word_counter = 0
        src_file = open(self._source, "rt")  # open file
        for line in src_file:
            t1 = START
            t2 = START
            w_pos = []
            for w_p in line.split():  # break line to [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for word, pos in w_pos:
                word_counter += 1
                # -------- EMISSION ----------
                emission[(word, pos)] = emission.get((word, pos), 0) + 1  # count (word, POS)++
                # --------- SUFFIX -----------
                if word_counter % 10 == 0:
                    suffix[(word[-SUFF:], pos)] = suffix.get((word[-SUFF:], pos), 0) + 1
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos, 0) + 1                      # count(POS)
                transition[1][(t1, pos)] = transition[1].get((t1, pos), 0) + 1          # count(POS_1, POS_2)
                transition[2][(t2, t1, pos)] = transition[2].get((t2, t1, pos), 0) + 1  # count(POS_0, POS_1, POS_2)
                t2 = t1
                t1 = pos
        self._logger.info("get-data - end")
        return emission, transition, suffix

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([word + " " + pos + "\t" + str(count) + "\n"
                          for (word, pos), count in self._emission_count.items()])
        out_e.writelines(["^" + sufi + " " + pos + "\t" + str(count) + "\n"
                          for (sufi, pos), count in self._suffix_count.items()])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([pos + "\t" + str(count) + "\n"
                          for pos, count in self._transition_count[0].items()])
        out_q.writelines([pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos1, pos0), count in self._transition_count[1].items()])
        out_q.writelines([pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos2, pos1, pos0), count in self._transition_count[2].items()])
        out_q.close()
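# A minimal sketch of dumping the raw MLE counts; the corpus path is
# hypothetical, and each input line is expected to hold word/POS tokens.
if __name__ == "__main__":
    estimator = MleEstimator("train_corpus.txt")
    estimator.mle_count_to_txt("e.mle", "q.mle")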
class AnomalyDetection:
    def __init__(self, params: AdParams):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0])
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        # self._temporal_graph.filter(
        #     lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #     func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        if self._params.vec_type == "motif_ratio":
            self._build_second_method()
        elif self._params.vec_type == "regression":
            self._build_first_method()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {self._temporal_graph.name_to_index(g_id): float(val) for g_id, val in gd.items()}
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix_log" + str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc feature matrix per graph
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm) \
                if self._params.log else FeaturesProcessor(gnx_ftr).as_matrix()
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm) \
                if self._params.log else FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec()
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build_first_method(self):
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs, logger=self._logger,
                                              identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                             window_size=self._params.window_correlation)
        beta_matrix = beta.beta_matrix()
        if self._params.score_type == "knn":
            score = KnnScore(beta_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(beta_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:  # self._params.score_type == "local_outlier"
            score = LocalOutlierFactorScore(beta_matrix, self._data_name,
                                            window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())

    def _build_second_method(self):
        self._graph_to_vec = self._calc_vec()
        self._graph_matrix = np.vstack([self._graph_to_vec[name]
                                        for name in self._temporal_graph.graph_names()])
        if self._params.log:
            self._graph_matrix = log_norm(self._graph_matrix)
        if self._params.score_type == "knn":
            score = KnnScore(self._graph_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(self._graph_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:  # self._params.score_type == "local_outlier"
            score = LocalOutlierFactorScore(self._graph_matrix, self._data_name,
                                            window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())
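# A hedged usage sketch for AnomalyDetection. AdParams' constructor is not
# shown in this file, so the helper below assumes a pre-populated params
# object; the two attributes set here are the ones that select the pipeline
# branch and the scorer.
def run_anomaly_detection(params: AdParams):
    params.vec_type = "regression"   # or "motif_ratio"
    params.score_type = "gmm"        # "knn" / "gmm" / "local_outlier"
    AnomalyDetection(params)         # runs the selected pipeline from __init__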
class AnomalyDetectionOperationResearch:
    def __init__(self, params: AdParams, name):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        self._out = open(os.path.join("..", name), "wt")
        self._out.write(",".join(["FN", "TN", "TP", "FP", "recall", "precision", "specificity", "F1",
                                  self._params.attr_string()]) + "\n")
        self._build()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {self._temporal_graph.name_to_index(g_id): float(val) for g_id, val in gd.items()}
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix_log" + str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features (sanitize the name for the file system)
            gnx_name_path = gnx_name.replace(':', '_').replace('/', '_')
            gnx_path = os.path.join(database_pkl_dir, gnx_name_path)
            if gnx_name_path not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm)
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm)
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build(self):
        for lg in [True, False]:
            self._params.log = lg
            for vec_type in ["mean_regression", "regression"]:  # also: motif_ratio
                self._params.vec_type = vec_type
                # note: the original had a trailing comma here, which wrapped the
                # features list in a one-element tuple; removed
                self.features = ANOMALY_DETECTION_FEATURES if self._params.vec_type == "regression" \
                    else MOTIF_FEATURES
                if self._params.vec_type in ("regression", "mean_regression"):
                    mx_dict = self._calc_matrix()
                    concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
                    for ftr_pairs in [3, 4, 5]:  # other grids tried: [1, 2, 3, 4, 5, 10], [5, 10, ..., 50]
                        self._params.ftr_pairs = ftr_pairs
                        for identical in [0.99]:  # other grid tried: [0.7, 0.8, 0.9, 0.95, 0.99]
                            self._params.identical_bar = identical
                            pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs,
                                                                  logger=self._logger,
                                                                  identical_bar=self._params.identical_bar)
                            for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                                self._params.window_correlation = win
                                best_pairs = pearson_picker.best_pairs()
                                if best_pairs is None:
                                    continue
                                if self._params.vec_type == "regression":
                                    beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                                         window_size=self._params.window_correlation)
                                else:
                                    beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                                             window_size=self._params.window_correlation)
                                beta_matrix = beta.beta_matrix()
                                self._pick_anomalies(beta_matrix)
                elif self._params.vec_type == "motif_ratio":
                    self._graph_to_vec = self._calc_vec()
                    beta_matrix = np.vstack([self._graph_to_vec[name]
                                             for name in self._temporal_graph.graph_names()])
                    self._pick_anomalies(beta_matrix)

    def _pick_anomalies(self, beta_matrix):
        for score_type in ["knn", "gmm", "local_outlier"]:
            self._params.score_type = score_type
            if self._params.score_type == "knn":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for k in range(5, min(win, 50) - 1, 5):
                        self._params.KNN_k = k
                        score = KnnScore(beta_matrix, self._params.KNN_k, self._data_name,
                                         window_size=self._params.window_score)
                        self._evaluate_and_log(score)
            elif self._params.score_type == "gmm":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for comp in [1, 2, 3, 4, 5]:
                        self._params.n_components = comp
                        score = GmmScore(beta_matrix, self._data_name,
                                         window_size=self._params.window_score,
                                         n_components=self._params.n_components)
                        self._evaluate_and_log(score)
            elif self._params.score_type == "local_outlier":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for neighbors in range(5, min(win, 50), 5):
                        self._params.n_neighbors = neighbors
                        score = LocalOutlierFactorScore(beta_matrix, self._data_name,
                                                        window_size=self._params.window_score,
                                                        n_neighbors=self._params.n_neighbors)
                        self._evaluate_and_log(score)

    def _evaluate_and_log(self, score):
        # shared tail of the three branches above: score -> anomaly picker -> CSV row
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        truth = [self._temporal_graph.name_to_index(g_id)
                 for g_id in self._params.database.GROUND_TRUTH] \
            if self._params.database.GROUND_TRUTH else None
        FN, TN, TP, FP, recall, precision, specificity, F1 = anomaly_picker.build(truth=truth)
        self._out.write(",".join([str(FN), str(TN), str(TP), str(FP), str(recall), str(precision),
                                  str(specificity), str(F1), self._params.attr_val_string()]) + "\n")
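# The grid search above writes one CSV row per hyperparameter configuration.
# A hedged sketch of kicking it off with a pre-built params object (AdParams
# construction is assumed; the output file name is hypothetical):
def run_grid_search(params: AdParams):
    AnomalyDetectionOperationResearch(params, "grid_search_results.csv")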
class DatasetStat:
    def __init__(self, params: AdParams):
        self._index_ftr = None
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._ground_truth = params.database.GROUND_TRUTH
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        # self._temporal_graph.filter(
        #     lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #     func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        self._graph_to_vec = self._calc_vec()

    def _build_temporal_graph(self):
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec()
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_matrix(self):
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors", database_name + "_matrix.pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix()
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    # map matrix rows to features + a counter when a feature has more than one value
    def _set_index_to_ftr(self):
        gnx_name = next(self._temporal_graph.graph_names())
        gnx = next(self._temporal_graph.graphs())
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        gnx_path = os.path.join(self._base_dir, "pkl", "features", database_name, gnx_name)
        gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                is_max_connected=self._params.max_connected)
        gnx_ftr.build(should_dump=False, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
        if not self._index_ftr:
            sorted_ftr = [f for f in sorted(gnx_ftr) if gnx_ftr[f].is_relevant()]  # fix feature order (names)
            self._index_ftr = []
            for ftr in sorted_ftr:
                len_ftr = len(gnx_ftr[ftr])
                # fill list with (ftr, counter)
                self._index_ftr += self._get_motif_type(ftr, len_ftr) if ftr in ('motif3', 'motif4') \
                    else [(ftr, i) for i in range(len_ftr)]
        return self._index_ftr

    # return [ ... (motif_type, counter) ... ]
    def _get_motif_type(self, motif_type, num_motifs):
        return [(motif_type, i) for i in range(num_motifs)]

    def plot_nodes_by_time(self):
        # collect data for plot
        nodes_count_by_time = self._temporal_graph.node_count()  # num of nodes per time
        edges_count_by_time = self._temporal_graph.edge_count()  # num of edges per time
        len_mg = self._temporal_graph.number_of_graphs()         # num of graphs (times)
        x_axis = list(range(len_mg))                             # [0 ... num of times]
        p = figure(plot_width=600, plot_height=250, title=self._data_name + ", node & edge count",
                   x_axis_label="time", y_axis_label="nodes_count")  # create figure
        p.line(x_axis, nodes_count_by_time, legend="nodes", line_color="blue")   # plot nodes
        p.line(x_axis, edges_count_by_time, legend="edges", line_color="green")  # plot edges
        # mark ground-truth anomalies
        anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
        y = [edges_count_by_time[time] for time in anomalies]
        p.scatter(anomalies, y, legend="anomalies", line_color="red", fill_color="red")
        p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                         enumerate(self._temporal_graph.graph_names())}  # time to graph_name
        p.legend.location = "top_left"
        show(p)

    def plot_timed_mean_std(self):
        NUM_PLOT_FTR = 20
        mat_dict = self._calc_matrix()
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        # sort by highest mean
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        sorted_mean = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])][0:NUM_PLOT_FTR]
        # ----------------------- mean -------------------------
        mean_curves = [[] for _ in range(NUM_PLOT_FTR)]
        std_curves = [[] for _ in range(NUM_PLOT_FTR)]
        for name, mx in mat_dict.items():
            mx_mean = np.mean(mx, 0).tolist()[0]
            mx_std = np.std(mx, 0).tolist()[0]
            for i, idx in enumerate(sorted_mean):
                mean_curves[i].append(mx_mean[idx])
                std_curves[i].append(mx_std[idx])
        x_axis = list(range(self._temporal_graph.number_of_graphs()))  # [0 ... num of times]
        # only feature 16 is plotted here; widen the range to plot all curves
        for i in [16]:
            p = figure(plot_width=600, plot_height=250,
                       title=self._data_name + " std/mean for " + ftrs[sorted_mean[i]],
                       x_axis_label="time", y_axis_label="nodes_count")  # create figure
            p.line(x_axis, mean_curves[i], legend="mean", line_color="blue")
            p.line(x_axis, std_curves[i], legend="std", line_color="green")
            # mark ground-truth anomalies
            anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
            y = [std_curves[i][time] for time in anomalies]
            p.scatter(anomalies, y, legend="anomalies", line_color="red", fill_color="red")
            p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                             enumerate(self._temporal_graph.graph_names())}
            p.legend.location = "top_left"
            show(p)

    def plot_mean_std_heatmap(self):
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        mat_dict = self._calc_matrix()
        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        # sort by highest std
        global_std = {i: m for i, m in enumerate(np.std(all_mx, 0).tolist()[0])}
        sorted_std = [i for i, m in sorted(global_std.items(), key=lambda x: -x[1])][0:30]
        # sort by highest mean
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        sorted_mean = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])][0:30]
        # global max per feature (used for normalization)
        global_max = {i: m for i, m in enumerate(np.max(all_mx, 0).tolist()[0])}
        anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
        # ----------------------- mean -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_mean = {i: m for i, m in enumerate(np.mean(mx, 0).tolist()[0])}
            heat_mx.append([heat_day_mean[i] / global_max[i] for i in sorted_mean])
        plt.subplots(figsize=(20, 15))
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.0005, vmax=0.005)
        plt.xticks(list(range(30)), ftrs[:30], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("mean_heatmap")
        plt.clf()
        # ----------------------- std -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_std = {i: m for i, m in enumerate(np.std(mx, 0).tolist()[0])}
            heat_mx.append([heat_day_std[i] / global_max[i] for i in sorted_std])
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.005, vmax=0.05)
        plt.xticks(list(range(30)), ftrs[:30], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("std_heatmap")

    def plot_features_mean_std(self):
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        # split the matrices into anomalous and normal graphs
        all_list = []
        anomal_list = []
        for name, mx in self._calc_matrix().items():
            if name in self._ground_truth:
                anomal_list.append(mx)
            else:
                all_list.append(mx)
        all_mx = np.vstack(all_list)
        anomal_mx = np.vstack(anomal_list)
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        global_std = {i: m for i, m in enumerate(np.std(all_mx, 0).tolist()[0])}
        sorted_keys = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])]
        # group features with comparable std, at most 6 per group
        groups = []
        prev_val = global_std[sorted_keys[0]]
        sub_group = []
        size_ = 0
        for i in sorted_keys:
            if 100 * prev_val >= global_std[i] >= 0.1 * prev_val and size_ < 6:
                sub_group.append(i)
                size_ += 1
            else:
                prev_val = global_mean[i]
                groups.append(sub_group)
                sub_group = [i]
                size_ = 1
        if sub_group:
            groups.append(sub_group)  # don't drop the last group
        # only group 2 is plotted here; widen the range to plot all groups
        for group_num in [2]:
            curr_ftr = []
            for i in groups[group_num]:
                curr_ftr.append(ftrs[i])
                curr_ftr.append("A_" + ftrs[i])
            mid = []
            bottom = []
            top = []
            for i in groups[group_num]:
                bottom.append(np.percentile(all_mx[:, i], 25, axis=0).tolist()[0])
                bottom.append(np.percentile(anomal_mx[:, i], 25, axis=0).tolist()[0])
                mid.append(np.percentile(all_mx[:, i], 50, axis=0).tolist()[0])
                mid.append(np.percentile(anomal_mx[:, i], 50, axis=0).tolist()[0])
                top.append(np.percentile(all_mx[:, i], 75, axis=0).tolist()[0])
                top.append(np.percentile(anomal_mx[:, i], 75, axis=0).tolist()[0])
            bottom = np.array(bottom)
            mid = np.array(mid)
            top = np.array(top)
            # find the quartiles and IQR for each category
            iqr = top - bottom
            upper = top + 1.5 * iqr
            lower = bottom - 1.5 * iqr
            p = figure(tools="", background_fill_color="#efefef", x_range=curr_ftr, toolbar_location=None,
                       plot_width=600, plot_height=600, title=self._data_name + "_percentile=(25-50-75)")
            colors = ["black", "red"] * int(mid.shape[0] / 2)
            # stems
            p.segment(curr_ftr, upper, curr_ftr, top, line_color=colors)
            p.segment(curr_ftr, lower, curr_ftr, bottom, line_color=colors)
            # boxes
            p.vbar(curr_ftr, 0.7, mid, top, fill_color="#E08E79", line_color=colors)
            p.vbar(curr_ftr, 0.7, bottom, mid, fill_color="#3B8686", line_color=colors)
            # whiskers (almost-0 height rects simpler than segments)
            p.rect(curr_ftr, lower, 0.2, 0.0000001, line_color=colors)
            p.rect(curr_ftr, upper, 0.2, 0.0000001, line_color=colors)
            p.xaxis.major_label_orientation = np.pi / 2
            p.xgrid.grid_line_color = None
            p.ygrid.grid_line_color = "white"
            p.grid.grid_line_width = 2
            p.xaxis.major_label_text_font_size = "12pt"
            show(p)
            # plot = Plot(output_backend="svg")
            # plot.output_backend(p, filename=str(group_num) + "_svg")

    def plot_correlations(self):
        from sklearn import linear_model
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs, logger=self._logger,
                                              identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        for i, j, _ in best_pairs:
            reg = linear_model.LinearRegression().fit(np.transpose(concat_mx[:, i].T),
                                                      np.transpose(concat_mx[:, j].T))
            m = reg.coef_.item()
            b = reg.intercept_.item()
            ftr_i = concat_mx[:, i].T.tolist()[0]
            ftr_j = concat_mx[:, j].T.tolist()[0]
            p = figure(plot_width=600, plot_height=250, title=self._data_name + " regression " + str((i, j)),
                       x_axis_label="time", y_axis_label="nodes_count")  # create figure
            # regression line over the observed x-range (the original plotted a
            # 10-point line against a longer x vector, which bokeh rejects)
            xs = list(range(int(max(ftr_i)) + 1))
            p.line(xs, [m * x + b for x in xs], line_color="blue")
            p.scatter(ftr_i, ftr_j)
            p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                             enumerate(self._temporal_graph.graph_names())}
            p.legend.location = "top_left"
            show(p)
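# A hedged sketch of producing the dataset plots; assumes a pre-built AdParams
# whose GROUND_TRUTH entries are graph names present in the temporal graph.
def plot_dataset_stats(params: AdParams):
    stat = DatasetStat(params)
    stat.plot_nodes_by_time()
    stat.plot_timed_mean_std()
    stat.plot_mean_std_heatmap()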
class MleEstimator:
    def __init__(self, source_file, num_prefix=120, num_suffix=200, delta=(0.2, 0.5, 0.3), gamma=(0.6, 0.4)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._gamma = gamma
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emission_count, self._transition_count, self._prefix_count, self._suffix_count = self._get_data()
        self._pos_list = list(set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        self._emission, self._transition, self._prefix, self._suffix = self._calc_probabilities()

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emission = {}
        prefix = {}
        suffix = {}
        src_file = open(self._source, "rt")  # open file
        for line in src_file:
            # reset the POS history at every sentence boundary
            # (matches the behavior of the count-only estimator above)
            t1 = START
            t2 = START
            # ---------- BREAK -----------
            w_pos = []
            for w_p in line.split():  # break line to [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for word, pos in w_pos:
                # -------- EMISSION ----------
                emission[(word, pos)] = emission.get((word, pos), 0) + 1  # count (word, POS)++
                # --------- PREFIX -----------
                prefix[(word[:PREF], pos)] = prefix.get((word[:PREF], pos), 0) + 1  # count prefixes
                # --------- SUFFIX -----------
                # note: the original read the suffix count out of prefix.get(), a bug
                suffix[(word[-SUFF:], pos)] = suffix.get((word[-SUFF:], pos), 0) + 1  # count suffixes
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos, 0) + 1                      # count(POS)
                transition[1][(t1, pos)] = transition[1].get((t1, pos), 0) + 1          # count(POS_1, POS_2)
                transition[2][(t2, t1, pos)] = transition[2].get((t2, t1, pos), 0) + 1  # count(POS_0, POS_1, POS_2)
                t2 = t1
                t1 = pos
        # keep only the K most common prefixes/suffixes
        prefix = {key: count for i, (key, count) in
                  enumerate(sorted(prefix.items(), key=lambda x: -x[1])) if i < self._num_prefix}
        suffix = {key: count for i, (key, count) in
                  enumerate(sorted(suffix.items(), key=lambda x: -x[1])) if i < self._num_suffix}
        self._logger.info("get-data - end")
        return emission, transition, prefix, suffix

    @staticmethod
    def _my_log(x):
        if x == 0:
            return -100
        if x == 1:
            return -0.001
        return np.log(x)

    def _calc_probabilities(self):
        self._logger.info("calc-probabilities - start")
        transition_prob = {}
        # -------- EMISSION ----------
        # e(word| pos)
        emission_prob = {(word, pos): ((1 - CUT) * w_p_count / self._transition_count[0][pos]) + CUT
                         for (word, pos), w_p_count in self._emission_count.items()}
        # --------- PREFIX -----------
        # given word [w_1, w_2, ..., w_n-1, w_n]
        # e(w_1, w_2| pos)
        prefix_bi_prob = {(pre, pos): ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
                          for (pre, pos), s_p_count in self._prefix_count.items()}
        # e(w_n-1, w_n| pos)
        suffix_bi_prob = {(sufi, pos): ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
                          for (sufi, pos), s_p_count in self._suffix_count.items()}
        # ------- TRANSITION ---------
        sum_words = np.sum(list(self._transition_count[0].values()))
        # sequence = [pos2, pos1, pos0]
        # q(pos0)
        transition_prob[0] = {pos: ((1 - CUT) * pos_count / sum_words) + CUT
                              for pos, pos_count in self._transition_count[0].items()}
        # q(pos0| pos1)
        transition_prob[1] = {(pos1, pos0): ((1 - CUT) * count / self._transition_count[0][pos1]) + CUT
                              for (pos1, pos0), count in self._transition_count[1].items()
                              if pos1 in self._transition_count[0]}
        # q(pos0| pos2, pos1)
        transition_prob[2] = {(pos2, pos1, pos0):
                              ((1 - CUT) * count / self._transition_count[1][(pos2, pos1)]) + CUT
                              for (pos2, pos1, pos0), count in self._transition_count[2].items()
                              if (pos2, pos1) in self._transition_count[1]}
        self._logger.info("calc-probabilities - end")
        return emission_prob, transition_prob, prefix_bi_prob, suffix_bi_prob

    def emission(self, word_pos: tuple, log=False):
        word, pos = word_pos
        # if there is a value e(word| pos), use it
        if (word, pos) in self._emission:
            return self._my_log(self._emission[word_pos]) if log else self._emission[word_pos]
        # otherwise fall back to the prefix estimate e(w_1, w_2| pos)
        pref = word[:PREF]
        if (pref, pos) in self._prefix:
            return self._my_log(self._prefix[(pref, pos)]) if log else self._prefix[(pref, pos)]
        # otherwise fall back to the suffix estimate e(w_n-1, w_n| pos)
        suf = word[-SUFF:]
        if (suf, pos) in self._suffix:
            return self._my_log(self._suffix[(suf, pos)]) if log else self._suffix[(suf, pos)]
        return self._my_log(0) if log else 0

    def transition(self, pos_sequence: tuple, log=False):
        # break sequence
        pos0 = pos_sequence[-1]
        pos1 = pos_sequence[-2]
        pos2 = pos_sequence[-3] if len(pos_sequence) > 2 else None
        # calculate: d1*q(pos0) + d2*q(pos0| pos1) + d3*q(pos0| pos2, pos1)
        tran_0 = self._delta[0] * self._transition[0].get(pos0, 0)
        tran_1 = self._delta[1] * self._transition[1].get((pos1, pos0), 0)
        tran_2 = self._delta[2] * self._transition[2].get((pos2, pos1, pos0), 0) if pos2 else 0
        total = tran_0 + tran_1 + tran_2
        return self._my_log(total) if log else total

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([word + " " + pos + "\t" + str(count) + "\n"
                          for (word, pos), count in self._emission_count.items()])
        out_e.writelines(["^" + pref + " " + pos + "\t" + str(count) + "\n"
                          for (pref, pos), count in self._prefix_count.items()])
        out_e.writelines(["^" + sufi + " " + pos + "\t" + str(count) + "\n"
                          for (sufi, pos), count in self._suffix_count.items()])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([pos + "\t" + str(count) + "\n"
                          for pos, count in self._transition_count[0].items()])
        out_q.writelines([pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos1, pos0), count in self._transition_count[1].items()])
        out_q.writelines([pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos2, pos1, pos0), count in self._transition_count[2].items()])
        out_q.close()

    def pred_viterbi(self, sequence, log=False):
        self._logger.info("Viterbi - START...")
        self._logger.info("Viterbi - INITIALIZATION...")
        # ------------ INITIALIZATION --------------
        len_seq = len(sequence) + 1
        base_score = self._my_log(0) if log else 0
        v_mx = [[[(base_score, (-1, self._pos_idx[START], self._pos_idx[START]))
                  for _ in range(self._num_pos)]
                 for _ in range(self._num_pos)]
                for _ in range(len_seq)]
        bp = (-1, self._pos_idx[START], self._pos_idx[START])
        base_score = self._my_log(1) if log else 1
        v_mx[0][self._pos_idx[START]][self._pos_idx[START]] = (base_score, bp)
        self._logger.info("Viterbi - FORWARD...")
        # ------- RECURSIVE STEP / FORWARD ---------
        print("Viterbi - forward: " + str(sequence) + "\nProgress: ", end="")
        for i in range(1, len_seq):
            print("." * (len(sequence[i - 1]) + 3) + "|", end="")
            for j, pos2 in enumerate(self._pos_list):
                for k, pos1 in enumerate(self._pos_list):
                    score, bp = self._max_and_bp(v_mx, i, sequence[i - 1], j, pos2, pos1, log=log)
                    bp = (i - 1, bp, j)
                    v_mx[i][j][k] = (score, bp)
        print(" -- forward completed --")
        self._logger.info("Viterbi - BACKWARDS...")
        # ------- REPRODUCTION / BACKWARDS ---------
        # find max and argmax at v_mx[last_layer]
        max_val = self._my_log(0) if log else 0
        max_i = 0
        max_j = 0
        for i in range(self._num_pos):
            for j in range(self._num_pos):
                if v_mx[len_seq - 1][i][j][0] > max_val:
                    max_val = v_mx[len_seq - 1][i][j][0]
                    max_i = i
                    max_j = j
        # reconstruct the part-of-speech sequence by following back-pointers
        prediction = [self._pos_list[max_i], self._pos_list[max_j]]
        ps = v_mx[len_seq - 1][max_i][max_j][1]
        for word_idx in range(len_seq - 1, 0, -1):
            curr_pos = self._pos_list[ps[1]]
            if curr_pos == START:
                break
            prediction = [curr_pos] + prediction
            ps = v_mx[ps[0]][ps[1]][ps[2]][1]
        return prediction

    def _max_and_bp(self, v_mx, word_idx, word, pos2_idx, pos2, pos1, log=False):
        # given a word w_n and (pos2, pos1):
        # maximize over pos_i the score of w_n being pos1, coming after a pos2 word
        # scores = V(w_n-1, pos_i, pos2) * q(pos1| pos_i, pos2) * e(w_n| pos1), i = 0..num_pos
        if log:
            scores = [v_mx[word_idx - 1][i][pos2_idx][0]
                      + (self._gamma[1] * self.transition((self._pos_list[i], pos2, pos1), log=log)
                         + self._gamma[0] * self.emission((word, pos1), log=log))
                      for i in range(self._num_pos)]
        else:
            scores = [v_mx[word_idx - 1][i][pos2_idx][0]
                      * self.transition((self._pos_list[i], pos2, pos1), log=log)
                      * self.emission((word, pos1), log=log)
                      for i in range(self._num_pos)]
        return np.max(scores), np.argmax(scores)
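# A minimal sketch of training the estimator and tagging one sentence with the
# Viterbi decoder above; the corpus path is hypothetical and the input format
# is the word/POS-per-token format parsed by _get_data.
if __name__ == "__main__":
    mle = MleEstimator("train_corpus.txt")
    mle.mle_count_to_txt("e.mle", "q.mle")
    print(mle.pred_viterbi(["The", "dog", "barks"], log=True))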