def test_main():
    import os
    import pickle

    import networkx as nx
    import numpy as np

    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    gnx = pickle.load(open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb'))

    # keep only the largest connected component
    # (nx.connected_component_subgraphs was removed in networkx 2.4; taking the
    #  largest node set works on both old and new networkx)
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)

    # TEST_FEATURES is assumed to be defined at module level
    features = GraphFeatures(gnx, TEST_FEATURES, dir_path="./%s_features_sub" % dataset, logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False, dtype=np.float32, mtype=np.matrix)
    logger.info("Finished")
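# Assumed entry point when this test module is run directly; TEST_FEATURES must
# be importable/defined at module level for test_main() to work.
if __name__ == "__main__":
    test_main()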
class TPGAD:
    def __init__(self, params):
        self._params = params if type(params) is dict else json.load(open(params, "rt"))
        self._logger = PrintLogger("graph-ad")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
        self._num_anomalies = len(self._ground_truth) * 2
        self._idx_to_graph = list(self._temporal_graph.graph_names())
        self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
        self._run_ad()

    def _load_ground_truth(self, gt_file):
        df = pd.read_csv(gt_file)
        return {self._temporal_graph.name_to_index(row.anomaly): row.get("score", 1)
                for _, row in df.iterrows()}

    def data_name(self):
        max_connected = "max_connected_" if self._params['features']['max_connected'] else ""
        directed = "directed" if self._params['dataset']['directed'] else "undirected"
        weighted = "weighted_" if self._params['dataset']['weight_col'] is not None else ""
        return f"{self._params['dataset']['name']}_{weighted}{max_connected}{directed}"

    def _build_temporal_graph(self):
        tg_pkl_dir = os.path.join(self._params['general']['pkl_path'], "temporal_graphs")
        tg_pkl_path = os.path.join(tg_pkl_dir, f"{self.data_name()}_tg.pkl")
        if os.path.exists(tg_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(tg_pkl_path, "rb"))
        else:
            tg = TemporalGraph(self.data_name(), self._params['dataset']['filename'],
                               self._params['dataset']['time_format'], self._params['dataset']['time_col'],
                               self._params['dataset']['src_col'], self._params['dataset']['dst_col'],
                               weight_col=self._params['dataset'].get('weight_col', None),
                               weeks=self._params['dataset'].get('week_split', None),
                               days=self._params['dataset'].get('day_split', None),
                               hours=self._params['dataset'].get('hour_split', None),
                               minutes=self._params['dataset'].get('min_split', None),
                               seconds=self._params['dataset'].get('sec_split', None),
                               directed=self._params['dataset']['directed'],
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            if self._params['general']["dump_pkl"]:
                os.makedirs(tg_pkl_dir, exist_ok=True)
                pickle.dump(tg, open(tg_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_tg_feature_matrix(self):
        log_ext = "log_" if self._params['features']['log'] else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")
        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl, "rb"))
        gnx_to_vec = {}
        # create dir for database
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if self._params['general']["dump_pkl"]:
                os.makedirs(gnx_path, exist_ok=True)
            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(
                norm_func=log_norm if self._params['features']['log'] else None)
        if self._params['general']['dump_pkl']:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            pickle.dump(gnx_to_vec, open(mat_pkl, "wb"))
        return gnx_to_vec

    def _get_beta_vec(self, mx_dict, best_pairs):
        self._logger.debug("calculating beta vectors")
        if self._params['beta_vectors']['type'] == "regression":
            beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                 window_size=self._params['beta_vectors']['window_size'])
        elif self._params['beta_vectors']['type'] == "mean_regression":
            beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                     window_size=self._params['beta_vectors']['window_size'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got "
                               f"{self._params['beta_vectors']['type']} while valid options are: "
                               f"regression/mean_regression")
        if self._params['general']['dump_pkl']:
            beta_pkl_dir = os.path.join(self._params['general']['pkl_path'], "beta_matrix")
            beta_pkl_path = os.path.join(beta_pkl_dir, f"{self.data_name()}_beta.pkl")
            os.makedirs(beta_pkl_dir, exist_ok=True)
            pickle.dump(beta.beta_matrix(), open(beta_pkl_path, "wb"))
        self._logger.debug("finished calculating beta vectors")
        return beta

    def _get_graphs_score(self, beta_matrix):
        score_type = self._params['score']['type']
        if score_type == "knn":
            return KnnScore(beta_matrix, self._params['score']['params']['knn']['k'], self.data_name(),
                            window_size=self._params['score']['window_size'])
        elif score_type == "gmm":
            return GmmScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                            n_components=self._params['score']['params']['gmm']['n_components'])
        elif score_type == "local_outlier":
            return LocalOutlierFactorScore(beta_matrix, self.data_name(),
                                           window_size=self._params['score']['window_size'],
                                           n_neighbors=self._params['score']['params']['local_outlier']['n_neighbors'])
        raise RuntimeError(f"invalid value for params[score][type], got {score_type}"
                           f" while valid options are: knn/gmm/local_outlier")

    def _run_ad(self):
        mx_dict = self._calc_tg_feature_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params['feature_pair_picker']['num_pairs'],
                                              logger=self._logger,
                                              identical_bar=self._params['feature_pair_picker']['overlap_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta_matrix = self._get_beta_vec(mx_dict, best_pairs).beta_matrix()
        scores = self._get_graphs_score(beta_matrix).score_list()
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, scores, self.data_name(),
                                             num_anomalies=self._num_anomalies)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh("", truth=self._ground_truth, info_text=str(self._params))
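# A hedged usage sketch for TPGAD. Every key below is read somewhere in the
# class above; the file names and parameter values themselves are hypothetical.
if __name__ == "__main__":
    example_params = {
        "dataset": {"name": "example", "filename": "example_edges.csv", "directed": False,
                    "weight_col": None, "time_format": "%Y-%m-%d", "time_col": "time",
                    "src_col": "src", "dst_col": "dst", "day_split": 1},
        "features": {"max_connected": True, "log": True},
        "general": {"pkl_path": "./pkl", "dump_pkl": True, "FORCE_REBUILD_FEATURES": False},
        "gt": {"filename": "ground_truth.csv"},  # CSV with an 'anomaly' column (optional 'score')
        "beta_vectors": {"type": "regression", "window_size": 25},
        "feature_pair_picker": {"num_pairs": 5, "overlap_bar": 0.9},
        "score": {"type": "knn", "window_size": 25, "params": {"knn": {"k": 10}}},
    }
    TPGAD(example_params)  # runs the full pipeline from __init__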
class MleEstimator:
    def __init__(self, source_file, num_prefix=120, num_suffix=200, delta=(0.2, 0.5, 0.3)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emission_count, self._transition_count, self._suffix_count = self._get_data()
        self._pos_list = list(set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        # self._emission, self._transition, self._prefix, self._suffix = self._calc_probabilities()

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emission = {}
        suffix = {}
        word_counter = 0
        src_file = open(self._source, "rt")  # open file
        for line in src_file:
            t1 = START
            t2 = START
            w_pos = []
            for w_p in line.split():  # break line to [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for word, pos in w_pos:
                word_counter += 1
                # -------- EMISSION ----------
                emission[(word, pos)] = emission.get((word, pos), 0) + 1  # count (word, POS)++
                # --------- SUFFIX -----------
                if word_counter % 10 == 0:
                    suffix[(word[-SUFF:], pos)] = suffix.get((word[-SUFF:], pos), 0) + 1
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos, 0) + 1                      # count(POS)
                transition[1][(t1, pos)] = transition[1].get((t1, pos), 0) + 1          # count(POS_1, POS_2)
                transition[2][(t2, t1, pos)] = transition[2].get((t2, t1, pos), 0) + 1  # count(POS_0, POS_1, POS_2)
                t2 = t1
                t1 = pos
        self._logger.info("get-data - end")
        return emission, transition, suffix

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([word + " " + pos + "\t" + str(count) + "\n"
                          for (word, pos), count in self._emission_count.items()])
        out_e.writelines(["^" + sufi + " " + pos + "\t" + str(count) + "\n"
                          for (sufi, pos), count in self._suffix_count.items()])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([pos + "\t" + str(count) + "\n"
                          for pos, count in self._transition_count[0].items()])
        out_q.writelines([pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos1, pos0), count in self._transition_count[1].items()])
        out_q.writelines([pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos2, pos1, pos0), count in self._transition_count[2].items()])
        out_q.close()
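# A minimal sketch of dumping the raw MLE counts; the corpus path is
# hypothetical, and each input line is expected to hold word/POS tokens.
if __name__ == "__main__":
    estimator = MleEstimator("train_corpus.txt")
    estimator.mle_count_to_txt("e.mle", "q.mle")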
class AnomalyDetection:
    def __init__(self, params: AdParams):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0])
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        # self._temporal_graph.filter(
        #     lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #     func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        if self._params.vec_type == "motif_ratio":
            self._build_second_method()
        elif self._params.vec_type == "regression":
            self._build_first_method()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {self._temporal_graph.name_to_index(g_id): float(val) for g_id, val in gd.items()}
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix_log" + str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc feature matrix per graph
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm) \
                if self._params.log else FeaturesProcessor(gnx_ftr).as_matrix()
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm) \
                if self._params.log else FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec()
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build_first_method(self):
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs, logger=self._logger,
                                              identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                             window_size=self._params.window_correlation)
        beta_matrix = beta.beta_matrix()
        if self._params.score_type == "knn":
            score = KnnScore(beta_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(beta_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:  # self._params.score_type == "local_outlier"
            score = LocalOutlierFactorScore(beta_matrix, self._data_name,
                                            window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())

    def _build_second_method(self):
        self._graph_to_vec = self._calc_vec()
        self._graph_matrix = np.vstack([self._graph_to_vec[name]
                                        for name in self._temporal_graph.graph_names()])
        if self._params.log:
            self._graph_matrix = log_norm(self._graph_matrix)
        if self._params.score_type == "knn":
            score = KnnScore(self._graph_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(self._graph_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:  # self._params.score_type == "local_outlier"
            score = LocalOutlierFactorScore(self._graph_matrix, self._data_name,
                                            window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())
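# A hedged usage sketch for AnomalyDetection. AdParams' constructor is not
# shown in this file, so the helper below assumes a pre-populated params
# object; the two attributes set here are the ones that select the pipeline
# branch and the scorer.
def run_anomaly_detection(params: AdParams):
    params.vec_type = "regression"   # or "motif_ratio"
    params.score_type = "gmm"        # "knn" / "gmm" / "local_outlier"
    AnomalyDetection(params)         # runs the selected pipeline from __init__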
class AnomalyDetectionOperationResearch:
    def __init__(self, params: AdParams, name):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        self._out = open(os.path.join("..", name), "wt")
        self._out.write(",".join(["FN", "TN", "TP", "FP", "recall", "precision", "specificity", "F1",
                                  self._params.attr_string()]) + "\n")
        self._build()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {self._temporal_graph.name_to_index(g_id): float(val) for g_id, val in gd.items()}
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix_log" + str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features (sanitize the name for the file system)
            gnx_name_path = gnx_name.replace(':', '_').replace('/', '_')
            gnx_path = os.path.join(database_pkl_dir, gnx_name_path)
            if gnx_name_path not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm)
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm)
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build(self):
        for lg in [True, False]:
            self._params.log = lg
            for vec_type in ["mean_regression", "regression"]:  # also: motif_ratio
                self._params.vec_type = vec_type
                # note: the original had a trailing comma here, which wrapped the
                # features list in a one-element tuple; removed
                self.features = ANOMALY_DETECTION_FEATURES if self._params.vec_type == "regression" \
                    else MOTIF_FEATURES
                if self._params.vec_type in ("regression", "mean_regression"):
                    mx_dict = self._calc_matrix()
                    concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
                    for ftr_pairs in [3, 4, 5]:  # other grids tried: [1, 2, 3, 4, 5, 10], [5, 10, ..., 50]
                        self._params.ftr_pairs = ftr_pairs
                        for identical in [0.99]:  # other grid tried: [0.7, 0.8, 0.9, 0.95, 0.99]
                            self._params.identical_bar = identical
                            pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs,
                                                                  logger=self._logger,
                                                                  identical_bar=self._params.identical_bar)
                            for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                                self._params.window_correlation = win
                                best_pairs = pearson_picker.best_pairs()
                                if best_pairs is None:
                                    continue
                                if self._params.vec_type == "regression":
                                    beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                                         window_size=self._params.window_correlation)
                                else:
                                    beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                                             window_size=self._params.window_correlation)
                                beta_matrix = beta.beta_matrix()
                                self._pick_anomalies(beta_matrix)
                elif self._params.vec_type == "motif_ratio":
                    self._graph_to_vec = self._calc_vec()
                    beta_matrix = np.vstack([self._graph_to_vec[name]
                                             for name in self._temporal_graph.graph_names()])
                    self._pick_anomalies(beta_matrix)

    def _pick_anomalies(self, beta_matrix):
        for score_type in ["knn", "gmm", "local_outlier"]:
            self._params.score_type = score_type
            if self._params.score_type == "knn":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for k in range(5, min(win, 50) - 1, 5):
                        self._params.KNN_k = k
                        score = KnnScore(beta_matrix, self._params.KNN_k, self._data_name,
                                         window_size=self._params.window_score)
                        self._evaluate_and_log(score)
            elif self._params.score_type == "gmm":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for comp in [1, 2, 3, 4, 5]:
                        self._params.n_components = comp
                        score = GmmScore(beta_matrix, self._data_name,
                                         window_size=self._params.window_score,
                                         n_components=self._params.n_components)
                        self._evaluate_and_log(score)
            elif self._params.score_type == "local_outlier":
                for win in range(25, min(100, self._temporal_graph.number_of_graphs()), 25):
                    self._params.window_score = win
                    for neighbors in range(5, min(win, 50), 5):
                        self._params.n_neighbors = neighbors
                        score = LocalOutlierFactorScore(beta_matrix, self._data_name,
                                                        window_size=self._params.window_score,
                                                        n_neighbors=self._params.n_neighbors)
                        self._evaluate_and_log(score)

    def _evaluate_and_log(self, score):
        # shared tail of the three branches above: score -> anomaly picker -> CSV row
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        truth = [self._temporal_graph.name_to_index(g_id)
                 for g_id in self._params.database.GROUND_TRUTH] \
            if self._params.database.GROUND_TRUTH else None
        FN, TN, TP, FP, recall, precision, specificity, F1 = anomaly_picker.build(truth=truth)
        self._out.write(",".join([str(FN), str(TN), str(TP), str(FP), str(recall), str(precision),
                                  str(specificity), str(F1), self._params.attr_val_string()]) + "\n")
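# The grid search above writes one CSV row per hyperparameter configuration.
# A hedged sketch of kicking it off with a pre-built params object (AdParams
# construction is assumed; the output file name is hypothetical):
def run_grid_search(params: AdParams):
    AnomalyDetectionOperationResearch(params, "grid_search_results.csv")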
class DatasetStat:
    def __init__(self, params: AdParams):
        self._index_ftr = None
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._ground_truth = params.database.GROUND_TRUTH
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        # self._temporal_graph.filter(
        #     lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #     func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}
        self._graph_to_vec = self._calc_vec()

    def _build_temporal_graph(self):
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec()
        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_matrix(self):
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors", database_name + "_matrix.pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))
        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)
            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix()
        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    # map matrix rows to features + a counter when a feature has more than one value
    def _set_index_to_ftr(self):
        gnx_name = next(self._temporal_graph.graph_names())
        gnx = next(self._temporal_graph.graphs())
        database_name = self._data_name + "_" + str(self._params.max_connected) \
                        + "_" + str(self._params.directed)
        gnx_path = os.path.join(self._base_dir, "pkl", "features", database_name, gnx_name)
        gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                is_max_connected=self._params.max_connected)
        gnx_ftr.build(should_dump=False, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
        if not self._index_ftr:
            sorted_ftr = [f for f in sorted(gnx_ftr) if gnx_ftr[f].is_relevant()]  # fix feature order (names)
            self._index_ftr = []
            for ftr in sorted_ftr:
                len_ftr = len(gnx_ftr[ftr])
                # fill list with (ftr, counter)
                self._index_ftr += self._get_motif_type(ftr, len_ftr) if ftr in ('motif3', 'motif4') \
                    else [(ftr, i) for i in range(len_ftr)]
        return self._index_ftr

    # return [ ... (motif_type, counter) ... ]
    def _get_motif_type(self, motif_type, num_motifs):
        return [(motif_type, i) for i in range(num_motifs)]

    def plot_nodes_by_time(self):
        # collect data for plot
        nodes_count_by_time = self._temporal_graph.node_count()  # num of nodes per time
        edges_count_by_time = self._temporal_graph.edge_count()  # num of edges per time
        len_mg = self._temporal_graph.number_of_graphs()         # num of graphs (times)
        x_axis = list(range(len_mg))                             # [0 ... num of times]
        p = figure(plot_width=600, plot_height=250, title=self._data_name + ", node & edge count",
                   x_axis_label="time", y_axis_label="nodes_count")  # create figure
        p.line(x_axis, nodes_count_by_time, legend="nodes", line_color="blue")   # plot nodes
        p.line(x_axis, edges_count_by_time, legend="edges", line_color="green")  # plot edges
        # mark ground-truth anomalies
        anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
        y = [edges_count_by_time[time] for time in anomalies]
        p.scatter(anomalies, y, legend="anomalies", line_color="red", fill_color="red")
        p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                         enumerate(self._temporal_graph.graph_names())}  # time to graph_name
        p.legend.location = "top_left"
        show(p)

    def plot_timed_mean_std(self):
        NUM_PLOT_FTR = 20
        mat_dict = self._calc_matrix()
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        # sort by highest mean
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        sorted_mean = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])][0:NUM_PLOT_FTR]
        # ----------------------- mean -------------------------
        mean_curves = [[] for _ in range(NUM_PLOT_FTR)]
        std_curves = [[] for _ in range(NUM_PLOT_FTR)]
        for name, mx in mat_dict.items():
            mx_mean = np.mean(mx, 0).tolist()[0]
            mx_std = np.std(mx, 0).tolist()[0]
            for i, idx in enumerate(sorted_mean):
                mean_curves[i].append(mx_mean[idx])
                std_curves[i].append(mx_std[idx])
        x_axis = list(range(self._temporal_graph.number_of_graphs()))  # [0 ... num of times]
        # only feature 16 is plotted here; widen the range to plot all curves
        for i in [16]:
            p = figure(plot_width=600, plot_height=250,
                       title=self._data_name + " std/mean for " + ftrs[sorted_mean[i]],
                       x_axis_label="time", y_axis_label="nodes_count")  # create figure
            p.line(x_axis, mean_curves[i], legend="mean", line_color="blue")
            p.line(x_axis, std_curves[i], legend="std", line_color="green")
            # mark ground-truth anomalies
            anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
            y = [std_curves[i][time] for time in anomalies]
            p.scatter(anomalies, y, legend="anomalies", line_color="red", fill_color="red")
            p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                             enumerate(self._temporal_graph.graph_names())}
            p.legend.location = "top_left"
            show(p)

    def plot_mean_std_heatmap(self):
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        mat_dict = self._calc_matrix()
        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        # sort by highest std
        global_std = {i: m for i, m in enumerate(np.std(all_mx, 0).tolist()[0])}
        sorted_std = [i for i, m in sorted(global_std.items(), key=lambda x: -x[1])][0:30]
        # sort by highest mean
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        sorted_mean = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])][0:30]
        # global max per feature (used for normalization)
        global_max = {i: m for i, m in enumerate(np.max(all_mx, 0).tolist()[0])}
        anomalies = [self._name_to_idx[anomaly] for anomaly in self._ground_truth]
        # ----------------------- mean -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_mean = {i: m for i, m in enumerate(np.mean(mx, 0).tolist()[0])}
            heat_mx.append([heat_day_mean[i] / global_max[i] for i in sorted_mean])
        plt.subplots(figsize=(20, 15))
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.0005, vmax=0.005)
        plt.xticks(list(range(30)), ftrs[:30], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("mean_heatmap")
        plt.clf()
        # ----------------------- std -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_std = {i: m for i, m in enumerate(np.std(mx, 0).tolist()[0])}
            heat_mx.append([heat_day_std[i] / global_max[i] for i in sorted_std])
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.005, vmax=0.05)
        plt.xticks(list(range(30)), ftrs[:30], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("std_heatmap")

    def plot_features_mean_std(self):
        ftrs = [str(x) for x in self._set_index_to_ftr()]
        # split the matrices into anomalous and normal graphs
        all_list = []
        anomal_list = []
        for name, mx in self._calc_matrix().items():
            if name in self._ground_truth:
                anomal_list.append(mx)
            else:
                all_list.append(mx)
        all_mx = np.vstack(all_list)
        anomal_mx = np.vstack(anomal_list)
        global_mean = {i: m for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])}
        global_std = {i: m for i, m in enumerate(np.std(all_mx, 0).tolist()[0])}
        sorted_keys = [i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])]
        # group features with comparable std, at most 6 per group
        groups = []
        prev_val = global_std[sorted_keys[0]]
        sub_group = []
        size_ = 0
        for i in sorted_keys:
            if 100 * prev_val >= global_std[i] >= 0.1 * prev_val and size_ < 6:
                sub_group.append(i)
                size_ += 1
            else:
                prev_val = global_mean[i]
                groups.append(sub_group)
                sub_group = [i]
                size_ = 1
        if sub_group:
            groups.append(sub_group)  # don't drop the last group
        # only group 2 is plotted here; widen the range to plot all groups
        for group_num in [2]:
            curr_ftr = []
            for i in groups[group_num]:
                curr_ftr.append(ftrs[i])
                curr_ftr.append("A_" + ftrs[i])
            mid = []
            bottom = []
            top = []
            for i in groups[group_num]:
                bottom.append(np.percentile(all_mx[:, i], 25, axis=0).tolist()[0])
                bottom.append(np.percentile(anomal_mx[:, i], 25, axis=0).tolist()[0])
                mid.append(np.percentile(all_mx[:, i], 50, axis=0).tolist()[0])
                mid.append(np.percentile(anomal_mx[:, i], 50, axis=0).tolist()[0])
                top.append(np.percentile(all_mx[:, i], 75, axis=0).tolist()[0])
                top.append(np.percentile(anomal_mx[:, i], 75, axis=0).tolist()[0])
            bottom = np.array(bottom)
            mid = np.array(mid)
            top = np.array(top)
            # find the quartiles and IQR for each category
            iqr = top - bottom
            upper = top + 1.5 * iqr
            lower = bottom - 1.5 * iqr
            p = figure(tools="", background_fill_color="#efefef", x_range=curr_ftr, toolbar_location=None,
                       plot_width=600, plot_height=600, title=self._data_name + "_percentile=(25-50-75)")
            colors = ["black", "red"] * int(mid.shape[0] / 2)
            # stems
            p.segment(curr_ftr, upper, curr_ftr, top, line_color=colors)
            p.segment(curr_ftr, lower, curr_ftr, bottom, line_color=colors)
            # boxes
            p.vbar(curr_ftr, 0.7, mid, top, fill_color="#E08E79", line_color=colors)
            p.vbar(curr_ftr, 0.7, bottom, mid, fill_color="#3B8686", line_color=colors)
            # whiskers (almost-0 height rects simpler than segments)
            p.rect(curr_ftr, lower, 0.2, 0.0000001, line_color=colors)
            p.rect(curr_ftr, upper, 0.2, 0.0000001, line_color=colors)
            p.xaxis.major_label_orientation = np.pi / 2
            p.xgrid.grid_line_color = None
            p.ygrid.grid_line_color = "white"
            p.grid.grid_line_width = 2
            p.xaxis.major_label_text_font_size = "12pt"
            show(p)
            # plot = Plot(output_backend="svg")
            # plot.output_backend(p, filename=str(group_num) + "_svg")

    def plot_correlations(self):
        from sklearn import linear_model
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs, logger=self._logger,
                                              identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        for i, j, _ in best_pairs:
            reg = linear_model.LinearRegression().fit(np.transpose(concat_mx[:, i].T),
                                                      np.transpose(concat_mx[:, j].T))
            m = reg.coef_.item()
            b = reg.intercept_.item()
            ftr_i = concat_mx[:, i].T.tolist()[0]
            ftr_j = concat_mx[:, j].T.tolist()[0]
            p = figure(plot_width=600, plot_height=250, title=self._data_name + " regression " + str((i, j)),
                       x_axis_label="time", y_axis_label="nodes_count")  # create figure
            # regression line over the observed x-range (the original plotted a
            # 10-point line against a longer x vector, which bokeh rejects)
            xs = list(range(int(max(ftr_i)) + 1))
            p.line(xs, [m * x + b for x in xs], line_color="blue")
            p.scatter(ftr_i, ftr_j)
            p.xaxis.major_label_overrides = {idx: graph_name for idx, graph_name in
                                             enumerate(self._temporal_graph.graph_names())}
            p.legend.location = "top_left"
            show(p)
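# A hedged sketch of producing the dataset plots; assumes a pre-built AdParams
# whose GROUND_TRUTH entries are graph names present in the temporal graph.
def plot_dataset_stats(params: AdParams):
    stat = DatasetStat(params)
    stat.plot_nodes_by_time()
    stat.plot_timed_mean_std()
    stat.plot_mean_std_heatmap()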
class MleEstimator:
    def __init__(self, source_file, num_prefix=120, num_suffix=200, delta=(0.2, 0.5, 0.3), gamma=(0.6, 0.4)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._gamma = gamma
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emission_count, self._transition_count, self._prefix_count, self._suffix_count = self._get_data()
        self._pos_list = list(set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        self._emission, self._transition, self._prefix, self._suffix = self._calc_probabilities()

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emission = {}
        prefix = {}
        suffix = {}
        src_file = open(self._source, "rt")  # open file
        for line in src_file:
            # reset the POS history at every sentence boundary
            # (matches the behavior of the count-only estimator above)
            t1 = START
            t2 = START
            # ---------- BREAK -----------
            w_pos = []
            for w_p in line.split():  # break line to [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for word, pos in w_pos:
                # -------- EMISSION ----------
                emission[(word, pos)] = emission.get((word, pos), 0) + 1  # count (word, POS)++
                # --------- PREFIX -----------
                prefix[(word[:PREF], pos)] = prefix.get((word[:PREF], pos), 0) + 1  # count prefixes
                # --------- SUFFIX -----------
                # note: the original read the suffix count out of prefix.get(), a bug
                suffix[(word[-SUFF:], pos)] = suffix.get((word[-SUFF:], pos), 0) + 1  # count suffixes
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos, 0) + 1                      # count(POS)
                transition[1][(t1, pos)] = transition[1].get((t1, pos), 0) + 1          # count(POS_1, POS_2)
                transition[2][(t2, t1, pos)] = transition[2].get((t2, t1, pos), 0) + 1  # count(POS_0, POS_1, POS_2)
                t2 = t1
                t1 = pos
        # keep only the K most common prefixes/suffixes
        prefix = {key: count for i, (key, count) in
                  enumerate(sorted(prefix.items(), key=lambda x: -x[1])) if i < self._num_prefix}
        suffix = {key: count for i, (key, count) in
                  enumerate(sorted(suffix.items(), key=lambda x: -x[1])) if i < self._num_suffix}
        self._logger.info("get-data - end")
        return emission, transition, prefix, suffix

    @staticmethod
    def _my_log(x):
        if x == 0:
            return -100
        if x == 1:
            return -0.001
        return np.log(x)

    def _calc_probabilities(self):
        self._logger.info("calc-probabilities - start")
        transition_prob = {}
        # -------- EMISSION ----------
        # e(word| pos)
        emission_prob = {(word, pos): ((1 - CUT) * w_p_count / self._transition_count[0][pos]) + CUT
                         for (word, pos), w_p_count in self._emission_count.items()}
        # --------- PREFIX -----------
        # given word [w_1, w_2, ..., w_n-1, w_n]
        # e(w_1, w_2| pos)
        prefix_bi_prob = {(pre, pos): ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
                          for (pre, pos), s_p_count in self._prefix_count.items()}
        # e(w_n-1, w_n| pos)
        suffix_bi_prob = {(sufi, pos): ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
                          for (sufi, pos), s_p_count in self._suffix_count.items()}
        # ------- TRANSITION ---------
        sum_words = np.sum(list(self._transition_count[0].values()))
        # sequence = [pos2, pos1, pos0]
        # q(pos0)
        transition_prob[0] = {pos: ((1 - CUT) * pos_count / sum_words) + CUT
                              for pos, pos_count in self._transition_count[0].items()}
        # q(pos0| pos1)
        transition_prob[1] = {(pos1, pos0): ((1 - CUT) * count / self._transition_count[0][pos1]) + CUT
                              for (pos1, pos0), count in self._transition_count[1].items()
                              if pos1 in self._transition_count[0]}
        # q(pos0| pos2, pos1)
        transition_prob[2] = {(pos2, pos1, pos0):
                              ((1 - CUT) * count / self._transition_count[1][(pos2, pos1)]) + CUT
                              for (pos2, pos1, pos0), count in self._transition_count[2].items()
                              if (pos2, pos1) in self._transition_count[1]}
        self._logger.info("calc-probabilities - end")
        return emission_prob, transition_prob, prefix_bi_prob, suffix_bi_prob

    def emission(self, word_pos: tuple, log=False):
        word, pos = word_pos
        # if there is a value e(word| pos), use it
        if (word, pos) in self._emission:
            return self._my_log(self._emission[word_pos]) if log else self._emission[word_pos]
        # otherwise fall back to the prefix estimate e(w_1, w_2| pos)
        pref = word[:PREF]
        if (pref, pos) in self._prefix:
            return self._my_log(self._prefix[(pref, pos)]) if log else self._prefix[(pref, pos)]
        # otherwise fall back to the suffix estimate e(w_n-1, w_n| pos)
        suf = word[-SUFF:]
        if (suf, pos) in self._suffix:
            return self._my_log(self._suffix[(suf, pos)]) if log else self._suffix[(suf, pos)]
        return self._my_log(0) if log else 0

    def transition(self, pos_sequence: tuple, log=False):
        # break sequence
        pos0 = pos_sequence[-1]
        pos1 = pos_sequence[-2]
        pos2 = pos_sequence[-3] if len(pos_sequence) > 2 else None
        # calculate: d1*q(pos0) + d2*q(pos0| pos1) + d3*q(pos0| pos2, pos1)
        tran_0 = self._delta[0] * self._transition[0].get(pos0, 0)
        tran_1 = self._delta[1] * self._transition[1].get((pos1, pos0), 0)
        tran_2 = self._delta[2] * self._transition[2].get((pos2, pos1, pos0), 0) if pos2 else 0
        total = tran_0 + tran_1 + tran_2
        return self._my_log(total) if log else total

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([word + " " + pos + "\t" + str(count) + "\n"
                          for (word, pos), count in self._emission_count.items()])
        out_e.writelines(["^" + pref + " " + pos + "\t" + str(count) + "\n"
                          for (pref, pos), count in self._prefix_count.items()])
        out_e.writelines(["^" + sufi + " " + pos + "\t" + str(count) + "\n"
                          for (sufi, pos), count in self._suffix_count.items()])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([pos + "\t" + str(count) + "\n"
                          for pos, count in self._transition_count[0].items()])
        out_q.writelines([pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos1, pos0), count in self._transition_count[1].items()])
        out_q.writelines([pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
                          for (pos2, pos1, pos0), count in self._transition_count[2].items()])
        out_q.close()

    def pred_viterbi(self, sequence, log=False):
        self._logger.info("Viterbi - START...")
        self._logger.info("Viterbi - INITIALIZATION...")
        # ------------ INITIALIZATION --------------
        len_seq = len(sequence) + 1
        base_score = self._my_log(0) if log else 0
        v_mx = [[[(base_score, (-1, self._pos_idx[START], self._pos_idx[START]))
                  for _ in range(self._num_pos)]
                 for _ in range(self._num_pos)]
                for _ in range(len_seq)]
        bp = (-1, self._pos_idx[START], self._pos_idx[START])
        base_score = self._my_log(1) if log else 1
        v_mx[0][self._pos_idx[START]][self._pos_idx[START]] = (base_score, bp)
        self._logger.info("Viterbi - FORWARD...")
        # ------- RECURSIVE STEP / FORWARD ---------
        print("Viterbi - forward: " + str(sequence) + "\nProgress: ", end="")
        for i in range(1, len_seq):
            print("." * (len(sequence[i - 1]) + 3) + "|", end="")
            for j, pos2 in enumerate(self._pos_list):
                for k, pos1 in enumerate(self._pos_list):
                    score, bp = self._max_and_bp(v_mx, i, sequence[i - 1], j, pos2, pos1, log=log)
                    bp = (i - 1, bp, j)
                    v_mx[i][j][k] = (score, bp)
        print(" -- forward completed --")
        self._logger.info("Viterbi - BACKWARDS...")
        # ------- REPRODUCTION / BACKWARDS ---------
        # find max and argmax at v_mx[last_layer]
        max_val = self._my_log(0) if log else 0
        max_i = 0
        max_j = 0
        for i in range(self._num_pos):
            for j in range(self._num_pos):
                if v_mx[len_seq - 1][i][j][0] > max_val:
                    max_val = v_mx[len_seq - 1][i][j][0]
                    max_i = i
                    max_j = j
        # reconstruct the part-of-speech sequence by following back-pointers
        prediction = [self._pos_list[max_i], self._pos_list[max_j]]
        ps = v_mx[len_seq - 1][max_i][max_j][1]
        for word_idx in range(len_seq - 1, 0, -1):
            curr_pos = self._pos_list[ps[1]]
            if curr_pos == START:
                break
            prediction = [curr_pos] + prediction
            ps = v_mx[ps[0]][ps[1]][ps[2]][1]
        return prediction

    def _max_and_bp(self, v_mx, word_idx, word, pos2_idx, pos2, pos1, log=False):
        # given a word w_n and (pos2, pos1):
        # maximize over pos_i the score of w_n being pos1, coming after a pos2 word
        # scores = V(w_n-1, pos_i, pos2) * q(pos1| pos_i, pos2) * e(w_n| pos1), i = 0..num_pos
        if log:
            scores = [v_mx[word_idx - 1][i][pos2_idx][0]
                      + (self._gamma[1] * self.transition((self._pos_list[i], pos2, pos1), log=log)
                         + self._gamma[0] * self.emission((word, pos1), log=log))
                      for i in range(self._num_pos)]
        else:
            scores = [v_mx[word_idx - 1][i][pos2_idx][0]
                      * self.transition((self._pos_list[i], pos2, pos1), log=log)
                      * self.emission((word, pos1), log=log)
                      for i in range(self._num_pos)]
        return np.max(scores), np.argmax(scores)
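# A minimal sketch of training the estimator and tagging one sentence with the
# Viterbi decoder above; the corpus path is hypothetical and the input format
# is the word/POS-per-token format parsed by _get_data.
if __name__ == "__main__":
    mle = MleEstimator("train_corpus.txt")
    mle.mle_count_to_txt("e.mle", "q.mle")
    print(mle.pred_viterbi(["The", "dog", "barks"], log=True))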