def process(CONFIG, _DIR, train_x_pos, train_x_neg, test_pos, test_anomaly):
    """Train (or load) the model, score the test set, and report PR-AUC.

    Parameters
    ----------
    CONFIG : dict
        Per-dataset configuration; ``CONFIG[_DIR]`` supplies
        ``use_pretrained`` / ``saved_model_file`` and receives
        ``num_neg_samples`` (this function mutates it).
    _DIR : str
        Key selecting the active dataset section of CONFIG.
    train_x_pos : np.ndarray
        Positive (normal) training records.
    train_x_neg : np.ndarray
        Negative samples; axis 1 is the per-record negative-sample count.
    test_pos, test_anomaly : tuple
        ``(ids, data)`` pairs for normal and anomalous test records.

    Side effects: logs the AUC via the module logger and shows a
    precision-recall plot (the figure is not saved).
    """
    global logger

    num_neg_samples = train_x_neg.shape[1]
    CONFIG[_DIR]['num_neg_samples'] = num_neg_samples
    model_obj = set_up_model(CONFIG, _DIR)

    _use_pretrained = CONFIG[_DIR]['use_pretrained']
    if _use_pretrained is True:
        pretrained_file = CONFIG[_DIR]['saved_model_file']
        print('Pretrained File :', pretrained_file)
        saved_file_path = os.path.join(SAVE_DIR, 'checkpoints', pretrained_file)
        # FIX: saved_file_path was assigned unconditionally, so the original
        # `if saved_file_path is not None` guard was always true and its
        # `else: train_model(...)` branch was unreachable dead code.
        model_obj.set_pretrained_model_file(saved_file_path)
    elif _use_pretrained is False:
        model_obj.train_model(train_x_pos, train_x_neg)

    # Join normal + anomaly ids and data, keeping the two stacks aligned
    # so that test_ids[i] labels test_data_x[i].
    test_normal_ids = test_pos[0]
    test_anomaly_ids = test_anomaly[0]
    test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
    print(' Len of test_ids ', len(test_ids))
    test_data_x = np.vstack([test_pos[1], test_anomaly[1]])
    print('Length of test data', test_data_x.shape)

    res = list(model_obj.get_event_score(test_data_x))
    print('Length of results ', len(res))

    # Score bounds from the training data: [min noise score, max positive
    # score], forwarded to the precision-recall routine.
    training_pos_scores = [_[0] for _ in model_obj.get_event_score(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.get_event_score(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    # Sort ascending, since a lower likelihood score means more anomalous.
    # (dict(zip(...)) avoids shadowing the builtin `id` as in the original.)
    _id_score_dict = dict(zip(test_ids, res))
    tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
    sorted_id_score_dict = OrderedDict()
    for e in tmp:
        sorted_id_score_dict[e[0]] = e[1][0]

    recall, precison = eval.precision_recall_curve(
        sorted_id_score_dict,
        anomaly_id_list=test_anomaly_ids,
        bounds=bounds
    )
    _auc = auc(recall, precison)
    logger.info('AUC')
    logger.info(str(_auc))
    print('--------------------------')

    plt.figure(figsize=[14, 8])
    plt.plot(recall, precison, color='blue', linewidth=1.75)
    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Recall | AUC ' + str(_auc), fontsize=15)
    plt.show()
    plt.close()
    return
def process_all(CONFIG, _DIR, train_x_pos, train_x_neg, testing_dict):
    """Train (or load) the model once, then evaluate every test split.

    Parameters
    ----------
    CONFIG : dict
        Per-dataset configuration; ``CONFIG[_DIR]`` is read for
        ``use_pretrained`` / ``saved_model_file`` and mutated with
        ``num_neg_samples``.
    _DIR : str
        Key selecting the active dataset section of CONFIG.
    train_x_pos, train_x_neg : np.ndarray
        Positive training records and negative samples (axis 1 of the
        negatives is the per-record sample count).
    testing_dict : dict
        Maps a test-case value ``c`` to ``(test_pos, test_anomaly)``,
        each a ``(ids, data)`` pair.

    Logs the per-split precision/recall vectors and AUC.
    """
    global logger
    logger.info('setting up number of negative samples ')
    logger.info(CONFIG[_DIR]['num_neg_samples'])
    num_neg_samples = train_x_neg.shape[1]
    CONFIG[_DIR]['num_neg_samples'] = num_neg_samples
    model_obj = set_up_model(CONFIG, _DIR)

    _use_pretrained = CONFIG[_DIR]['use_pretrained']
    if _use_pretrained is True:
        pretrained_file = CONFIG[_DIR]['saved_model_file']
        print('Pretrained File :', pretrained_file)
        saved_file_path = os.path.join(SAVE_DIR, 'checkpoints', pretrained_file)
        # FIX: saved_file_path was assigned unconditionally, so the original
        # `if saved_file_path is not None` / `else: train_model(...)` had an
        # unreachable else branch.
        model_obj.set_pretrained_model_file(saved_file_path)
    elif _use_pretrained is False:
        model_obj.train_model(train_x_pos, train_x_neg)

    # PERF FIX: the training-score bounds are loop-invariant; the original
    # recomputed both full-training-set scoring passes for every test case.
    training_pos_scores = [_[0] for _ in model_obj.get_event_score(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.get_event_score(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    # 3 test cases by value of c
    for _c, test_data_item in testing_dict.items():
        print('----->', _c)
        logger.info(' >> c = ' + str(_c))
        test_pos = test_data_item[0]
        test_anomaly = test_data_item[1]

        # Join normal + anomaly ids/data, keeping order aligned.
        test_normal_ids = test_pos[0]
        test_anomaly_ids = test_anomaly[0]
        test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
        print(' Len of test_ids ', len(test_ids))
        test_data_x = np.vstack([test_pos[1], test_anomaly[1]])
        print('Length of test data', test_data_x.shape)

        res = list(model_obj.get_event_score(test_data_x))
        print('Length of results ', len(res))

        # Sort ascending, since a lower likelihood means more anomalous.
        _id_score_dict = dict(zip(test_ids, res))
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1][0]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=test_anomaly_ids,
            bounds=bounds
        )

        # Save the precision/recall values (logged; later plotted in a notebook).
        recall_str = ','.join([str(_) for _ in recall])
        precision_str = ','.join([str(_) for _ in precison])
        logger.info(precision_str)
        logger.info(recall_str)

        _auc = auc(recall, precison)
        logger.info('AUC')
        logger.info(str(_auc))
        print('--------------------------')
def main():
    """Run the CompreX baseline: fit on positive records, score each test split.

    Reads module globals set up by ``setup()`` (DATA_DIR, _DIR, ...), prints
    the per-split wall-clock time and precision-recall AUC.
    """
    global DATA_DIR
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config
    setup()

    train_x_pos, test_dict_cIdx_data = get_data(DATA_DIR, _DIR)

    # ---- Core: train the model ----
    # One categorical column per feature of the positive training records.
    _df_input = [list(train_x_pos[_j]) for _j in range(train_x_pos.shape[0])]
    cols = ['f' + str(j) for j in range(train_x_pos.shape[1])]
    X = pd.DataFrame(
        _df_input,
        columns=cols,
        index=list(range(train_x_pos.shape[0])),
        dtype='category'
    )
    estimator = CompreX(logging_level=logging.ERROR)
    # NOTE(review): transform() is invoked before fit() — kept as in the
    # original; confirm against the CompreX API before reordering.
    estimator.transform(X)
    estimator.fit(X)

    # ---- Test each c-index split ----
    # FIX: removed unused locals (start, test_result_r, test_result_p).
    for c, _t_data in test_dict_cIdx_data.items():
        test_ids = _t_data[0]
        test_data_x = _t_data[1]
        test_anomaly_idList = _t_data[2]

        start_time = time.time()
        res = estimator.predict(test_data_x)
        # 'res' is ordered like the input; align it with the ordered ids.
        anomaly_scores = list(res)
        anomaly_score_dict = dict(zip(test_ids, anomaly_scores))

        # Sort in reverse order, since a higher score means anomaly.
        tmp = sorted(
            anomaly_score_dict.items(),
            key=operator.itemgetter(1),
            reverse=True
        )
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=test_anomaly_idList
        )
        time_taken = time.time() - start_time
        _auc = auc(recall, precison)
        print('Time taken [seconds]', time_taken, 'AUC', _auc)
        print('--------------------------')
def main():
    """Train/evaluate the APE model: build checkpointed model, test for c=1..3.

    Reads module globals set up by ``setup()``; logs per-c precision/recall
    vectors and AUC, and the total elapsed time.
    """
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global DATA_DIR
    global config
    setup()

    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # BUG FIX: the original read `if os.path.exists(...): os.mkdir(...)`,
    # which only attempted the mkdir when the directory already existed
    # (raising FileExistsError) and never created it when missing.
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))
    checkpoint_dir = os.path.join(SAVE_DIR)
    print(' > ', os.getcwd())

    train_x_pos, train_x_neg, APE_term_2, APE_term_4, test_pos, test_anomaly, domain_dims = \
        data_fetcher.get_data_v1(DATA_DIR, _DIR)
    neg_samples = train_x_neg.shape[1]
    start_time = time.time()

    num_domains = len(domain_dims)
    inp_dims = list(domain_dims.values())
    print('Number of domains ', num_domains)
    print(' domains ', inp_dims)

    model_obj = tf_model_ape_1.model_ape_1(MODEL_NAME)
    model_obj.set_model_params(
        num_entities=num_domains,
        inp_dims=inp_dims,
        neg_samples=neg_samples,
        batch_size=config[_DIR]['batch_size'],
        num_epochs=config[_DIR]['num_epochs'],
        lr=config[_DIR]['learning_rate'],
        chkpt_dir=checkpoint_dir
    )
    _emb_size = int(config[_DIR]['embed_size'])
    model_obj.set_hyper_parameters(emb_dims=[_emb_size], use_bias=[True, False])

    _use_pretrained = config[_DIR]['use_pretrained']
    if _use_pretrained is False:
        model_obj.build_model()
        model_obj.train_model(train_x_pos, train_x_neg, APE_term_2, APE_term_4)

    # Score bounds from training data.
    # NOTE(review): computed but never passed to precision_recall_curve
    # below — kept as in the original; confirm whether bounds= was intended.
    training_pos_scores = [_[0] for _ in model_obj.inference(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.inference(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    # Test for c = 1, 2, 3
    for c in range(1, 3 + 1):
        _, _, _, _, test_pos, test_anomaly, _ = data_fetcher.get_data_v1(
            DATA_DIR, _DIR, c=c)

        # Join normal + anomaly data and ids, maintaining order so that
        # test_ids[i] labels test_data_x[i].
        test_normal_ids = test_pos[0]
        test_anomaly_ids = test_anomaly[0]
        test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
        print(' Len of test_ids ', len(test_ids))
        test_data_x = np.vstack([test_pos[1], test_anomaly[1]])
        print('Length of test data', test_data_x.shape)

        res = list(model_obj.inference(test_data_x))
        print('Length of results ', len(res))

        # Sort ascending, since a lower likelihood means anomalous.
        _id_score_dict = dict(zip(test_ids, res))
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1][0]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=test_anomaly_ids
        )
        recall_str = ','.join([str(_) for _ in recall])
        precision_str = ','.join([str(_) for _ in precison])
        logger.info(precision_str)
        logger.info(recall_str)

        _auc = auc(recall, precison)
        logger.info('c=' + str(c))
        logger.info('AUC')
        logger.info(str(_auc))
        # (Removed a stale commented-out plotting block that referenced an
        # undefined variable `i`.)
        print('----------------------------')

    end_time = time.time()
    elapsed_time = end_time - start_time
    # CONSISTENCY FIX: use the module logger (as everywhere else in this
    # function) instead of the root `logging` module.
    logger.info('time taken')
    logger.info(str(elapsed_time))
def process(_dir=None):
    """Run the AD-tree baseline for one dataset: train, score, plot PR curves.

    Parameters
    ----------
    _dir : str, optional
        Dataset selector forwarded to ``setup()``; the resolved value is
        read back from the module global ``_DIR``.

    Side effects: logs timings and AUC per test split and saves one
    precision-recall PNG per split into OP_DIR.
    """
    global DATA_DIR
    global _DIR
    global config
    global OP_DIR
    global logger
    setup(_dir)
    logger.info('--------')
    logger.info(_DIR)
    logger.info('--------')

    train_x, test_dict_cIdx_data = get_data(DATA_DIR, _DIR)
    # DEAD-CODE FIX: the original declared `k_val = None` and branched on
    # it, but k_val was never set — K always came from config.
    K = int(config['K'])

    # ---- Train: build the AD-tree and attribute-set pairs ----
    t1 = time.time()
    N = train_x.shape[0]  # number of training instances
    obj_ADTree = ad_tree_v1.ADT()
    obj_ADTree.setup(train_x)
    attribute_list = list(range(train_x.shape[1]))
    attribute_set_pairs = get_attribute_sets(
        train_x,
        attribute_list,
        obj_ADTree,
        k=K
    )
    t2 = time.time()
    train_time = t2 - t1
    logger.info('train_time taken ' + str(train_time))
    print(train_time)
    print(' Number of attribute set pairs ', len(attribute_set_pairs))

    # ---- Test each c split ----
    for c, _t_data in test_dict_cIdx_data.items():
        test_ids = _t_data[0]
        test_data_x = _t_data[1]
        test_anomaly_idList = _t_data[2]

        start = time.time()
        results = OrderedDict()
        for _id, record in zip(test_ids, test_data_x):
            _, r_score = get_r_value(
                _id, record, obj_ADTree, attribute_set_pairs, N)
            results[_id] = r_score
        _time = time.time() - start

        # Sort ascending by r-score before computing the PR curve.
        tmp = sorted(results.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict=sorted_id_score_dict,
            anomaly_id_list=test_anomaly_idList
        )
        _auc = auc(recall, precison)
        print('AUC ', _auc)
        logger.info('c = ' + str(c))
        logger.info('Time taken ' + str(_time))
        logger.info('AUC : ' + str(_auc))

        plt.figure(figsize=[14, 8])
        plt.plot(recall, precison, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Precision Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1' + '_test_' + str(c) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()
# LOF baseline: fit a LocalOutlierFactor model on the training matrix and
# score the (TSVD-transformed) test data.
# NOTE(review): _train_x, tsvd, test_x, all_ids and anom_ids are defined
# outside this chunk — presumably _train_x is the TSVD projection of the
# training data; confirm against the surrounding code.
clf_1 = LocalOutlierFactor(n_neighbors=20, novelty=True)
clf_1.fit(_train_x)
# Project the test data with the same TSVD object before scoring.
_test_x = tsvd.transform(test_x)
result = clf_1.score_samples(_test_x)
res = list(result)
# Map each id to its score; assumes all_ids is ordered like the rows of
# test_x — TODO confirm.
_id_score_dict = {id: _res for id, _res in zip(all_ids, res)}
# Sort ascending by score before building the ordered id -> score mapping.
tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
sorted_id_score_dict = OrderedDict()
for e in tmp:
    sorted_id_score_dict[e[0]] = e[1]
bounds = []
# training_pos_scores = clf.score_samples(
#     _train_x
# )
# training_pos_scores = [_[0] for _ in training_pos_scores]
from pprint import pprint
pprint(result)
# Hard-coded score bounds [-1, 0] passed to the PR-curve routine;
# presumably chosen in place of the commented-out training-score bounds
# above — TODO confirm.
bounds.append(-1)
bounds.append(0)
recall, precison = eval.precision_recall_curve(sorted_id_score_dict,
                                               anomaly_id_list=anom_ids,
                                               bounds=bounds)
_auc = auc(recall, precison)
print('_auc', _auc)