def mining_invariants(self, para):
    """Run invariant-mining anomaly detection on the configured dataset.

    Dispatches on ``self.data_type``:
      - ``'time_based'``: DeepIA sliding-window pipeline; also computes
        per-window ``anomalies``.
      - ``'time_based_bgl'``: BGL sliding-window pipeline with labels.
      - ``'event_based'``: HDFS per-session pipeline.

    Args:
        para: dict of pipeline parameters; this method sets ``para['path']``
            to ``self.input_dir`` and the appropriate log-file key.

    Returns:
        Tuple ``(raw_data, event_mapping_data, event_count_matrix, r,
        invar_dict, predictions, anomalies)``. Entries not produced by the
        selected branch are ``None`` (the original code raised ``NameError``
        for such branches).

    Raises:
        ValueError: if ``self.data_type`` is not one of the supported modes.
    """
    para['path'] = self.input_dir
    print(para)
    # Default every branch-specific output so the return statement is always
    # well-defined regardless of which branch runs.
    raw_data = event_mapping_data = event_count_matrix = None
    r = invar_dict = predictions = anomalies = None

    if self.data_type == 'time_based':
        para['log_file_name'] = self.log_seq
        raw_data, event_mapping_data = data_loader.deepia_data_loader(para)
        event_count_matrix = data_loader.deepia_preprocess_data(
            para, raw_data, event_mapping_data)
        r = mi.estimate_invar_spce(para, event_count_matrix)
        invar_dict = mi.invariant_search(para, event_count_matrix, r)
        # Derived artifact paths share the log file's basename.
        log_base = self.input_dir + self.log_seq.split('.log')[0]
        log_template_path = log_base + '.log_templates.csv'
        structured_log_path = log_base + '.log_structured.csv'
        window_split_file_path = (
            para['save_path'] + 'sliding_' + str(para['window_size'])
            + 'h_' + str(para['step_size']) + 'h.csv')
        predictions, anomalies = mi.deepia_evaluate(
            event_count_matrix, invar_dict, log_template_path,
            structured_log_path, window_split_file_path)
    elif self.data_type == 'time_based_bgl':
        para['log_file_name'] = self.log_seq
        raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
        event_count_matrix, labels = data_loader.bgl_preprocess_data(
            para, raw_data, event_mapping_data)
        r = mi.estimate_invar_spce(para, event_count_matrix)
        invar_dict = mi.invariant_search(para, event_count_matrix, r)
        # NOTE(review): this branch produces no per-window `anomalies`;
        # the returned value stays None.
        predictions = mi.evaluate(event_count_matrix, invar_dict, labels)
    elif self.data_type == 'event_based':
        para['log_seq_file_name'] = self.log_seq
        raw_data, label_data = data_loader.hdfs_data_loader(para)
        r = mi.estimate_invar_spce(para, raw_data)
        invar_dict = mi.invariant_search(para, raw_data, r)
        predictions = mi.evaluate(raw_data, invar_dict, label_data)
    else:
        # Fail fast with a clear message instead of the NameError the
        # original fall-through produced.
        raise ValueError('unsupported data_type: %r' % (self.data_type,))

    return (raw_data, event_mapping_data, event_count_matrix, r,
            invar_dict, predictions, anomalies)
'../time_windows/', # dir for saving sliding window data files to avoid splitting 'select_column': [ 0, 4 ], # select the corresponding columns (label and time) in the raw log file 'window_size': 3, # time window (unit: hour) 'step_size': 1, # step size (unit: hour) 'training_percent': 0.8, # training data percentage 'tf-idf': True, # whether to use tf-idf 'models': 'DT', # select from ['DT', 'LR', 'SVM'] 'cross_validate': False # set to True to avoid over_fitting (10-CV), but if we want to predict anomalies, it should set to False, Default: False } if __name__ == '__main__': model = para['models'] assert model in ['DT', 'LR', 'SVM'] raw_data, event_mapping_data = data_loader.bgl_data_loader(para) event_count_matrix, labels = data_loader.bgl_preprocess_data( para, raw_data, event_mapping_data) train_data, train_labels, testing_data, testing_labels = cl.data_split( para, event_count_matrix, labels) # Select one models out of three provided models if model == 'DT': cl.decision_tree(para, train_data, train_labels, testing_data, testing_labels) elif model == 'LR': cl.logsitic_regression(para, train_data, train_labels, testing_data, testing_labels) elif model == 'SVM': cl.SVM(para, train_data, train_labels, testing_data, testing_labels)
# Configuration for the supervised BGL anomaly-detection pipeline.
para = {
    'path': '../../Data/BGL_data/',          # directory for input data
    'log_file_name': 'BGL_MERGED.log',       # filename for log data file
    # Log-to-event mapping: one event index per row, each row represents a log line.
    'log_event_mapping': 'logTemplateMap.csv',
    # Dir for saving sliding-window data files, to avoid re-splitting on every run.
    'save_path': '../time_windows/',
    'select_column': [0, 4],                 # columns (label and time) in the raw log file
    'window_size': 3,                        # time window (unit: hour)
    'step_size': 1,                          # step size (unit: hour)
    'training_percent': 0.8,                 # training data percentage
    'tf-idf': True,                          # whether to use tf-idf weighting
    'models': 'DT',                          # select from ['DT', 'LR', 'SVM']
    # Set True for 10-fold cross-validation (avoids over-fitting), but it must
    # be False when we want to predict anomalies. Default: False.
    'cross_validate': False,
}

if __name__ == '__main__':
    model = para['models']
    # Validate with an explicit raise: `assert` is stripped under `python -O`,
    # which would let a bad model name fall through all branches silently.
    if model not in ('DT', 'LR', 'SVM'):
        raise ValueError(
            "para['models'] must be one of 'DT', 'LR', 'SVM'; got %r" % (model,))

    # Load raw logs, build the event-count matrix, and split train/test.
    raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
    event_count_matrix, labels = data_loader.bgl_preprocess_data(
        para, raw_data, event_mapping_data)
    train_data, train_labels, testing_data, testing_labels = cl.data_split(
        para, event_count_matrix, labels)

    # Select one model out of the three provided models.
    # NOTE(review): 'logsitic_regression' is the project's own (misspelled)
    # API name — do not "correct" it here without changing the library.
    if model == 'DT':
        cl.decision_tree(para, train_data, train_labels,
                         testing_data, testing_labels)
    elif model == 'LR':
        cl.logsitic_regression(para, train_data, train_labels,
                               testing_data, testing_labels)
    elif model == 'SVM':
        cl.SVM(para, train_data, train_labels, testing_data, testing_labels)