def save_load_snapshot(self, max_clusters):
    """Check that a second miner built on the same persistence mirrors the first.

    A first ``TemplateMiner`` is fed five messages through a shared
    ``MemoryBufferPersistence``. A second miner is then constructed on the
    same persistence and configuration, and its cluster ids, root-level tree
    keys and printed tree are compared with the first miner's. Finally two
    more messages are added through the second miner.

    ``max_clusters`` is assigned to ``config.drain_max_clusters``.
    """
    shared_store = MemoryBufferPersistence()
    miner_config = TemplateMinerConfig()
    miner_config.drain_max_clusters = max_clusters

    first = TemplateMiner(shared_store, miner_config)
    for message in ("hello", "hello ABC", "hello BCD", "hello XYZ",
                    "goodbye XYZ"):
        print(first.add_log_message(message))

    # Built after the first miner has processed its messages.
    second = TemplateMiner(shared_store, miner_config)

    def dump_tree(miner):
        # Capture print_tree() output as a list of lines.
        buffer = io.StringIO()
        miner.drain.print_tree(buffer)
        buffer.seek(0)
        return buffer.readlines()

    first_ids = list(first.drain.id_to_cluster.keys())
    second_ids = list(second.drain.id_to_cluster.keys())
    self.assertListEqual(first_ids, second_ids)

    first_roots = list(first.drain.root_node.key_to_child_node.keys())
    second_roots = list(second.drain.root_node.key_to_child_node.keys())
    self.assertListEqual(first_roots, second_roots)

    self.assertListEqual(dump_tree(first), dump_tree(second))

    for message in ("hello yyy", "goodbye ABC"):
        print(second.add_log_message(message))
def __init__(self, logs: pd.DataFrame):
    """Store the raw logs and create a fresh template miner.

    Parameters
    ----------
    logs : pd.DataFrame
        Raw log records; kept as-is on ``self.logs``.
    """
    self.logs = logs
    self.template_miner = TemplateMiner()
    # Fix: the original assigned the ``pd.DataFrame`` class itself, so any
    # use of ``cleaned_logs`` before it was overwritten operated on a type
    # rather than on an (empty) frame.
    self.cleaned_logs = pd.DataFrame()
    self.clusters = {}
    self.results = {}
    self.n_clusters = 0
def __init__(self, prefix_file, model_name, num_candidates, window_size,
             device, lr, lr_step, lr_decay_ratio, max_iter):
    """Set up the file-persisted template miner and store hyper-parameters.

    ``prefix_file`` is the prefix of both the drain3 persistence file
    (``<prefix>_templates_persist.bin``) and the model file
    (``<prefix>_last.pth``). When ``device`` is ``"auto"``, it is replaced by
    a CUDA ``torch.device`` if CUDA is available, otherwise a CPU one.
    ``max_iter`` is stored as ``self.nb_epoch``.
    """
    Path("data").mkdir(parents=True, exist_ok=True)

    # Template miner persisted to a file derived from the prefix.
    self.persistence_path = prefix_file + "_templates_persist.bin"
    state_store = FilePersistence(self.persistence_path)
    miner_config = TemplateMinerConfig()
    miner_config.load("ailoganalyzer/drain3.ini")
    miner_config.profiling_enabled = False
    self.template_miner = TemplateMiner(state_store, miner_config)

    if device == "auto":
        backend = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(backend)

    super().__init__(model_name)

    # Hyper-parameters.
    self.prefix_file = prefix_file
    self.num_candidates = num_candidates
    self.window_size = window_size
    self.device = device
    self.lr = lr
    self.lr_step = lr_step
    self.lr_decay_ratio = lr_decay_ratio
    self.nb_epoch = max_iter

    # Feature flags; all start disabled.
    self.semantic = self.sequentials = self.quantitatives = False

    # Model and data state.
    self.model = None
    self.sequence = []
    self.train_seq = []
    self.train_loader = self.valid_loader = None
    self.model_path = self.prefix_file + "_last.pth"
def parse_file_drain3(data: DefaultDict) -> Dict:
    """Mine templates for every log line and build the log structure.

    ``data`` maps a block id to an iterable of raw log strings. For each
    string, trailing whitespace is stripped and only the text after the
    first ``': '`` is kept (an empty string when the delimiter is absent).
    That text is fed to a fresh ``TemplateMiner``; the per-block lines and
    cluster ids, together with the miner's clusters, are handed to
    ``get_log_structure`` whose result is returned.
    """
    miner = TemplateMiner()
    ids_by_block = defaultdict(list)
    lines_by_block = defaultdict(list)

    for block_id, block_logs in data.items():
        for raw in block_logs:
            # partition() yields (pre, delimiter, post); only post is used.
            _, _, message = raw.rstrip().partition(': ')
            mined = miner.add_log_message(message)
            ids_by_block[block_id].append(mined['cluster_id'])
            lines_by_block[block_id].append(message)

    return get_log_structure(lines_by_block, ids_by_block,
                             miner.drain.clusters)
def test_get_param_list(self):
    """Feed messages one by one and compare ``get_parameter_list`` output.

    The miner is configured with a single ``NUM`` masking instruction and
    ``[:`` / ``:]`` as mask prefix / suffix. Each message is first added to
    the miner; parameters are then extracted against the template returned
    by that very call.
    """
    config = TemplateMinerConfig()
    config.masking_instructions.append(
        MaskingInstruction(
            "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)",
            "NUM"))
    config.mask_prefix = "[:"
    config.mask_suffix = ":]"
    template_miner = TemplateMiner(None, config)

    # (message, expected parameter list); order matters because the miner
    # learns incrementally.
    cases = [
        ("hello", []),
        ("hello ABC", []),
        ("hello BCD", ["BCD"]),
        ("request took 123 ms", ["123"]),
        ("file saved [test.xml]", []),
        ("new order received: [:xyz:]", []),
        ("order type: new, order priority:3", ["3"]),
        ("order type: changed, order priority:5", ["changed,", "5"]),
    ]

    for msg, expected_params in cases:
        print(f"msg: {msg}")
        res = template_miner.add_log_message(msg)
        print(f"result: {res}")
        params = template_miner.get_parameter_list(res["template_mined"],
                                                   msg)
        print(f"params: {params}")
        self.assertListEqual(params, expected_params)
def test_extract_parameters(self):
    """Feed messages one by one and compare ``extract_parameters`` values.

    Two masking instructions are registered (``NUM`` and ``WORDS``) with
    ``[:`` / ``:]`` as mask prefix / suffix. Each message is added to the
    miner, then parameters are extracted against the template returned by
    that call; the result must not be ``None`` and its ``value`` fields
    must equal the expected list.
    """
    config = TemplateMinerConfig()
    for pattern, mask_name in (
        ("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM"),
        (r"multiple words", "WORDS"),
    ):
        config.masking_instructions.append(
            MaskingInstruction(pattern, mask_name))
    config.mask_prefix = "[:"
    config.mask_suffix = ":]"
    template_miner = TemplateMiner(None, config)

    # (message, expected values, exact_matching); order matters because
    # the miner learns incrementally.
    cases = [
        ("hello", [], False),
        ("hello ABC", [], False),
        ("hello BCD", ["BCD"], False),
        ("hello BCD", ["BCD"], False),
        ("hello\tBCD", ["BCD"], False),
        ("request took 123 ms", ["123"], False),
        ("file saved [test.xml]", [], False),
        ("new order received: [:xyz:]", [], False),
        ("order type: new, order priority:3", ["3"], False),
        ("order type: changed, order priority:5", ["changed,", "5"], False),
        ("sometimes one needs multiple words", ["multiple words"], True),
        ("sometimes one needs not", ["not"], True),
        ("sometimes one needs multiple words", ["multiple words"], True),
    ]

    for msg, expected_params, exact_matching in cases:
        print(f"msg: {msg}")
        res = template_miner.add_log_message(msg)
        print(f"result: {res}")
        extracted_parameters = template_miner.extract_parameters(
            res["template_mined"], msg, exact_matching=exact_matching)
        self.assertIsNotNone(extracted_parameters)
        params = [item.value for item in extracted_parameters]
        print(f"params: {params}")
        self.assertListEqual(params, expected_params)
def test_match_strategies(self):
    """Exercise ``match`` under the three ``full_search_strategy`` values.

    Covers a default-configured miner before and after an exact message has
    been learned, a miner with ``parametrize_numeric_tokens`` disabled, and
    the empty message before and after it has been added.
    """
    training_messages = (
        "training4Model start",
        "loadModel start",
        "loadModel stop",
        "this is a test",
    )

    def expect(current_miner, message, expectations):
        # expectations: ordered (strategy, should_match) pairs.
        for strategy, should_match in expectations:
            found = current_miner.match(message,
                                        full_search_strategy=strategy)
            if should_match:
                self.assertIsNotNone(found)
            else:
                self.assertIsNone(found)

    # Default configuration.
    miner = TemplateMiner()
    for message in training_messages:
        print(miner.add_log_message(message))
    miner.drain.print_tree()

    expect(miner, "loadModel start",
           [("fallback", True), ("always", True), ("never", False)])

    print(miner.add_log_message("loadModel start"))
    expect(miner, "loadModel start",
           [("fallback", True), ("always", True), ("never", True)])

    # Numeric-token parametrization switched off.
    config = TemplateMinerConfig()
    config.parametrize_numeric_tokens = False
    miner = TemplateMiner(config=config)
    for message in training_messages:
        print(miner.add_log_message(message))

    expect(miner, "loadModel start",
           [("fallback", True), ("always", True), ("never", True)])

    # Empty message: no match until it has been added.
    expect(miner, "",
           [("never", False), ("always", False), ("fallback", False)])
    print(miner.add_log_message(""))
    expect(miner, "",
           [("never", True), ("always", True), ("fallback", True)])
def test_match_only(self):
    """Train on four messages, then check ``match`` results without training.

    The miner uses ``_`` as an extra delimiter and a single ``NUM`` masking
    instruction. Each query is expected either to resolve to a given
    cluster id or to return ``None``.
    """
    config = TemplateMinerConfig()
    config.drain_extra_delimiters = ["_"]
    config.masking_instructions.append(
        MaskingInstruction(
            "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)",
            "NUM"))
    tm = TemplateMiner(None, config)

    for training_msg in ("aa aa aa", "aa aa bb", "xx yy zz", "rrr qqq 123"):
        print(tm.add_log_message(training_msg))

    # (query, expected cluster id) - None means "no cluster must match".
    expectations = [
        ("aa aa tt", 1),
        ("aa aa 12", 1),
        ("xx yy zz", 2),
        ("xx yy rr", None),
        ("nothing", None),
        ("rrr qqq 456 ", 3),
        ("rrr qqq 555.2", None),
        ("rrr qqq num", None),
    ]

    for query, expected_id in expectations:
        matched = tm.match(query)
        if expected_id is None:
            self.assertIsNone(matched)
        else:
            self.assertEqual(expected_id, matched.cluster_id)
def test_extract_parameters_direct(self):
    """Call ``extract_parameters`` directly on hand-written templates.

    No message is added to the miner here: each test vector supplies a
    template and a content string, and ``extract_parameters`` is invoked
    with ``exact_matching=True``. When the expected parameters are ``None``
    the call must return ``None``; otherwise the extracted ``value`` and
    ``mask_name`` sequences must equal the expected lists.
    """
    config = TemplateMinerConfig()
    # Masking instructions are registered in this order; several of them
    # share a mask name ("*words*", "integer").
    mi = MaskingInstruction(r"hdfs://[\w.:@-]*((/[\w.~%+-]+)+/?)?", "hdfs_uri")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"(?P<quote>[\"'`]).*?(?P=quote)", "quoted_string")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"((?P<p_0>[*_])\2{0,2}).*?\1", "markdown_emph")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"multiple \*word\* pattern", "*words*")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"some \S+ \S+ pattern", "*words*")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"(\d{1,3}\.){3}\d{1,3}", "ip")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"(?P<number>\d+)\.\d+", "float")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"0[xX][a-fA-F0-9]+", "integer")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"(?P<number>\d+)", "integer")
    config.masking_instructions.append(mi)
    # Mask names that coincide with the wildcard ("*") and with the mask
    # prefix character ("<").
    mi = MaskingInstruction(r"HelloWorld", "*")
    config.masking_instructions.append(mi)
    mi = MaskingInstruction(r"MaskPrefix", "<")
    config.masking_instructions.append(mi)
    template_miner = TemplateMiner(None, config)
    # Each vector is unpacked below as
    # (template, content, expected_parameters, expected_mask_names).
    # NOTE(review): the first vector has FIVE elements while the loop
    # unpacks four, which raises ValueError on the first iteration. It
    # looks like two vectors were fused (possibly by a credential
    # scrubber, given the "*****:*****@" text) - restore the intended
    # data before relying on this test.
    test_vectors = [
        ("<hdfs_uri>:<integer>+<integer>", "hdfs://*****:*****@<integer>", "some other cool pattern@0xe1f", ["some other cool pattern", "0xe1f"], ["*words*", "integer"]),
        ("Another test with <*words*> that includes <integer><integer> and <integer> <*> <integer>", "Another test with some other 0Xadded pattern that includes 500xc0ffee and 0X4 times 5", [
            "some other 0Xadded pattern", "50", "0xc0ffee", "0X4", "times", "5"
        ], ["*words*", "integer", "integer", "integer", "*", "integer"]),
        ("some <*words*> <*words*>", "some multiple *word* pattern some confusing *word* pattern", ["multiple *word* pattern", "some confusing *word* pattern"], ["*words*", "*words*"]),
        ("<*words*> <*>", "multiple *word* pattern <*words*>", ["multiple *word* pattern", "<*words*>"], ["*words*",
                                                                                                     "*"]),
        ("<*> <*>", "HelloWorld Test", ["HelloWorld", "Test"], ["*", "*"]),
        ("<*> <*>", "HelloWorld <anything>", ["HelloWorld", "<anything>"], ["*", "*"]),
        ("<*><integer>", "HelloWorld1", ["HelloWorld", "1"], ["*", "integer"]),
        ("<*> works <*>", "This works as-expected", ["This", "as-expected"], ["*", "*"]),
        ("<memory:<integer>>", "<memory:8>", ["8"], ["integer"]),
        ("<memory:<integer> <core:<float>>>", "<memory:8 <core:0.5>>", ["8", "0.5"], ["integer", "float"]),
        ("<*> <memory:<<integer> <core:<float>>>", "New: <memory:<8 <core:0.5>>", ["New:", "8", "0.5"], ["*", "integer", "float"]),
        ("<<>", "MaskPrefix", ["MaskPrefix"], ["<"]),
        ("<<<>>", "<MaskPrefix>", ["MaskPrefix"], ["<"]),
        ("There are no parameters here.", "There are no parameters here.", [], []),
        # Vectors below expect extract_parameters to return None.
        ("<float> <float>", "0.15 10.16 3.19", None, None),
        ("<float> <float>", "0.15 10.16 test 3.19", None, None),
        ("<memory:<<integer> <core:<float>>>", "<memory:8 <core:0.5>>", None, None),
        ("<<>", "<<>", None, None),
        ("<*words*> <*words*>", "0.15 0.15", None, None),
    ]
    for template, content, expected_parameters, expected_mask_names in test_vectors:
        # subTest keeps going after a failing vector and labels the failure.
        with self.subTest(template=template, content=content, expected_parameters=expected_parameters):
            extracted_parameters = template_miner.extract_parameters(
                template, content, exact_matching=True)
            if expected_parameters is None:
                self.assertIsNone(extracted_parameters)
            else:
                self.assertIsNotNone(extracted_parameters)
                self.assertListEqual([
                    parameter.value for parameter in extracted_parameters
                ], expected_parameters)
                self.assertListEqual([
                    parameter.mask_name for parameter in extracted_parameters
                ], expected_mask_names)
elif persistence_type == "REDIS": from drain3.redis_persistence import RedisPersistence persistence = RedisPersistence(redis_host='', redis_port=25061, redis_db=0, redis_pass='', is_ssl=True, redis_key="drain3_state_key") else: persistence = None config = TemplateMinerConfig() config.load(dirname(__file__) + "/drain3.ini") template_miner = TemplateMiner(persistence, config) print(f"Drain3 started with '{persistence_type}' persistence") print(f"reading from std-in (input 'q' to finish)") while True: log_line = input("> ") if log_line == 'q': break result = template_miner.add_log_message(log_line) result_json = json.dumps(result) print(result_json) params = template_miner.get_parameter_list(result["template_mined"], log_line) print("parameters: " + str(params)) print("Clusters:") for cluster in template_miner.drain.clusters:
# persistence_type = "NONE" # persistence_type = "KAFKA" persistence_type = "FILE" config = configparser.ConfigParser() config.read('drain3.ini') logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') if persistence_type == "KAFKA": persistence = KafkaPersistence("localhost:9092", "drain3_state") elif persistence_type == "FILE": persistence = FilePersistence("results/{}/drain3_state[{}].bin".format(log_type,log_type)) else: persistence = None template_miner = TemplateMiner(persistence) print(f"Drain3 started with '{persistence_type}' persistence") df = pd.read_csv("/container/drain3/parser/results/{}/{}_struct.csv".format(log_type,log_file))# content=df.loc[:,'Content'] for idx in content.index: #component=line['Component'] #level=line['Level'] result=template_miner.add_log_message(content[idx]) result_json = json.dumps(result) #print(result_json) print("Clusters:") for cluster in template_miner.drain.clusters:
from drain3.redis_persistence import RedisPersistence persistence = RedisPersistence(redis_host='', redis_port=25061, redis_db=0, redis_pass='', is_ssl=True, redis_key="drain3_state_key") else: persistence = None config = TemplateMinerConfig() config.load(dirname(__file__) + "/drain3.ini") config.profiling_enabled = False template_miner = TemplateMiner(persistence, config) print(f"Drain3 started with '{persistence_type}' persistence") print(f"{len(config.masking_instructions)} masking instructions are in use") print(f"Starting training mode. Reading from std-in ('q' to finish)") while True: log_line = input("> ") if log_line == 'q': break result = template_miner.add_log_message(log_line) result_json = json.dumps(result) print(result_json) template = result["template_mined"] params = template_miner.extract_parameters(template, log_line) print("Parameters: " + str(params)) print("Training done. Mined clusters:")
def evaluate_HDFS(self, train=True):
    """Evaluate the detector on the HDFS_1 data set and print the metrics.

    The template miner is replaced by a fresh, non-persistent one. Up to
    ``nb_block`` block ids are sampled from the label file; the log file is
    parsed into one cluster-id sequence per sampled block; 80% of the
    normal blocks are used for training (when ``train`` is true) and the
    remaining normal blocks plus all abnormal blocks are used for testing.
    A block counts as flagged as soon as one of its windows is predicted
    abnormal.

    Parameters
    ----------
    train : bool
        When true, build the data loaders from the training split and call
        ``self.train()`` before predicting.
    """
    config = TemplateMinerConfig()
    config.load("ailoganalyzer/drain3.ini")
    config.profiling_enabled = False
    self.template_miner = TemplateMiner(config=config)

    hdfs_log = "../../Documents/HDFS_1/HDFS.log"
    hdfs_anomaly_label = "../../Documents/HDFS_1/anomaly_label.csv"
    nb_block = 30000

    with open(hdfs_anomaly_label, "r") as f:
        hdfs_labels = {}
        for line in tqdm(f, total=nb_block):
            label = line.strip().split(",")
            hdfs_labels[label[0]] = (label[1] == "Anomaly")

    # Fix: random.sample raises ValueError when asked for more items than
    # exist, so cap the sample size at the number of labelled blocks.
    keys = random.sample(list(hdfs_labels), min(nb_block, len(hdfs_labels)))
    hdfs_labels = {key: hdfs_labels[key] for key in keys}

    blk_finder_2 = re.compile(r"(blk_-?\d+)")
    with open(hdfs_log, "r") as f:
        data_dict = {key: [] for key in hdfs_labels}
        for line in tqdm(f):
            # Fix: lines without a block id used to crash on `.group()`.
            found = blk_finder_2.search(line)
            if found is None:
                continue
            blk = found.group()
            if blk in data_dict:
                msg = " ".join(line.strip().split()[5:])
                result = self.template_miner.add_log_message(msg)
                # Sequence keys are the drain3 cluster id minus one.
                data_dict[blk].append(result["cluster_id"] - 1)

    abnormal = []
    normal = []
    abnormal_label = []
    normal_label = []
    for blk, seq in data_dict.items():
        if len(seq) > self.window_size:
            # Window i (seq[i:i + window_size]) is paired with the key
            # that follows it, seq[i + window_size].
            labels = seq[self.window_size:]
            seqs = sliding_window_view(seq[:-1], self.window_size)
            if hdfs_labels[blk]:
                abnormal.append(seqs)
                abnormal_label.append(labels)
            else:
                normal.append(seqs)
                normal_label.append(labels)

    print("normal : ", len(normal))
    print("abnormal : ", len(abnormal))

    train_seq, test_seq, train_label, test_label = train_test_split(
        normal, normal_label, train_size=0.8)

    train_seq = np.concatenate(train_seq)
    train_label = np.concatenate(train_label)

    if train:
        self.set_dataLoader_training_1(train_seq, train_label)
        self.train()

    # Predictions are memoised per (window, label) pair.
    mem = {}

    def count_flagged_blocks(blocks, block_labels):
        # Number of blocks with at least one window predicted abnormal.
        flagged = 0
        for seqs, labels in tqdm(zip(blocks, block_labels),
                                 total=len(blocks)):
            for seq, label in zip(seqs, labels):
                # Fix: `seq + [label]` on an ndarray window is element-wise
                # addition, not concatenation, so distinct (window, label)
                # pairs could share a memo key. Build the key explicitly.
                seq_tuple = tuple(seq) + (label,)
                if seq_tuple in mem:
                    result = mem[seq_tuple]
                else:
                    result = self.predict_seq(seq, label)
                    mem[seq_tuple] = result
                if result:
                    flagged += 1
                    break
        return flagged

    FP = count_flagged_blocks(test_seq, test_label)
    TP = count_flagged_blocks(abnormal, abnormal_label)

    FN = len(abnormal) - TP
    # Fix: guard the ratios so a run that flags nothing (or has no abnormal
    # block) reports 0 instead of raising ZeroDivisionError.
    P = 100 * TP / (TP + FP) if TP + FP else 0.0
    R = 100 * TP / (TP + FN) if TP + FN else 0.0
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    print('''false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'''.format(FP, FN, P, R, F1))
class LSTMLogSequence(AnomalyDetector):
    """LSTM-based anomaly detector working on sequences of log template ids.

    Logs are turned into template (cluster) ids with drain3; a window of
    ``window_size`` consecutive ids is used to predict the next id, and a
    log is reported as abnormal when its id is not among the
    ``num_candidates`` most likely predictions.

    Attributes
    ----------
    prefix_file : str
        the string which will be added at the beginning of the persistent
        file of drain3, and of the .pth file of the model
    num_candidates : int
        for prediction phase : the number of possible candidates for a log.
        The lower the value, the more sensitive the detection
    window_size : int
        the window size to use for the LSTM model
    device : {'cpu', 'cuda', 'auto'}
        the device to be used to train the model and predict. 'auto' will
        use cuda if ``torch.cuda.is_available()``, else it will use cpu.
    lr : float
        learning rate for training.

    Methods
    -------
    add_train_log(log)
        add a log that will be used the next time train() will be called.
        The logs have to be added in the correct order.
    predict(log)
        return True if the log is abnormal, False otherwise
    train()
        train the model with the data added via the add_train_log function
    """

    def __init__(self, prefix_file, model_name, num_candidates, window_size,
                 device, lr, lr_step, lr_decay_ratio, max_iter):
        """Create the file-persisted template miner and store the settings.

        ``max_iter`` is stored as ``self.nb_epoch``. The model itself is
        only built by ``initialize_model``.
        """
        Path("data").mkdir(parents=True, exist_ok=True)

        self.persistence_path = prefix_file + "_templates_persist.bin"
        persistence = FilePersistence(self.persistence_path)
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(persistence, config)

        if device == "auto":
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")

        super().__init__(model_name)

        self.prefix_file = prefix_file
        self.num_candidates = num_candidates
        self.window_size = window_size
        self.device = device
        self.lr = lr
        self.lr_step = lr_step
        self.lr_decay_ratio = lr_decay_ratio
        self.nb_epoch = max_iter

        self.semantic = False
        self.sequentials = False
        self.quantitatives = False
        self.model = None
        self.sequence = []
        self.train_seq = []
        self.train_loader = None
        self.valid_loader = None
        self.model_path = self.prefix_file + "_last.pth"

    def add_train_log(self, log):
        """Mine ``log`` and append its key to the pending training sequence."""
        cluster_id = self.log_to_key(log)
        self.train_seq.append(cluster_id)

    def predict(self, log):
        """Return True when ``log`` is judged abnormal given the recent window.

        Returns False while fewer than ``window_size`` logs have been seen.
        Any pending training sequence is discarded on the first call.
        """
        cluster_id = self.log_to_key(log)
        if len(self.train_seq) > 0:
            self.train_seq = []
        self.sequence = np.array(self.sequence)
        label = np.array([cluster_id])

        if len(self.sequence) == self.window_size:
            res = self.predict_seq(self.sequence, label)
            # Slide the window: drop the oldest key before appending.
            self.sequence = self.sequence[1:]
        else:
            res = False
        self.sequence = np.append(self.sequence, cluster_id)
        return res

    def initialize_model(self):
        """Build the network for ``self.model_name`` and load saved weights.

        When ``self.model_path`` exists, the number of classes and the
        weights come from that file and ``is_trained`` becomes True;
        otherwise the class count comes from the current templates.

        Raises
        ------
        NotImplementedError
            for any model name other than "loganomaly" or "deeplog".
        """
        state = None
        if os.path.isfile(self.model_path):
            # map_location lets a checkpoint saved on another device (e.g.
            # CUDA) be loaded on the device configured for this instance.
            state = torch.load(self.model_path, map_location=self.device)
            num_classes = state["num_keys"]
            self.is_trained = True
        else:
            num_classes = self.get_number_classes()
            self.is_trained = False
        self.num_classes = num_classes

        if self.model_name == "loganomaly":
            self.model = loganomaly(hidden_size=128,
                                    num_layers=2,
                                    num_keys=num_classes)
            self.input_size = 300
            self.semantic = True
            self.quantitatives = True
            self.batch_size = 256
        elif self.model_name == "deeplog":
            self.model = deeplog(hidden_size=64,
                                 num_layers=2,
                                 num_keys=num_classes)
            self.input_size = 1
            self.sequentials = True
            self.batch_size = 2048
        else:
            # Includes "robustlog", which is not implemented.
            raise NotImplementedError

        if state is not None:
            self.model.load_state_dict(state["state_dict"])

    def train(self):
        """Train the model and mark it as trained.

        Data loaders prepared beforehand (e.g. through
        ``set_dataLoader_training_1``) are used as they are; otherwise they
        are built from the logs added with ``add_train_log``.

        Raises
        ------
        RuntimeError
            when no loader exists and the pending sequence is too short to
            produce a single (window, label) sample.
        """
        if self.train_loader is None or self.valid_loader is None:
            # Fix: the length check used to run even when loaders had
            # already been prepared, which made evaluate_HDFS() fail. It
            # also let len == window_size through, which cannot yield any
            # training window.
            if len(self.train_seq) <= self.window_size:
                raise RuntimeError(
                    "There is not enought data for training. Add logs with the add_train_log function."
                )
            self.set_dataLoader_training()

        print("num classes:", self.num_classes)
        trainer = Trainer(self.model,
                          self.train_loader,
                          self.valid_loader,
                          self.num_classes,
                          self.prefix_file,
                          self.model_name,
                          self.window_size,
                          max_epoch=self.nb_epoch,
                          lr_step=self.lr_step,
                          model_path=self.model_path,
                          device=self.device)
        trainer.start_train()
        self.is_trained = True

    def set_dataLoader_training(self):
        """Build the loaders from the sequence collected by add_train_log."""
        # Fix: work on a local array so self.train_seq stays a list and
        # add_train_log() can keep appending to it afterwards.
        train_seq = np.array(self.train_seq)
        # Window i is paired with the key that immediately follows it.
        labels = train_seq[self.window_size:]
        sequences = sliding_window_view(train_seq[:-1], self.window_size)
        self.set_dataLoader_training_1(sequences, labels)

    def set_dataLoader_training_1(self, sequences, labels):
        """Initialise the model and build train/validation loaders (80/20)."""
        self.initialize_model()
        train_seq, val_seq, train_label, val_label = train_test_split(
            sequences, labels, train_size=0.8)
        print("number train sequences :", len(train_seq))
        print("number val sequences :", len(val_seq))
        self.num_classes = self.get_number_classes()

        event2vec = self.template_to_vec_all()
        train_dataset = sliddingWindowDataset(train_seq,
                                              train_label,
                                              self.window_size,
                                              event2vec,
                                              num_classes=self.num_classes,
                                              seq=self.sequentials,
                                              quan=self.quantitatives,
                                              sem=self.semantic)
        valid_dataset = sliddingWindowDataset(val_seq,
                                              val_label,
                                              self.window_size,
                                              event2vec,
                                              num_classes=self.num_classes,
                                              seq=self.sequentials,
                                              quan=self.quantitatives,
                                              sem=self.semantic)

        self.train_loader = DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       pin_memory=True)
        self.valid_loader = DataLoader(valid_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=False,
                                       pin_memory=True)

    def predict_seq(self, sequence, label):
        """Return True when ``label`` is not among the top predictions.

        Raises
        ------
        RuntimeError
            when no trained model is available.
        """
        # Fix: initialise first. `is_trained` is only assigned by
        # initialize_model()/train(), so checking it on a fresh instance
        # failed before a saved model had a chance to be loaded.
        if self.model is None:
            self.initialize_model()
        if not self.is_trained:
            raise RuntimeError("You need to train the model before predicting")

        sequence = sequence[np.newaxis]
        event2vec = self.template_to_vec_all()
        self.model = self.model.eval().to(self.device)

        label = np.array([label])
        dataset = sliddingWindowDataset(sequence,
                                        label,
                                        self.window_size,
                                        event2vec,
                                        num_classes=self.num_classes,
                                        seq=self.sequentials,
                                        quan=self.quantitatives,
                                        sem=self.semantic)
        data, label = dataset[0]
        features = [
            value[np.newaxis].to(self.device) for value in data.values()
        ]
        label = torch.tensor(label).view(-1).to(self.device)

        output = self.model(features=features, device=self.device)
        # Indices of the `num_candidates` highest scores.
        predicted = torch.argsort(output, 1)[0][-self.num_candidates:]
        return label not in predicted

    def evaluate_HDFS(self, train=True):
        """Evaluate the detector on the HDFS_1 data set and print the metrics.

        The template miner is replaced by a fresh, non-persistent one. Up
        to ``nb_block`` block ids are sampled from the label file; 80% of
        the normal blocks are used for training (when ``train`` is true),
        the rest plus all abnormal blocks for testing. A block counts as
        flagged as soon as one of its windows is predicted abnormal.
        """
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(config=config)

        hdfs_log = "../../Documents/HDFS_1/HDFS.log"
        hdfs_anomaly_label = "../../Documents/HDFS_1/anomaly_label.csv"
        nb_block = 30000

        with open(hdfs_anomaly_label, "r") as f:
            hdfs_labels = {}
            for line in tqdm(f, total=nb_block):
                label = line.strip().split(",")
                hdfs_labels[label[0]] = (label[1] == "Anomaly")

        # Fix: cap the sample size; random.sample raises ValueError when
        # asked for more items than exist.
        keys = random.sample(list(hdfs_labels),
                             min(nb_block, len(hdfs_labels)))
        hdfs_labels = {key: hdfs_labels[key] for key in keys}

        blk_finder_2 = re.compile(r"(blk_-?\d+)")
        with open(hdfs_log, "r") as f:
            data_dict = {key: [] for key in hdfs_labels}
            for line in tqdm(f):
                # Fix: lines without a block id used to crash on `.group()`.
                found = blk_finder_2.search(line)
                if found is None:
                    continue
                blk = found.group()
                if blk in data_dict:
                    msg = " ".join(line.strip().split()[5:])
                    result = self.template_miner.add_log_message(msg)
                    data_dict[blk].append(result["cluster_id"] - 1)

        abnormal = []
        normal = []
        abnormal_label = []
        normal_label = []
        for blk, seq in data_dict.items():
            if len(seq) > self.window_size:
                labels = seq[self.window_size:]
                seqs = sliding_window_view(seq[:-1], self.window_size)
                if hdfs_labels[blk]:
                    abnormal.append(seqs)
                    abnormal_label.append(labels)
                else:
                    normal.append(seqs)
                    normal_label.append(labels)

        print("normal : ", len(normal))
        print("abnormal : ", len(abnormal))

        train_seq, test_seq, train_label, test_label = train_test_split(
            normal, normal_label, train_size=0.8)

        train_seq = np.concatenate(train_seq)
        train_label = np.concatenate(train_label)

        if train:
            self.set_dataLoader_training_1(train_seq, train_label)
            self.train()

        # Predictions are memoised per (window, label) pair.
        mem = {}

        def count_flagged_blocks(blocks, block_labels):
            # Number of blocks with at least one window predicted abnormal.
            flagged = 0
            for seqs, labels in tqdm(zip(blocks, block_labels),
                                     total=len(blocks)):
                for seq, label in zip(seqs, labels):
                    # Fix: `seq + [label]` on an ndarray window is
                    # element-wise addition, so distinct pairs could share
                    # a memo key. Build the key explicitly.
                    seq_tuple = tuple(seq) + (label,)
                    if seq_tuple in mem:
                        result = mem[seq_tuple]
                    else:
                        result = self.predict_seq(seq, label)
                        mem[seq_tuple] = result
                    if result:
                        flagged += 1
                        break
            return flagged

        FP = count_flagged_blocks(test_seq, test_label)
        TP = count_flagged_blocks(abnormal, abnormal_label)

        FN = len(abnormal) - TP
        # Fix: avoid ZeroDivisionError when nothing is flagged or there is
        # no abnormal block.
        P = 100 * TP / (TP + FP) if TP + FP else 0.0
        R = 100 * TP / (TP + FN) if TP + FN else 0.0
        F1 = 2 * P * R / (P + R) if P + R else 0.0
        print('''false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'''.format(FP, FN, P, R, F1))

    # -------------- drain3 function -----------------

    def log_to_key(self, log):
        """Mine ``log`` and return its drain3 cluster id minus one."""
        result = self.template_miner.add_log_message(log)
        return result["cluster_id"] - 1

    def get_templates(self):
        """Return a generator over the template string of every cluster."""
        return (c.get_template() for c in self.template_miner.drain.clusters)

    def get_number_classes(self):
        """Return the number of templates currently known to the miner."""
        return sum(1 for _ in self.get_templates())

    def get_word_counter(self):
        """Count words over all templates, weighted by cluster size."""
        d = defaultdict(int)
        for cluster in self.template_miner.drain.clusters:
            for word in preprocess_template(cluster.get_template()):
                d[word] += cluster.size
        return d

    def template_to_vec_all(self):
        """Map every cluster id to its vector; key 0 holds a -1 placeholder."""
        d = {0: np.array([-1] * 300)}
        word_counter = self.get_word_counter()
        for cluster in self.template_miner.drain.clusters:
            d[cluster.cluster_id] = line_to_vec(cluster.get_template(),
                                                word_counter)
        return d

    def template_to_vec(self, templateID):
        """Return the vector of one template (id 0 gives the placeholder).

        Raises
        ------
        RuntimeError
            when no cluster has the requested id.
        """
        if templateID == 0:
            return np.array([-1] * 300)
        for cluster in self.template_miner.drain.clusters:
            if cluster.cluster_id == templateID:
                word_counter = self.get_word_counter()
                return line_to_vec(cluster.get_template(), word_counter)
        print(templateID)
        raise RuntimeError(f"unknown template id: {templateID}")

    def remove_system(self):
        """Delete the drain3 persistence file."""
        os.remove(self.persistence_path)
level=logging.INFO, format='%(message)s') in_gz_file = "SSH.tar.gz" in_log_file = "SSH.log" if not os.path.isfile(in_log_file): logger.info(f"Downloading file {in_gz_file}") p = subprocess.Popen( f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}", shell=True) p.wait() logger.info(f"Extracting file {in_gz_file}") p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True) p.wait() template_miner = TemplateMiner() line_count = 0 start_time = time.time() batch_start_time = start_time batch_size = 10000 with open(in_log_file) as f: for line in f: line = line.rstrip() line = line.partition(": ")[2] result = template_miner.add_log_message(line) line_count += 1 if line_count % batch_size == 0: time_took = time.time() - batch_start_time rate = batch_size / time_took logger.info(
in_gz_file = "SSH.tar.gz" in_log_file = "SSH.log" if not os.path.isfile(in_log_file): logger.info(f"Downloading file {in_gz_file}") p = subprocess.Popen( f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}", shell=True) p.wait() logger.info(f"Extracting file {in_gz_file}") p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True) p.wait() config = TemplateMinerConfig() config.load(dirname(__file__) + "/drain3.ini") config.profiling_enabled = True template_miner = TemplateMiner(config=config) line_count = 0 with open(in_log_file) as f: lines = f.readlines() start_time = time.time() batch_start_time = start_time batch_size = 10000 for line in lines: line = line.rstrip() line = line.partition(": ")[2] result = template_miner.add_log_message(line) line_count += 1
config = configparser.ConfigParser()
config.read('drain3.ini')

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format='%(message)s')

# Select where the miner state is persisted.
if persistence_type == "KAFKA":
    persistence = KafkaPersistence("localhost:9092", "drain3_state")
elif persistence_type == "FILE":
    persistence = FilePersistence("drain3_state.bin")
else:
    persistence = None

template_miner = TemplateMiner(persistence)
print(
    f"Drain3 started with '{persistence_type}' persistence, reading from std-in (input 'q' to finish)"
)

# Every mining result is echoed to stdout and appended to record.txt as
# one JSON document per line. The `with` block closes the file, so no
# explicit close() is needed.
with open("./record.txt", "w", encoding='UTF-8') as record:
    while True:
        try:
            log_line = input()
        except EOFError:
            # stdin exhausted (e.g. piped input without a final 'q'):
            # finish cleanly instead of crashing.
            break
        if log_line == 'q':
            break
        result = template_miner.add_log_message(log_line)
        result_json = json.dumps(result)
        record.write(result_json + '\n')
        print(result_json)

print("Clusters:")
class LogPreprocessor:
    """Clean raw log records and mine drain3 templates from them."""

    def __init__(self, logs: pd.DataFrame):
        """Store the raw logs and create a fresh template miner.

        Parameters
        ----------
        logs : pd.DataFrame
            Raw log records. ``standardize`` reads its ``timestamp`` and
            ``log`` columns.
        """
        self.logs = logs
        self.template_miner = TemplateMiner()
        # Fix: the original assigned the ``pd.DataFrame`` class itself
        # instead of an empty frame.
        self.cleaned_logs = pd.DataFrame()
        self.clusters = {}
        self.results = {}
        self.n_clusters = 0

    @staticmethod
    def clean_solr_logs(s: str) -> str:
        """Re-space a 32/33-character string mentioning 'zoo' or 'solr'.

        For such strings the character at index 8 is replaced by a space
        and another space is inserted before index 22; any other string is
        returned unchanged.
        """
        if len(s) in (32, 33) and ('zoo' in s or 'solr' in s):
            s = s[:8] + ' ' + s[9:22] + ' ' + s[22:]
        return s

    def standardize(self, logs: pd.DataFrame) -> pd.DataFrame:
        """Parse the ``timestamp`` column and strip timestamps from ``log``.

        The frame is modified in place and also returned.
        """
        fmt = '%Y-%m-%dT%H:%M:%S.%f'
        logs['timestamp'] = pd.to_datetime(logs['timestamp'], format=fmt)
        logger.info('Standardizing log documents ...')
        # Remove timestamps embedded in the log text itself.
        logs['log'] = logs['log'].replace(
            to_replace=
            r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))',
            value='',
            regex=True)
        # The solr clean-up (clean_solr_logs) and punctuation removal steps
        # are intentionally not applied here.
        logger.info('...complete!')
        return logs

    def generate_clusters(self):
        """Mine templates from the standardized logs.

        Stores the per-row mining results, the clusters and their count on
        the instance, dumps the whitespace-normalised templates to
        ``/results/clean_clusters.joblib`` and returns
        ``(cleaned_templates, clusters)``.
        """
        self.cleaned_logs = self.standardize(self.logs)
        logger.info('Generating log templates ...')
        for idx, row in enumerate(self.cleaned_logs.itertuples()):
            self.results[idx] = self.template_miner.add_log_message(row.log)
        self.clusters = self.template_miner.drain.clusters
        self.n_clusters = len(self.template_miner.drain.clusters)
        # Collapse runs of spaces in every template.
        cleaned_clusters = [
            re.sub(pattern=r' +', repl=' ', string=cluster.get_template())
            for cluster in self.template_miner.drain.clusters
        ]
        logger.info('...complete!')
        joblib.dump(cleaned_clusters, '/results/clean_clusters.joblib')
        return cleaned_clusters, self.template_miner.drain.clusters

    def generate_word_embeddings(self):
        """Generate embeddings from freshly mined or cached templates.

        Templates are re-mined when the ``GENERATE_NEW_DRAIN`` environment
        variable equals ``"yes"`` and loaded from
        ``/results/clean_clusters.joblib`` otherwise.

        Raises
        ------
        KeyError
            when ``GENERATE_NEW_DRAIN`` is not set.
        """
        logger.info('Generating Word Embeddings ...')
        if os.environ["GENERATE_NEW_DRAIN"] == "yes":
            clusters, _ = self.generate_clusters()
        else:
            clusters = joblib.load('/results/clean_clusters.joblib')
        # NOTE(review): `word_2_vec` is not assigned anywhere in this
        # class - presumably provided by a subclass or set externally;
        # confirm before calling.
        self.word_2_vec.corpus = clusters
        self.word_2_vec.generate_embeddings()