def test_match_only(self):
    config = TemplateMinerConfig()
    config.drain_extra_delimiters = ["_"]
    mi = MaskingInstruction(
        "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
    config.masking_instructions.append(mi)
    tm = TemplateMiner(None, config)

    res = tm.add_log_message("aa aa aa")
    print(res)

    res = tm.add_log_message("aa aa bb")
    print(res)

    res = tm.add_log_message("xx yy zz")
    print(res)

    res = tm.add_log_message("rrr qqq 123")
    print(res)

    c = tm.match("aa aa tt")
    self.assertEqual(1, c.cluster_id)

    c = tm.match("aa aa 12")
    self.assertEqual(1, c.cluster_id)

    c = tm.match("xx yy zz")
    self.assertEqual(2, c.cluster_id)

    c = tm.match("xx yy rr")
    self.assertIsNone(c)

    c = tm.match("nothing")
    self.assertIsNone(c)

    c = tm.match("rrr qqq 456 ")
    self.assertEqual(3, c.cluster_id)

    c = tm.match("rrr qqq 555.2")
    self.assertIsNone(c)

    c = tm.match("rrr qqq num")
    self.assertIsNone(c)
def save_load_snapshot(self, max_clusters):
    persistence = MemoryBufferPersistence()

    config = TemplateMinerConfig()
    config.drain_max_clusters = max_clusters
    template_miner1 = TemplateMiner(persistence, config)
    print(template_miner1.add_log_message("hello"))
    print(template_miner1.add_log_message("hello ABC"))
    print(template_miner1.add_log_message("hello BCD"))
    print(template_miner1.add_log_message("hello XYZ"))
    print(template_miner1.add_log_message("goodbye XYZ"))

    template_miner2 = TemplateMiner(persistence, config)

    self.assertListEqual(list(template_miner1.drain.id_to_cluster.keys()),
                         list(template_miner2.drain.id_to_cluster.keys()))

    self.assertListEqual(
        list(template_miner1.drain.root_node.key_to_child_node.keys()),
        list(template_miner2.drain.root_node.key_to_child_node.keys()))

    def get_tree_lines(template_miner):
        sio = io.StringIO()
        template_miner.drain.print_tree(sio)
        sio.seek(0)
        return sio.readlines()

    self.assertListEqual(get_tree_lines(template_miner1),
                         get_tree_lines(template_miner2))

    print(template_miner2.add_log_message("hello yyy"))
    print(template_miner2.add_log_message("goodbye ABC"))
def parse_file_drain3(data: DefaultDict) -> Dict:
    template_miner = TemplateMiner()
    cluster_ids = defaultdict(list)
    log_lines = defaultdict(list)
    for block_id, logs in data.items():
        for log in logs:
            # partition produces a (pre, delimiter, post) tuple; keep the message after ': '
            line = log.rstrip().partition(': ')[2]
            result = template_miner.add_log_message(line)
            cluster_ids[block_id].append(result['cluster_id'])
            log_lines[block_id].append(line)
    log_structure = get_log_structure(log_lines, cluster_ids,
                                      template_miner.drain.clusters)
    return log_structure
def test_match_strategies(self):
    miner = TemplateMiner()
    print(miner.add_log_message("training4Model start"))
    print(miner.add_log_message("loadModel start"))
    print(miner.add_log_message("loadModel stop"))
    print(miner.add_log_message("this is a test"))
    miner.drain.print_tree()
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="fallback"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="always"))
    self.assertIsNone(
        miner.match("loadModel start", full_search_strategy="never"))
    print(miner.add_log_message("loadModel start"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="fallback"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="always"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="never"))

    config = TemplateMinerConfig()
    config.parametrize_numeric_tokens = False
    miner = TemplateMiner(config=config)
    print(miner.add_log_message("training4Model start"))
    print(miner.add_log_message("loadModel start"))
    print(miner.add_log_message("loadModel stop"))
    print(miner.add_log_message("this is a test"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="fallback"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="always"))
    self.assertIsNotNone(
        miner.match("loadModel start", full_search_strategy="never"))

    self.assertIsNone(miner.match("", full_search_strategy="never"))
    self.assertIsNone(miner.match("", full_search_strategy="always"))
    self.assertIsNone(miner.match("", full_search_strategy="fallback"))
    print(miner.add_log_message(""))
    self.assertIsNotNone(miner.match("", full_search_strategy="never"))
    self.assertIsNotNone(miner.match("", full_search_strategy="always"))
    self.assertIsNotNone(miner.match("", full_search_strategy="fallback"))
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')

if persistence_type == "KAFKA":
    persistence = KafkaPersistence("localhost:9092", "drain3_state")
elif persistence_type == "FILE":
    persistence = FilePersistence("drain3_state.bin")
else:
    persistence = None

template_miner = TemplateMiner(persistence)
print(
    f"Drain3 started with '{persistence_type}' persistence, reading from std-in (input 'q' to finish)"
)

with open("./record.txt", "w", encoding='UTF-8') as record:
    while True:
        log_line = input()
        if log_line == 'q':
            break
        result = template_miner.add_log_message(log_line)
        result_json = json.dumps(result)
        record.write(result_json + '\n')
        print(result_json)

print("Clusters:")
for cluster in template_miner.drain.clusters:
    print(cluster)
persistence_type = "FILE" config = configparser.ConfigParser() config.read('drain3.ini') logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') if persistence_type == "KAFKA": persistence = KafkaPersistence("localhost:9092", "drain3_state") elif persistence_type == "FILE": persistence = FilePersistence("results/{}/drain3_state[{}].bin".format(log_type,log_type)) else: persistence = None template_miner = TemplateMiner(persistence) print(f"Drain3 started with '{persistence_type}' persistence") df = pd.read_csv("/container/drain3/parser/results/{}/{}_struct.csv".format(log_type,log_file))# content=df.loc[:,'Content'] for idx in content.index: #component=line['Component'] #level=line['Level'] result=template_miner.add_log_message(content[idx]) result_json = json.dumps(result) #print(result_json) print("Clusters:") for cluster in template_miner.drain.clusters: print(cluster)
class LSTMLogSequence(AnomalyDetector):
    """An abstract class for implementing anomaly detection models.

    ...

    Attributes
    ----------
    prefix_file : str
        the string prepended to the drain3 persistence file and to the
        .pth file of the model
    num_candidates : int
        for the prediction phase: the number of candidate keys considered
        for a log. The lower the value, the more sensitive the detection.
    window_size : int
        the window size to use for the LSTM model
    device : {'cpu', 'cuda', 'auto'}
        the device used to train the model and predict. 'cpu' works in every
        environment. To use 'cuda' you need a compatible graphics card and a
        proper CUDA installation. 'auto' uses cuda if it is available,
        otherwise cpu.
    lr : float
        learning rate for training.

    Methods
    -------
    add_train_log(log)
        add a log that will be used the next time train() is called.
        The logs have to be added in the correct order.
    predict(log)
        return True if the log is abnormal, False otherwise
    train()
        train the model with the data added via the add_train_log function
    """

    def __init__(self, prefix_file, model_name, num_candidates, window_size,
                 device, lr, lr_step, lr_decay_ratio, max_iter):
        Path("data").mkdir(parents=True, exist_ok=True)
        self.persistence_path = prefix_file + "_templates_persist.bin"
        persistence = FilePersistence(self.persistence_path)
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(persistence, config)

        if device == "auto":
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        super().__init__(model_name)
        self.prefix_file = prefix_file
        self.num_candidates = num_candidates
        self.window_size = window_size
        self.device = device
        self.lr = lr
        self.lr_step = lr_step
        self.lr_decay_ratio = lr_decay_ratio
        self.nb_epoch = max_iter

        self.semantic = False
        self.sequentials = False
        self.quantitatives = False

        self.model = None
        self.sequence = []
        self.train_seq = []
        self.train_loader = None
        self.valid_loader = None
        self.model_path = self.prefix_file + "_last.pth"

    def add_train_log(self, log):
        cluster_id = self.log_to_key(log)
        self.train_seq.append(cluster_id)

    def predict(self, log):
        cluster_id = self.log_to_key(log)
        if len(self.train_seq) > 0:
            self.train_seq = []
        self.sequence = np.array(self.sequence)
        label = np.array([cluster_id])
        if len(self.sequence) == self.window_size:
            res = self.predict_seq(self.sequence, label)
        else:
            res = False
        if len(self.sequence) == self.window_size:
            self.sequence = self.sequence[1:]
        self.sequence = np.append(self.sequence, cluster_id)
        return res

    def initialize_model(self):
        state = None
        if os.path.isfile(self.model_path):
            state = torch.load(self.model_path)
            num_classes = state["num_keys"]
            self.is_trained = True
        else:
            num_classes = self.get_number_classes()
            self.is_trained = False
        self.num_classes = num_classes

        if self.model_name == "loganomaly":
            self.model = loganomaly(hidden_size=128,
                                    num_layers=2,
                                    num_keys=num_classes)
            self.input_size = 300
            self.semantic = True
            self.quantitatives = True
            self.batch_size = 256
        elif self.model_name == "deeplog":
            self.model = deeplog(hidden_size=64,
                                 num_layers=2,
                                 num_keys=num_classes)
            self.input_size = 1
            self.sequentials = True
            self.batch_size = 2048
        elif self.model_name == "robustlog":
            raise NotImplementedError
        else:
            raise NotImplementedError

        if state is not None:
            self.model.load_state_dict(state["state_dict"])

    def train(self):
        if len(self.train_seq) < self.window_size:
            raise RuntimeError(
                "There is not enough data for training. "
                "Add logs with the add_train_log function.")
        if self.train_loader is None or self.valid_loader is None:
            self.set_dataLoader_training()
        print("num classes:", self.num_classes)
        trainer = Trainer(self.model,
                          self.train_loader,
                          self.valid_loader,
                          self.num_classes,
                          self.prefix_file,
                          self.model_name,
                          self.window_size,
                          max_epoch=self.nb_epoch,
                          lr_step=self.lr_step,
                          model_path=self.model_path,
                          device=self.device)
        trainer.start_train()
        self.is_trained = True

    def set_dataLoader_training(self):
        self.train_seq = np.array(self.train_seq)
        labels = self.train_seq[self.window_size:]
        sequences = sliding_window_view(self.train_seq[:-1], self.window_size)
        self.set_dataLoader_training_1(sequences, labels)

    def set_dataLoader_training_1(self, sequences, labels):
        self.initialize_model()
        train_seq, val_seq, train_label, val_label = train_test_split(
            sequences, labels, train_size=0.8)
        print("number train sequences :", len(train_seq))
        print("number val sequences :", len(val_seq))
        self.num_classes = self.get_number_classes()
        event2vec = self.template_to_vec_all()
        train_dataset = sliddingWindowDataset(train_seq,
                                              train_label,
                                              self.window_size,
                                              event2vec,
                                              num_classes=self.num_classes,
                                              seq=self.sequentials,
                                              quan=self.quantitatives,
                                              sem=self.semantic)
        valid_dataset = sliddingWindowDataset(val_seq,
                                              val_label,
                                              self.window_size,
                                              event2vec,
                                              num_classes=self.num_classes,
                                              seq=self.sequentials,
                                              quan=self.quantitatives,
                                              sem=self.semantic)

        self.train_loader = DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       pin_memory=True)
        self.valid_loader = DataLoader(valid_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=False,
                                       pin_memory=True)

    def predict_seq(self, sequence, label):
        if not self.is_trained:
            raise RuntimeError("You need to train the model before predicting")
        sequence = sequence[np.newaxis]
        event2vec = self.template_to_vec_all()
        if self.model is None:
            self.initialize_model()
        self.model = self.model.eval().to(self.device)
        label = np.array([label])

        dataset = sliddingWindowDataset(sequence,
                                        label,
                                        self.window_size,
                                        event2vec,
                                        num_classes=self.num_classes,
                                        seq=self.sequentials,
                                        quan=self.quantitatives,
                                        sem=self.semantic)
        data, label = dataset[0]
        features = []
        for value in data.values():
            features.append(value[np.newaxis].to(self.device))
        label = torch.tensor(label).view(-1).to(self.device)
        output = self.model(features=features, device=self.device)
        predicted = torch.argsort(output, 1)[0][-self.num_candidates:]
        if label not in predicted:
            return True
        else:
            return False

    def evaluate_HDFS(self, train=True):
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(config=config)

        hdfs_log = "../../Documents/HDFS_1/HDFS.log"
        hdfs_anomaly_label = "../../Documents/HDFS_1/anomaly_label.csv"
        nb_block = 30000

        with open(hdfs_anomaly_label, "r") as f:
            hdfs_labels = {}
            for i, line in tqdm(enumerate(f), total=nb_block):
                label = line.strip().split(",")
                hdfs_labels[label[0]] = (label[1] == "Anomaly")

        keys = random.sample(list(hdfs_labels), nb_block)
        values = [hdfs_labels[k] for k in keys]
        hdfs_labels = dict(zip(keys, values))

        blk_finder_2 = re.compile(r"(blk_-?\d+)")
        with open(hdfs_log, "r") as f:
            data_dict = {key: [] for key in hdfs_labels.keys()}
            for line in tqdm(f):
                blk = re.search(blk_finder_2, line).group()
                if blk in data_dict:
                    msg = " ".join(line.strip().split()[5:])
                    result = self.template_miner.add_log_message(msg)
                    cluster_id = result["cluster_id"] - 1
                    data_dict[blk].append(cluster_id)

        abnormal = []
        normal = []
        abnormal_label = []
        normal_label = []
        abnormal_blk = []
        for blk, seq in data_dict.items():
            if len(seq) > self.window_size:
                labels = seq[self.window_size:]
                seqs = sliding_window_view(seq[:-1], self.window_size)
                if hdfs_labels[blk]:
                    abnormal.append(seqs)
                    abnormal_label.append(labels)
                    abnormal_blk.append(blk)
                else:
                    normal.append(seqs)
                    normal_label.append(labels)

        print("normal : ", len(normal))
        print("abnormal : ", len(abnormal))
        train_seq, test_seq, train_label, test_label = train_test_split(
            normal, normal_label, train_size=0.8)
        train_seq = np.concatenate(train_seq)
        train_label = np.concatenate(train_label)

        if train:
            self.set_dataLoader_training_1(train_seq, train_label)
            self.train()

        # predict
        FP = 0
        TP = 0
        mem = {}
        for seqs, labels in tqdm(zip(test_seq, test_label), total=len(test_seq)):
            for seq, label in zip(seqs, labels):
                # memoization key: the window followed by its next key
                seq_tuple = tuple(seq) + (label,)
                if seq_tuple in mem:
                    result = mem[seq_tuple]
                else:
                    result = self.predict_seq(seq, label)
                    mem[seq_tuple] = result
                if result:
                    FP += 1
                    break
        for seqs, labels in tqdm(zip(abnormal, abnormal_label), total=len(abnormal)):
            for seq, label in zip(seqs, labels):
                seq_tuple = tuple(seq) + (label,)
                if seq_tuple in mem:
                    result = mem[seq_tuple]
                else:
                    result = self.predict_seq(seq, label)
                    mem[seq_tuple] = result
                if result:
                    TP += 1
                    break

        FN = len(abnormal) - TP
        P = 100 * TP / (TP + FP)
        R = 100 * TP / (TP + FN)
        F1 = 2 * P * R / (P + R)
        print('false positive (FP): {}, false negative (FN): {}, '
              'Precision: {:.3f}%, Recall: {:.3f}%, '
              'F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))

    # -------------- drain3 functions -----------------

    def log_to_key(self, log):
        result = self.template_miner.add_log_message(log)
        if result["change_type"] != "none":
            pass
        cluster_id = result["cluster_id"] - 1
        return cluster_id

    def get_templates(self):
        return (c.get_template() for c in self.template_miner.drain.clusters)

    def get_number_classes(self):
        return len(list(self.get_templates()))

    def get_word_counter(self):
        d = defaultdict(int)
        for cluster in self.template_miner.drain.clusters:
            for word in preprocess_template(cluster.get_template()):
                d[word] += cluster.size
        return d

    def template_to_vec_all(self):
        d = {}
        d[0] = np.array([-1] * 300)
        word_counter = self.get_word_counter()
        for cluster in self.template_miner.drain.clusters:
            template, template_id = cluster.get_template(), cluster.cluster_id
            d[template_id] = line_to_vec(template, word_counter)
        return d

    def template_to_vec(self, templateID):
        if templateID == 0:
            return np.array([-1] * 300)
        for cluster in self.template_miner.drain.clusters:
            if cluster.cluster_id == templateID:
                word_counter = self.get_word_counter()
                return line_to_vec(cluster.get_template(), word_counter)
        print(templateID)
        raise RuntimeError

    def remove_system(self):
        os.remove(self.persistence_path)
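# A minimal usage sketch of LSTMLogSequence, following the flow described in the
# class docstring above (add_train_log -> train -> predict). The prefix "demo",
# the hyper-parameter values, and the variables training_log_lines /
# incoming_log_lines are illustrative assumptions, not values from the original
# project; "deeplog" is one of the model names handled by initialize_model().
detector = LSTMLogSequence(prefix_file="demo", model_name="deeplog",
                           num_candidates=9, window_size=10, device="auto",
                           lr=0.001, lr_step=300, lr_decay_ratio=0.1,
                           max_iter=100)

# training logs have to be added in chronological order
for line in training_log_lines:      # assumed iterable of raw log message strings
    detector.add_train_log(line)
detector.train()

# predict() returns True when a log is considered abnormal
for line in incoming_log_lines:      # assumed stream of new log message strings
    if detector.predict(line):
        print("anomaly detected:", line)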
class LogPreprocessor:

    def __init__(self, logs: pd.DataFrame):
        self.logs = logs
        self.template_miner = TemplateMiner()
        self.cleaned_logs = pd.DataFrame()
        self.clusters = {}
        self.results = {}
        self.n_clusters = 0

    @staticmethod
    def clean_solr_logs(s: str) -> str:
        if len(s) == 33 or len(s) == 32:
            if 'zoo' in s or 'solr' in s:
                s = s[:8] + ' ' + s[9:22] + ' ' + s[22:]
        return s

    def standardize(self, logs: pd.DataFrame) -> pd.DataFrame:
        fmt = '%Y-%m-%dT%H:%M:%S.%f'
        logs['timestamp'] = pd.to_datetime(logs['timestamp'], format=fmt)
        logger.info('Standardizing log documents ...')
        # remove timestamps
        logs['log'] = logs['log'].replace(
            to_replace=r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))',
            value='',
            regex=True)
        # logs['log'] = logs['log'].apply(lambda log: self.clean_solr_logs(log))
        # remove punctuation
        # logs['log'] = logs['log'].replace(to_replace=r'[^\w\s]',
        #                                   value=' ',
        #                                   regex=True)
        logger.info('...complete!')
        return logs

    def generate_clusters(self):
        self.cleaned_logs = self.standardize(self.logs)
        logger.info('Generating log templates ...')
        for idx, row in enumerate(self.cleaned_logs.itertuples()):
            self.results[idx] = self.template_miner.add_log_message(row.log)
        self.clusters = self.template_miner.drain.clusters
        self.n_clusters = len(self.template_miner.drain.clusters)
        # cleaned_clusters = [re.sub(pattern=r'[^\w\s]',
        #                            repl=' ',
        #                            string=cluster.get_template())
        #                     for cluster in self.Drain.drain.clusters]
        cleaned_clusters = [
            re.sub(pattern=r' +', repl=' ', string=cluster.get_template())
            for cluster in self.template_miner.drain.clusters
        ]
        logger.info('...complete!')
        joblib.dump(cleaned_clusters, '/results/clean_clusters.joblib')
        return cleaned_clusters, self.template_miner.drain.clusters

    def generate_word_embeddings(self):
        logger.info('Generating Word Embeddings ...')
        if os.environ["GENERATE_NEW_DRAIN"] == "yes":
            clusters, _ = self.generate_clusters()
        else:
            clusters = joblib.load('/results/clean_clusters.joblib')
        self.word_2_vec.corpus = clusters
        self.word_2_vec.generate_embeddings()