Пример #1
0
    def match(self, log_filepath, template_filepath):
        """Match every line of `log_filepath` against the templates read
        from `template_filepath` and return the annotated dataframe.
        """
        print('Processing log file: {}...'.format(log_filepath))
        t0 = datetime.now()

        # Load the raw log into a structured dataframe.
        df = logloader.LogLoader(self.logformat,
                                 self.n_workers).load_to_dataframe(log_filepath)

        print('Reading templates from {}...'.format(template_filepath))
        templates = self._read_template_from_csv(template_filepath)

        print('Building match tree...')
        tree = self._build_match_tree(templates)

        print('Matching event templates...')
        contents = df['Content'].tolist()
        template_of = self.match_event(tree, contents)

        # Annotate each row with its matched template and a short
        # content-hash event id.
        df['EventTemplate'] = df['Content'].map(template_of)
        df['EventId'] = df['Content'].map(
            lambda msg: hashlib.md5(msg.encode('utf-8')).hexdigest()[:8])

        # Write the match result CSV.
        self._dump_match_result(os.path.basename(log_filepath), df)

        matched = (df['EventTemplate'] != 'NoMatch').sum()
        match_rate = matched / float(len(df))
        print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(
            match_rate,
            datetime.now() - t0))
        return df
Пример #2
0
 def match(self, log_filepath, template_filepath):
     """Match the log file against the templates read from a CSV file.

     Parameters
     ----------
     log_filepath : path of the raw log file to process.
     template_filepath : path of the CSV file holding event templates.

     Returns
     -------
     The structured log dataframe annotated with ``EventId`` and
     ``EventTemplate`` columns.
     """
     print('Processing log file: {}'.format(log_filepath))
     start_time = datetime.now()
     loader = logloader.LogLoader(self.logformat, self.n_workers)
     self.read_template_from_csv(template_filepath)
     log_dataframe = loader.load_to_dataframe(log_filepath)
     print('Matching event templates...')
     match_list = self.match_event(log_dataframe['Content'].tolist())
     # Align the matched (EventId, EventTemplate) pairs with the log rows.
     log_dataframe = pd.concat(
         [log_dataframe,
          pd.DataFrame(match_list, columns=['EventId', 'EventTemplate'])],
         axis=1)
     self._dump_match_result(os.path.basename(log_filepath), log_dataframe)
     # 'NONE' marks lines that matched no template.
     match_rate = sum(log_dataframe['EventId'] != 'NONE') / float(len(log_dataframe))
     print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(match_rate, datetime.now() - start_time))
     # Fix: previously the annotated dataframe was discarded; return it so
     # callers can consume the result (consistent with sibling match()
     # implementations that already return it).
     return log_dataframe
Пример #3
0
 def read_data(self, logname):
     """Load `logname` into ``self.log_dataframe``, parse each line into a
     template and parameter list (optionally in parallel), keep only the
     ``self.top_event`` most frequent wildcard templates, and assign event
     ids.  Timings are stored in ``self.field_extraction_time`` and
     ``self.parse_time``.
     """
     # ---- field extraction (timed) ----
     start_extract_time = time.time()
     loader = logloader.LogLoader(self.log_format, self.tmp_dir, self.n_workers)
     self.log_dataframe = loader.load_to_dataframe(logname)
     end_extract_time = time.time()
     self.field_extraction_time = end_extract_time - start_extract_time

     # ---- parsing (timed) ----
     start_parse_time = time.time()
     templates = []
     paras = []
     if self.n_workers == 1:
         templates, paras = parse_chunk(self.log_dataframe)
     else:
         # Clamp to at least 1: when the frame has fewer rows than workers,
         # shape[0] // n_workers == 0 and range(0, N, 0) raises ValueError.
         chunk_size = max(1, min(5000000,
                                 self.log_dataframe.shape[0] // self.n_workers))
         pool = mp.Pool(processes=self.n_workers)
         result_chunks = [pool.apply_async(parse_chunk,
                          args=(self.log_dataframe.iloc[i:i + chunk_size],))
                          for i in range(0, self.log_dataframe.shape[0], chunk_size)]
         pool.close()
         pool.join()
         for result in result_chunks:
             result = result.get()
             templates.extend(result[0])
             paras.extend(result[1])
     print("Finish filter numbers.")

     # Keep only the top `self.top_event` templates that contain a wildcard;
     # rows of any other template fall back to their raw content.
     self.log_dataframe['EventTemplate'] = templates
     self.log_dataframe['ParameterList'] = paras
     top_events = list(filter(lambda x: "<*>" in x,
                              self.log_dataframe['EventTemplate'].value_counts().index))[0: self.top_event]
     false_index = ~self.log_dataframe["EventTemplate"].isin(top_events)
     self.log_dataframe.loc[false_index, "EventTemplate"] = self.log_dataframe.loc[false_index, "Content"]
     self.log_dataframe.loc[false_index, "ParameterList"] = ""

     # Map each unique template to a stable event id E0, E1, ...
     self.template_eid_mapping = {evt: "E" + str(idx)
                                  for idx, evt in enumerate(self.log_dataframe['EventTemplate'].unique())}
     self.log_dataframe['EventId'] = self.log_dataframe['EventTemplate'].map(
         lambda x: self.template_eid_mapping[x])
     self.log_dataframe.drop(["LineId"], axis=1, inplace=True)
     end_parse_time = time.time()

     self.parse_time = end_parse_time - start_parse_time
Пример #4
0
    def match(self, log_filepath, templates):
        """Match log content against `templates` and return the dataframe
        annotated with EventTemplate, ParameterList and EventId columns.
        """
        print('Processing log file: {}...'.format(log_filepath))
        begin = datetime.now()

        frame = logloader.LogLoader(self.logformat,
                                    self.tmp_dir).load_to_dataframe(log_filepath)
        templates = self._read_templates(templates)

        print('Building match tree...')
        tree = self._build_match_tree(templates)

        print('Matching event templates...')
        # Optimized mode matches each distinct message only once.
        contents = (frame['Content'].drop_duplicates().tolist()
                    if self.optimized else frame['Content'].tolist())
        lookup = self.match_event(tree, contents)

        frame['EventTemplate'] = frame['Content'].map(lambda c: lookup[c][0])
        frame['ParameterList'] = frame['Content'].map(lambda c: lookup[c][1])

        # Assign ids E1, E2, ... in first-seen template order.
        self.id_map = {}
        for number, template in enumerate(frame['EventTemplate'].unique(), 1):
            self.id_map[template] = "E" + str(number)
        frame['EventId'] = frame['EventTemplate'].map(
            lambda t: self.id_map[t])

        unmatched = frame['EventTemplate'] != 'NoMatch'
        match_rate = unmatched.sum() / float(len(frame))
        print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(
            match_rate,
            datetime.now() - begin))
        return frame
Пример #5
0
        head_length = 8
        is_multi = False
        head_regex = None
    if Type == "Windows":
        head_length = 4
        is_multi = False
        head_regex = None
    if Type == "Zookeeper":
        head_length = 4
        is_multi = False
        head_regex = None

    max_length = 300000

    loader = logloader.LogLoader(headLength=head_length,
                                 isMulti=is_multi,
                                 headRegex=head_regex,
                                 maxLength=max_length)
    LogData, Heads, HeadDelimers = loader.load_to_dataframe(filepath)
    # print(LogData)
    # print(Heads)
    # print(HeadDelimers)
    #Extract head format
    Header = header.Header(head_length=head_length,
                           is_multi=is_multi,
                           head_bound=head_regex,
                           template_path=template_path,
                           heads=Heads,
                           delimer=HeadDelimers)
    Header.outputFormat()

    # Extract templates and correlation