def match(self, log_filepath, template_filepath):
    """Match every message in a log file against known event templates.

    Loads the log with the configured format, builds a match tree from the
    CSV templates, annotates each line with its matched template and a short
    content hash as EventId, dumps the result, and returns the dataframe.
    """
    print('Processing log file: {}...'.format(log_filepath))
    start_time = datetime.now()

    loader = logloader.LogLoader(self.logformat, self.n_workers)
    log_dataframe = loader.load_to_dataframe(log_filepath)

    print('Reading templates from {}...'.format(template_filepath))
    templates = self._read_template_from_csv(template_filepath)

    print('Building match tree...')
    match_tree = self._build_match_tree(templates)

    print('Matching event templates...')
    contents = log_dataframe['Content']
    match_dict = self.match_event(match_tree, contents.tolist())

    # EventId is the first 8 hex chars of the md5 of the raw message content.
    def _short_hash(text):
        return hashlib.md5(text.encode('utf-8')).hexdigest()[:8]

    log_dataframe['EventTemplate'] = contents.map(match_dict)
    log_dataframe['EventId'] = contents.map(_short_hash)

    # Write the CSV result next to the other outputs.
    self._dump_match_result(os.path.basename(log_filepath), log_dataframe)

    matched = log_dataframe['EventTemplate'] != 'NoMatch'
    match_rate = sum(matched) / float(len(log_dataframe))
    print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(
        match_rate, datetime.now() - start_time))
    return log_dataframe
def match(self, log_filepath, template_filepath):
    """Match every message in a log file against known event templates.

    Loads the templates from CSV, loads the log into a dataframe, matches
    each line to an (EventId, EventTemplate) pair, dumps the annotated
    dataframe, and prints the match rate.

    Returns:
        The annotated log dataframe.  (BUG FIX: the original computed the
        dataframe but never returned it, inconsistent with the sibling
        ``match`` implementations in this file which return it to callers.)
    """
    print('Processing log file: {}'.format(log_filepath))
    start_time = datetime.now()
    loader = logloader.LogLoader(self.logformat, self.n_workers)
    self.read_template_from_csv(template_filepath)
    log_dataframe = loader.load_to_dataframe(log_filepath)
    print('Matching event templates...')
    match_list = self.match_event(log_dataframe['Content'].tolist())
    # match_list rows are (EventId, EventTemplate) tuples aligned with the
    # dataframe rows, so a positional concat attaches both columns at once.
    log_dataframe = pd.concat(
        [log_dataframe,
         pd.DataFrame(match_list, columns=['EventId', 'EventTemplate'])],
        axis=1)
    self._dump_match_result(os.path.basename(log_filepath), log_dataframe)
    # Lines that failed to match carry the sentinel EventId 'NONE'.
    match_rate = sum(log_dataframe['EventId'] != 'NONE') / float(len(log_dataframe))
    print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(
        match_rate, datetime.now() - start_time))
    return log_dataframe
def read_data(self, logname):
    """Load a log file and parse every line into templates and parameters.

    Populates ``self.log_dataframe`` with EventTemplate, ParameterList and
    EventId columns, keeping only the ``self.top_event`` most frequent
    wildcard-bearing templates (all other lines fall back to their raw
    content).  Records wall-clock timings in ``self.field_extraction_time``
    and ``self.parse_time``.
    """
    # ---------- field extraction ----------
    start_extract_time = time.time()
    loader = logloader.LogLoader(self.log_format, self.tmp_dir, self.n_workers)
    self.log_dataframe = loader.load_to_dataframe(logname)
    end_extract_time = time.time()
    self.field_extraction_time = end_extract_time - start_extract_time

    # ---------- parsing ----------
    start_parse_time = time.time()
    templates = []
    paras = []
    if self.n_workers == 1:
        templates, paras = parse_chunk(self.log_dataframe)
    else:
        # BUG FIX: the integer division yields 0 when the dataframe has fewer
        # rows than workers, and range(0, N, 0) raises
        # "ValueError: range() arg 3 must not be zero".  Clamp to at least 1.
        chunk_size = max(1, min(5000000,
                                self.log_dataframe.shape[0] // self.n_workers))
        pool = mp.Pool(processes=self.n_workers)
        result_chunks = [
            pool.apply_async(parse_chunk,
                             args=(self.log_dataframe.iloc[i:i + chunk_size],))
            for i in range(0, self.log_dataframe.shape[0], chunk_size)
        ]
        pool.close()
        pool.join()
        for result in result_chunks:
            chunk_templates, chunk_paras = result.get()
            templates.extend(chunk_templates)
            paras.extend(chunk_paras)
    print("Finish filter numbers.")

    # Keep only the top-N most frequent templates that contain a wildcard;
    # every other line falls back to its raw message content.
    self.log_dataframe['EventTemplate'] = templates
    self.log_dataframe['ParameterList'] = paras
    top_events = [t for t in
                  self.log_dataframe['EventTemplate'].value_counts().index
                  if "<*>" in t][0:self.top_event]
    false_index = ~self.log_dataframe["EventTemplate"].isin(top_events)
    self.log_dataframe.loc[false_index, "EventTemplate"] = \
        self.log_dataframe.loc[false_index, "Content"]
    self.log_dataframe.loc[false_index, "ParameterList"] = ""

    # Assign a stable E<idx> id per distinct template (order of first
    # appearance in the dataframe).
    self.template_eid_mapping = {
        evt: "E" + str(idx)
        for idx, evt in enumerate(self.log_dataframe['EventTemplate'].unique())
    }
    self.log_dataframe['EventId'] = self.log_dataframe['EventTemplate'].map(
        lambda x: self.template_eid_mapping[x])
    self.log_dataframe.drop(["LineId"], axis=1, inplace=True)
    end_parse_time = time.time()
    self.parse_time = end_parse_time - start_parse_time
def match(self, log_filepath, templates):
    """Match a log file against templates via a prefix match tree.

    Annotates each line with its matched EventTemplate, extracted
    ParameterList, and a dense EventId ("E1", "E2", ...) assigned in order
    of first template appearance.  Returns the annotated dataframe.
    """
    print('Processing log file: {}...'.format(log_filepath))
    start_time = datetime.now()

    loader = logloader.LogLoader(self.logformat, self.tmp_dir)
    log_dataframe = loader.load_to_dataframe(log_filepath)
    templates = self._read_templates(templates)

    print('Building match tree...')
    match_tree = self._build_match_tree(templates)

    print('Matching event templates...')
    contents = log_dataframe['Content']
    # In optimized mode, only distinct messages are matched against the tree;
    # the per-line lookup below then reuses those results.
    if self.optimized:
        candidates = contents.drop_duplicates().tolist()
    else:
        candidates = contents.tolist()
    match_dict = self.match_event(match_tree, candidates)

    # match_dict maps content -> (template, parameter list).
    log_dataframe['EventTemplate'] = contents.map(lambda c: match_dict[c][0])
    log_dataframe['ParameterList'] = contents.map(lambda c: match_dict[c][1])

    self.id_map = {
        tmpl: "E" + str(num)
        for num, tmpl in enumerate(log_dataframe['EventTemplate'].unique(), 1)
    }
    log_dataframe['EventId'] = log_dataframe['EventTemplate'].map(
        lambda t: self.id_map[t])

    matched = log_dataframe['EventTemplate'] != 'NoMatch'
    match_rate = sum(matched) / float(len(log_dataframe))
    print('Matching done, matching rate: {:.1%} [Time taken: {!s}]'.format(
        match_rate, datetime.now() - start_time))
    return log_dataframe
# Per-log-type header configuration; defaults assume an 8-field, single-line
# header with no boundary regex.
# NOTE(review): `Type`, `filepath`, and `template_path` are defined earlier in
# the file (not visible in this chunk) — confirm against the full script.
head_length = 8
is_multi = False
head_regex = None
if Type == "Windows":
    head_length = 4
    is_multi = False
    head_regex = None
if Type == "Zookeeper":
    head_length = 4
    is_multi = False
    head_regex = None
max_length = 300000  # presumably a cap on lines/length read by the loader — TODO confirm
loader = logloader.LogLoader(headLength=head_length, isMulti=is_multi, headRegex=head_regex, maxLength=max_length)
# Load the log: returns the parsed data plus the detected header fields and
# the delimiters that separate them.
LogData, Heads, HeadDelimers = loader.load_to_dataframe(filepath)
# print(LogData)
# print(Heads)
# print(HeadDelimers)
# Extract the header format and write it out.
Header = header.Header(head_length=head_length, is_multi=is_multi, head_bound=head_regex, template_path=template_path, heads=Heads, delimer=HeadDelimers)
Header.outputFormat()
# Extract templates and correlation