def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',    # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # IP address
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'                  # numbers
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3   # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        # Drain is modified
        parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth,
                                 st=st, rex=regex, keep_para=keep_para, maxChild=1000)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.35  # Message type threshold
        parser = Spell.LogParser(indir=input_dir, outdir=output_dir, log_format=log_format,
                                 tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
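# A possible way to call parse_log(); the directories and file name below are
# placeholders, and Drain/Spell are assumed to be importable from the logparser
# package as in the other snippets.
parse_log(input_dir='../logs/', output_dir='./parsed/',
          log_file='system.log', parser_type='drain')
parse_log(input_dir='../logs/', output_dir='./parsed/',
          log_file='system.log', parser_type='spell')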
def parser(input_dir, output_dir, log_file, log_format, type='drain'):
    if type == 'spell':
        tau = 0.5  # Message type threshold (default: 0.5)
        regex = [
            r"(/[-\w]+)+",      # replace file path with *
            r"(?<=blk_)[-\d]+"  # replace block_id with *
        ]  # Regular expression list for optional preprocessing (default: [])
        parser = Spell.LogParser(indir=input_dir, outdir=output_dir, log_format=log_format,
                                 tau=tau, rex=regex, keep_para=False)
        parser.parse(log_file)
    elif type == 'drain':
        regex = [
            r"(?<=blk_)[-\d]+",     # block_id
            r'\d+\.\d+\.\d+\.\d+',  # IP
            r"(/[-\w]+)+",          # file path
            # r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
        ]
        # the hyper parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.5   # Similarity threshold
        depth = 5  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth,
                                 st=st, rex=regex, keep_para=False)
        parser.parse(log_file)
def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',    # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # IP address
        # r'/\w+( )$'
        r'\d+'                  # numbers
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3   # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth,
                                 st=st, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55  # Message type threshold
        parser = Spell.LogParser(indir=input_dir, outdir=output_dir, log_format=log_format,
                                 tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
def run_drain(input_dir, output_dir, log_file):
    log_format = '<Time>\t<Level>\t:<Content>'  # tab-delimited log format
    # No preprocessing regexes are passed here (rex defaults to [])
    st = 0.5   # Similarity threshold
    depth = 4  # Depth of all leaf nodes
    parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st)
    parser.parse(log_file)
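# Hypothetical call (paths and file name are placeholders): parse a
# tab-delimited "<Time>\t<Level>\t:<Content>" log with the settings above.
run_drain(input_dir='../logs/', output_dir='Drain_result/', log_file='app.log')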
# curIndex, output_dir, copyfile (shutil) and the Drain import are assumed to be
# defined earlier in the script.
input_dir_list = [
    '../logs/dropt/advisor/advisor.log',
    '../logs/dropt/service_kernel_logs/tomcat.log'
]  # The input log files
print("path:", os.path.split(input_dir_list[curIndex]))
input_dir, log_file = os.path.split(input_dir_list[curIndex])
# log_file = 'advisor.log'  # The input log file name
copyfile(input_dir_list[curIndex], output_dir + log_file)

log_format_list = [
    '<Date> <Time> <Level> <dash> <Content>',
    '<Date> <Time> <Level> <dash> <usr> <prj> <Content>'
]  # log format for each input file
log_format = log_format_list[curIndex]

# Regular expression list for optional preprocessing (default: [])
regex = [
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',                 # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$'   # Numbers
]
st = 0.5   # Similarity threshold
depth = 4  # Depth of all leaf nodes

parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth,
                         st=st, rex=regex, keep_para=False)
parser.parse(log_file)
#!/usr/bin/env python
import sys
sys.path.append('../')
from logparser import Drain

resume_training = True
input_dir = '../logs/HDFS/'   # The input directory of log file
output_dir = 'Drain_result/'  # The output directory of parsing results
history = "history"
log_file = 'HDFS_1k_1.log'    # The input log file name
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
# Regular expression list for optional preprocessing (default: [])
regex = [
    r'blk_(|-)[0-9]+',                                         # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',                  # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',   # Numbers
]
st = 0.5   # Similarity threshold
depth = 4  # Depth of all leaf nodes

parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth,
                         st=st, rex=regex, resume_training=resume_training, history=history)
parser.parse(log_file)
        '<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'regex': [r'([\w-]+\.){2,}[\w-]+'],
        'st': 0.7,
        'depth': 6
    },
}

benchmark_result = []
for dataset, setting in benchmark_settings.items():
    print('\n=== Evaluation on %s ===' % dataset)
    indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
    log_file = os.path.basename(setting['log_file'])

    parser = Drain.LogParser(log_format=setting['log_format'], indir=indir, outdir=output_dir,
                             rex=setting['regex'], depth=setting['depth'], st=setting['st'])
    parser.parse(log_file)

    F1_measure, accuracy = evaluator.evaluate(
        groundtruth=os.path.join(indir, log_file + '_structured.csv'),
        parsedresult=os.path.join(output_dir, log_file + '_structured.csv'))
    benchmark_result.append([dataset, F1_measure, accuracy])

print('\n=== Overall evaluation results ===')
df_result = pd.DataFrame(benchmark_result, columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result.T.to_csv('Drain_benchmark_result.csv')
import os
import time

from logparser import evaluator
from benchmark.Drain_benchmark import benchmark_settings
from logparser import Drain

output_dir = 'Drain_result/'  # The output directory of parsing results

one_setting = benchmark_settings['Android']
log_file = os.path.basename(one_setting['log_file'])
input_dir = os.path.join('../logs/', os.path.dirname(one_setting['log_file']))

parser = Drain.LogParser(
    log_format=one_setting['log_format'],
    indir=input_dir,
    outdir=output_dir,
    depth=one_setting['depth'],
    st=one_setting['st'],
    rex=one_setting['regex'],
    keep_para=False
)

start = time.perf_counter()
# time_elapsed = parser.parse(log_file)
time_elapsed = parser.parse('Android_5m.log')
end = time.perf_counter()

# F1_measure, accuracy = evaluator.evaluate(
#     groundtruth=os.path.join(input_dir, log_file + '_structured.csv'),
#     parsedresult=os.path.join(output_dir, log_file + '_structured.csv'),
# )
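# Hypothetical follow-up (not in the original script): report the measured
# wall-clock parsing time. What parser.parse() returns into time_elapsed depends
# on the Drain implementation in use, so only end - start is printed here.
print("Parsed Android_5m.log in {:.2f} s".format(end - start))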
def call_logParser(n, in_dir, out_dir, dep, st_v, rex_v):
    parser = Drain.LogParser(log_formats[n], indir=in_dir, outdir=out_dir,
                             depth=dep, st=st_v, rex=rex_v)
    parser.parse(log_files[n])
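# call_logParser() indexes into module-level log_formats and log_files lists
# that are not shown in this snippet. A minimal sketch of how they might look;
# the HDFS entries and paths below are illustrative, not from the original code.
log_formats = [
    '<Date> <Time> <Pid> <Level> <Component>: <Content>',  # e.g. HDFS
]
log_files = [
    'HDFS_2k.log',
]
call_logParser(0, '../logs/HDFS/', 'Drain_result/', 4, 0.5,
               [r'blk_(|-)[0-9]+', r'(\d+\.){3}\d+(:\d+)?'])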
def execute(
        directory='/Users/haraldott/Development/thesis/detector/data/openstack/sasho/raw/sorted_per_request/',
        file='combined',
        output='/Users/haraldott/Development/thesis/detector/data/openstack/utah/parsed/combined',
        logtype='OpenStackSasho',
        st=0.4,
        depth=2,
        full_output=False):
    settings = {
        'HDFS': {
            'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
            'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
            'st': 0.5,
            'depth': 4
        },
        'OpenStack': {
            'log_format': '<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
            'regex': [
                r'((\d+\.){3}\d+,?)+',
                r'/.+?\s',
                r'\d+',
                r'\[.*?\]',
                r'\[.*\]',
                r'\[.*\] \[.*\]',
                r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
                r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
                r'\(\/.*\)'
            ],
            'st': 0.2,
            'depth': 2
        },
        'OpenStackSasho': {
            'log_format': '<_id>,<_index>,<_score>,<_type>,<Hostname>,<user_id>,<project_domain>,<Timestamp>,<timestamp>,<log_level>,<Pid>,<Content>,<tenant_id>,<programname>,<request_id>,<python_module>,<Logger>,<user_domain>,<domain_id>,<http_status>,<http_method>,<http_version>,<http_url>,<chunk>,<next_retry_seconds>,<error>,<retry_time>,<message>,<chunk_id>,<worker>',
            'regex': [
                r'((\d+\.){3}\d+,?)+',
                r'/.+?\s',
                r'\d+',
                r'\[.*?\]',
                r'\[.*\]',
                r'\[.*\] \[.*\]',
                r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
                r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
                r'\(\/.*\)'
            ],
            'st': 0.2,
            'depth': 2
        },
    }
    try:
        log_format = settings[logtype]["log_format"]
        regex = settings[logtype]["regex"]
        # depth = settings[logtype]["depth"]
        # st = settings[logtype]["st"]
    except KeyError:
        print("log format does not exist")
        raise

    parser = Drain.LogParser(log_format, indir=directory, outdir=output, depth=depth, st=st, rex=regex)
    parser.parse(file, full_output)
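# A possible invocation reusing the built-in 'HDFS' setting; the paths and file
# name are placeholders, not from the original project. Note that st/depth passed
# here override the values in the settings table, since execute() only reads
# log_format and regex from it.
execute(directory='../logs/HDFS/',
        file='HDFS_2k.log',
        output='./parsed/HDFS_2k',
        logtype='HDFS',
        st=0.5,
        depth=4)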
def zip_file(filepath, outdir, log_format, template_file="", n_workers=2,
             level=3, lossy=False, top_event=2000, kernel="gz",
             compress_single=False, report_file="./report.csv"):
    # get_FileSize and the Drain import are helpers defined elsewhere in this module.
    time_start = time.time()

    # new tmp dirs
    logname = os.path.basename(filepath)
    timemark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
    tmp_dir = os.path.join(outdir, logname + "_tmp_" + timemark)
    print("Tmp files are in {}".format(tmp_dir))
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    if not template_file:
        """ 0. sampling """
        line_num = subprocess.check_output("wc -l {}".format(filepath), shell=True)
        line_num = int(line_num.split()[0])
        sample_num = 50000
        sample_file_path = filepath + ".sample"
        try:
            subprocess.check_output("gshuf -n{} {} > {}".format(
                sample_num, filepath, sample_file_path), shell=True)
        except subprocess.CalledProcessError:  # gshuf (GNU shuf on macOS) unavailable; fall back to shuf
            subprocess.check_output("shuf -n{} {} > {}".format(
                sample_num, filepath, sample_file_path), shell=True)

        """ 1. get template file """
        st = 0.5   # Similarity threshold
        depth = 4  # Depth of all leaf nodes
        regex = [
            r'blk_(|-)[0-9]+',                                        # block id
            r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',                 # IP
            r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
        ]
        parse_begin_time = time.time()
        parser = Drain.LogParser(log_format, outdir=tmp_dir, depth=depth, st=st, rex=regex)
        templates = parser.parse(sample_file_path)
        os.remove(sample_file_path)
        parse_end_time = time.time()

        template_file = os.path.join(tmp_dir, "log_templates.csv")
        with open(template_file, "w") as fw:
            [fw.write(item + "\n") for item in templates]
        print("Parser cost [{:.3f}s]".format(parse_end_time - parse_begin_time))

    # split files
    kb_per_chunk = int(get_FileSize(filepath) // n_workers) + 1
    cmd = "split -b {}k {} {}".format(kb_per_chunk, filepath,
                                      os.path.join(tmp_dir, f"{logname}_"))
    subprocess.call(cmd, stderr=subprocess.STDOUT, shell=True)

    # run subprocesses
    processes = []
    for idx, file in enumerate(
            sorted(glob.glob(os.path.join(tmp_dir, f"{logname}_*")))):
        script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "zipper_longgest.py")
        per_tmp_dir = os.path.join(tmp_dir, str(idx))
        cmd = ('python {} --file {} --log_format "{}" --level {} --lossy {} --template_file {}' +
               ' --tmp_dir {} --out_dir {} --compress_single {} --n_workers {}') \
            .format(script_path, file, log_format, level, lossy, template_file,
                    per_tmp_dir, per_tmp_dir, compress_single, n_workers)
        print(cmd)
        processes.append(
            subprocess.Popen(cmd, stderr=subprocess.STDOUT, shell=True))
    [p.wait() for p in processes]

    compressed_size = 0
    for idx in range(len(processes)):
        sub_outfile = glob.glob(os.path.join(tmp_dir, str(idx), "*logzip*"))[0]
        dst = os.path.join(
            outdir, os.path.basename(sub_outfile) + f".{idx+1}of{len(processes)}")
        shutil.move(sub_outfile, dst)
        compressed_size += get_FileSize(dst, "mb")
    [
        os.remove(chunk)
        for chunk in glob.glob(os.path.join(tmp_dir, f"{logname}_*"))
    ]

    original_size = get_FileSize(filepath, "mb")
    compress_ratio = round(original_size / compressed_size, 2)

    time_end = time.time()
    total_time_taken = time_end - time_start

    firstline = True
    if os.path.isfile(report_file):
        firstline = False
    with open(report_file, "a+") as fw:
        if firstline:
            fw.write(
                "timemark,logname,original_size,compressed_size,compress_ratio,time_taken,n_workers,compress_single\n"
            )
        fw.write(
            f"{timemark},{logname},{original_size},{compressed_size},{compress_ratio},{total_time_taken},{n_workers},{compress_single}\n"
        )
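# A possible way to drive zip_file(); the HDFS path, log format, and worker count
# below are placeholder values for illustration. zip_file() samples the input,
# learns templates with Drain, splits the log, and launches one
# zipper_longgest.py subprocess per chunk.
zip_file(filepath="../logs/HDFS/HDFS_2k.log",
         outdir="./zip_out/",
         log_format="<Date> <Time> <Pid> <Level> <Component>: <Content>",
         n_workers=2,
         level=3,
         kernel="gz")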