Example #1
def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # IP address
        r'(?<=Warning: we failed to resolve data source name )[\w\s]+',
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper-parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes

        # Drain is modified
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para,
                                 maxChild=1000)
        parser.parse(log_file)

    elif parser_type == "spell":
        tau = 0.35
        parser = Spell.LogParser(indir=input_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
        parser.parse(log_file)
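A minimal invocation sketch: it assumes the (modified) Drain and Spell parsers are importable from the logparser package, and the directories and file name below are placeholders:

from logparser import Drain, Spell  # assumed import

parse_log(input_dir='../logs/',     # placeholder input directory
          output_dir='parsed/',     # placeholder output directory
          log_file='system.log',    # placeholder log file name
          parser_type='drain')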
Example #2
def parser(input_dir, output_dir, log_file, log_format, type='drain'):
    if type == 'spell':
        tau = 0.5  # Message type threshold (default: 0.5)
        regex = [
            "(/[-\w]+)+",  #replace file path with *
            "(?<=blk_)[-\d]+"  #replace block_id with *
        ]  # Regular expression list for optional preprocessing (default: [])

        parser = Spell.LogParser(indir=input_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=False)
        parser.parse(log_file)

    elif type == 'drain':
        regex = [
            r"(?<=blk_)[-\d]+",  # block_id
            r'\d+\.\d+\.\d+\.\d+',  # IP
            r"(/[-\w]+)+",  # file path
            #r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
        ]
        # the hyper-parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.5  # Similarity threshold
        depth = 5  # Depth of all leaf nodes

        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=False)
        parser.parse(log_file)
Example #3
def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+',  # hexadecimal
        r'\d+\.\d+\.\d+\.\d+',  # IP address
        # r'/\w+( )$'
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper-parameters are set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format,
                                 indir=input_dir,
                                 outdir=output_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex,
                                 keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55
        parser = Spell.LogParser(indir=input_dir,
                                 outdir=output_dir,
                                 log_format=log_format,
                                 tau=tau,
                                 rex=regex,
                                 keep_para=keep_para)
        parser.parse(log_file)
Example #4
def run_drain(input_dir, output_dir, log_file):
    log_format = '<Time>\t<Level>\t:<Content>'  # tab-separated log format
    # Regular expression list for optional preprocessing (default: [])
    st = 0.5  # Similarity threshold
    depth = 4  # Depth of all leaf nodes

    parser = Drain.LogParser(log_format,
                             indir=input_dir,
                             outdir=output_dir,
                             depth=depth,
                             st=st)
    parser.parse(log_file)
Example #5
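This snippet relies on setup that is not shown; a minimal sketch of the assumed context follows (the values of curIndex and output_dir are placeholders):

import os
from shutil import copyfile

from logparser import Drain  # assumed import, as in the later examples

curIndex = 0                  # index into input_dir_list below (placeholder)
output_dir = 'Drain_result/'  # placeholder output directory
os.makedirs(output_dir, exist_ok=True)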
input_dir_list = [
    '../logs/dropt/advisor/advisor.log',
    '../logs/dropt/service_kernel_logs/tomcat.log'
]  # The input directory of log file
print("path:", os.path.split(input_dir_list[curIndex]))
input_dir, log_file = os.path.split(input_dir_list[curIndex])
#log_file   = 'advisor.log'  # The input log file name
copyfile(input_dir_list[curIndex], output_dir + log_file)

log_format_list = [
    '<Date> <Time> <Level> <dash> <Content>',
    '<Date> <Time> <Level> <dash> <usr> <prj> <Content>'
]  # log formats for the two input logs above
log_format = log_format_list[curIndex]
# Regular expression list for optional preprocessing (default: [])
regex = [
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$'  # Numbers
]
st = 0.5  # Similarity threshold
depth = 4  # Depth of all leaf nodes

parser = Drain.LogParser(log_format,
                         indir=input_dir,
                         outdir=output_dir,
                         depth=depth,
                         st=st,
                         rex=regex,
                         keep_para=False)
parser.parse(log_file)
Example #6
#!/usr/bin/env python
import sys
sys.path.append('../')
from logparser import Drain

resume_training = True
input_dir  = '../logs/HDFS/'  # The input directory of log file
output_dir = 'Drain_result/'  # The output directory of parsing results
history = "history"
log_file   = 'HDFS_1k_1.log'  # The input log file name
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
# Regular expression list for optional preprocessing (default: [])
regex      = [
    r'blk_(|-)[0-9]+' , # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$', # Numbers
]
st         = 0.5  # Similarity threshold
depth      = 4  # Depth of all leaf nodes

parser = Drain.LogParser(log_format,
                         indir=input_dir,
                         outdir=output_dir,
                         depth=depth,
                         st=st,
                         rex=regex,
                         resume_training=resume_training,
                         history=history)
parser.parse(log_file)
Example #7

        '<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>',
        'regex': [r'([\w-]+\.){2,}[\w-]+'],
        'st': 0.7,
        'depth': 6
    },
}

bechmark_result = []
for dataset, setting in benchmark_settings.items():
    print('\n=== Evaluation on %s ===' % dataset)
    indir = os.path.join(input_dir, os.path.dirname(setting['log_file']))
    log_file = os.path.basename(setting['log_file'])

    parser = Drain.LogParser(log_format=setting['log_format'],
                             indir=indir,
                             outdir=output_dir,
                             rex=setting['regex'],
                             depth=setting['depth'],
                             st=setting['st'])
    parser.parse(log_file)

    F1_measure, accuracy = evaluator.evaluate(
        groundtruth=os.path.join(indir, log_file + '_structured.csv'),
        parsedresult=os.path.join(output_dir, log_file + '_structured.csv'))
    bechmark_result.append([dataset, F1_measure, accuracy])

print('\n=== Overall evaluation results ===')
df_result = pd.DataFrame(bechmark_result,
                         columns=['Dataset', 'F1_measure', 'Accuracy'])
df_result.set_index('Dataset', inplace=True)
print(df_result)
df_result.T.to_csv('Drain_bechmark_result.csv')
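Since the head of benchmark_settings is cut off above, here is a sketch of what a single entry looks like, modeled on the visible tail and on the HDFS settings in the later execute() example (the log_file value is a placeholder):

benchmark_settings = {
    'HDFS': {
        'log_file': 'HDFS/HDFS_2k.log',  # placeholder path relative to input_dir
        'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
        'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
        'st': 0.5,
        'depth': 4
    },
    # ... one entry per dataset ...
}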
Example #8
import os
import time

from logparser import evaluator
from benchmark.Drain_benchmark import benchmark_settings
from logparser import Drain

output_dir = 'Drain_result/'  # The output directory of parsing results
one_setting = benchmark_settings['Android']
log_file = os.path.basename(one_setting['log_file'])
input_dir = os.path.join('../logs/', os.path.dirname(one_setting['log_file']))

parser = Drain.LogParser(
    log_format=one_setting['log_format'],
    indir=input_dir,
    outdir=output_dir,
    depth=one_setting['depth'],
    st=one_setting['st'],
    rex=one_setting['regex'],
    keep_para=False
)

start = time.perf_counter()
# time_elapsed = parser.parse(log_file)
time_elapsed = parser.parse('Android_5m.log')
end = time.perf_counter()

# F1_measure, accuracy = evaluator.evaluate(
#     groundtruth=os.path.join(input_dir, log_file + '_structured.csv'),
#     parsedresult=os.path.join(output_dir, log_file + '_structured.csv'),
# )
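A small follow-up sketch that reports the wall-clock time measured above; whether parse() itself returns an elapsed time (as the time_elapsed name suggests) depends on the Drain variant in use, so only the outer measurement is printed:

print("Wall-clock parsing time: {:.2f}s".format(end - start))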
Example #9
def call_logParser(n, in_dir, out_dir, dep, st_v, rex_v):
    parser = Drain.LogParser(log_formats[n],
                             indir=in_dir,
                             outdir=out_dir,
                             depth=dep,
                             st=st_v,
                             rex=rex_v)
    parser.parse(log_files[n])
Example #10
def execute(
        directory='/Users/haraldott/Development/thesis/detector/data/openstack/sasho/raw/sorted_per_request/',
        file='combined',
        output='/Users/haraldott/Development/thesis/detector/data/openstack/utah/parsed/combined',
        logtype='OpenStackSasho',
        st=0.4,
        depth=2,
        full_output=False):
    settings = {
        'HDFS': {
            'log_format': '<Date> <Time> <Pid> <Level> <Component>: <Content>',
            'regex': [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'],
            'st': 0.5,
            'depth': 4
        },
        'OpenStack': {
            'log_format':
            '<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>',
            'regex': [
                r'((\d+\.){3}\d+,?)+', r'/.+?\s', r'\d+', r'\[.*?\]',
                r'\[.*\]', r'\[.*\] \[.*\]',
                r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
                r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
                r'\(\/.*\)'
            ],
            'st': 0.2,
            'depth': 2
        },
        'OpenStackSasho': {
            'log_format':
            '<_id>,<_index>,<_score>,<_type>,<Hostname>,<user_id>,<project_domain>,<Timestamp>,<timestamp>,<log_level>,<Pid>,<Content>,<tenant_id>,<programname>,<request_id>,<python_module>,<Logger>,<user_domain>,<domain_id>,<http_status>,<http_method>,<http_version>,<http_url>,<chunk>,<next_retry_seconds>,<error>,<retry_time>,<message>,<chunk_id>,<worker>',
            'regex': [
                r'((\d+\.){3}\d+,?)+', r'/.+?\s', r'\d+', r'\[.*?\]',
                r'\[.*\]', r'\[.*\] \[.*\]',
                r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
                r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',
                r'\(\/.*\)'
            ],
            'st': 0.2,
            'depth': 2
        },
    }

    try:
        log_format = settings[logtype]["log_format"]
        regex = settings[logtype]["regex"]
        # depth = settings[logtype]["depth"]
        # st = settings[logtype]["st"]
    except KeyError:
        print("log format does not exist")
        raise

    parser = Drain.LogParser(log_format,
                             indir=directory,
                             outdir=output,
                             depth=depth,
                             st=st,
                             rex=regex)
    parser.parse(file, full_output)
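A minimal invocation sketch; the default arguments point at the author's local machine, so the values below are placeholders (the 'HDFS' key selects the settings entry defined above):

execute(directory='../logs/HDFS/',  # placeholder input directory
        file='HDFS_1k.log',         # placeholder log file name
        output='parsed/HDFS',       # placeholder output directory
        logtype='HDFS',
        st=0.5,                     # matches the commented-out HDFS setting
        depth=4)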
Example #11
# Standard-library modules used below; Drain and the get_FileSize helper are
# assumed to be provided by the surrounding project.
import glob
import os
import shutil
import subprocess
import time


def zip_file(filepath,
             outdir,
             log_format,
             template_file="",
             n_workers=2,
             level=3,
             lossy=False,
             top_event=2000,
             kernel="gz",
             compress_single=False,
             report_file="./report.csv"):
    time_start = time.time()

    # new tmp dirs
    logname = os.path.basename(filepath)
    timemark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
    tmp_dir = os.path.join(outdir, logname + "_tmp_" + timemark)
    print("Tmp files are in {}".format(tmp_dir))
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    if not template_file:
        """
        0. sampling
        """

        line_num = subprocess.check_output("wc -l {}".format(filepath),
                                           shell=True)
        line_num = int(line_num.split()[0])
        sample_num = 50000
        sample_file_path = filepath + ".sample"
        try:
            subprocess.check_output("gshuf -n{} {} > {}".format(
                sample_num, filepath, sample_file_path),
                                    shell=True)
        except subprocess.CalledProcessError:
            # gshuf (GNU coreutils on macOS) is unavailable; fall back to shuf
            subprocess.check_output("shuf -n{} {} > {}".format(
                sample_num, filepath, sample_file_path),
                                    shell=True)
        """
        1. get template file  
        """
        st = 0.5  # Similarity threshold
        depth = 4  # Depth of all leaf nodes
        regex = [
            r'blk_(|-)[0-9]+',  # block id
            r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
            r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
        ]

        parse_begin_time = time.time()
        parser = Drain.LogParser(log_format,
                                 outdir=tmp_dir,
                                 depth=depth,
                                 st=st,
                                 rex=regex)
        templates = parser.parse(sample_file_path)
        os.remove(sample_file_path)
        parse_end_time = time.time()
        template_file = os.path.join(tmp_dir, "log_templates.csv")
        with open(template_file, "w") as fw:
            [fw.write(item + "\n") for item in templates]
        print("Parser cost [{:.3f}s]".format(parse_end_time -
                                             parse_begin_time))

    # split files
    kb_per_chunk = int(get_FileSize(filepath) // n_workers) + 1
    cmd = "split -b {}k {} {}".format(kb_per_chunk, filepath,
                                      os.path.join(tmp_dir, f"{logname}_"))
    subprocess.call(cmd, stderr=subprocess.STDOUT, shell=True)

    # run subprocesses
    processes = []
    for idx, file in enumerate(
            sorted(glob.glob(os.path.join(tmp_dir, f"{logname}_*")))):
        script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "zipper_longgest.py")
        per_tmp_dir = os.path.join(tmp_dir, str(idx))
        cmd = ('python {} --file {} --log_format "{}" --level {} --lossy {}'
               ' --template_file {} --tmp_dir {} --out_dir {} --compress_single {}'
               ' --n_workers {}').format(script_path, file, log_format, level,
                                         lossy, template_file, per_tmp_dir,
                                         per_tmp_dir, compress_single, n_workers)
        print(cmd)
        processes.append(
            subprocess.Popen(cmd, stderr=subprocess.STDOUT, shell=True))
    [p.wait() for p in processes]

    compressed_size = 0
    for idx in range(len(processes)):
        sub_outfile = glob.glob(os.path.join(tmp_dir, str(idx), "*logzip*"))[0]
        dst = os.path.join(
            outdir,
            os.path.basename(sub_outfile) + f".{idx+1}of{len(processes)}")
        shutil.move(sub_outfile, dst)
        compressed_size += get_FileSize(dst, "mb")

    [
        os.remove(chunk)
        for chunk in glob.glob(os.path.join(tmp_dir, f"{logname}_*"))
    ]
    original_size = get_FileSize(filepath, "mb")
    compress_ratio = round(original_size / compressed_size, 2)

    time_end = time.time()
    total_time_taken = time_end - time_start

    firstline = True
    if os.path.isfile(report_file):
        firstline = False
    with open(report_file, "a+") as fw:
        if firstline:
            fw.write(
                "timemark,logname,original_size,compressed_size,compress_ratio,time_taken,n_workers,compress_single\n"
            )
        fw.write(
            f"{timemark},{logname},{original_size},{compressed_size},{compress_ratio},{total_time_taken},{n_workers},{compress_single}\n"
        )
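A minimal invocation sketch for zip_file; the paths are placeholders and the log format matches the HDFS examples earlier on this page:

zip_file(filepath='../logs/HDFS/HDFS_1k.log',  # placeholder input log
         outdir='./zip_out/',                  # placeholder output directory
         log_format='<Date> <Time> <Pid> <Level> <Component>: <Content>',
         n_workers=2)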