Example #1

import linecache
import os
import re

# NOTE: 'ju' below refers to a project-specific helper module (glob, logging,
# and table-loading utilities such as _globr, _info, _debug); it is assumed to
# be importable in this environment.
def _gen_regex_for_request_logs(filepath="request.log"):
    """
    Return a list which contains column names, and regex pattern for request.log
    :param filepath: A file path or *simple* regex used in glob to select files.
    :return: (col_list, pattern_str)
    """
    if not os.path.isfile(filepath):
        files = ju._globr(filepath)
        if not files:
            return ([], "")
        filepath = files[0]
    checking_line = linecache.getline(
        filepath, 2)  # the first line can be junk: "** TRUNCATED ** linux x64"
    # @see: samples/bash/log_search.sh:f_request2csv()
    columns = [
        "clientHost", "l", "user", "date", "requestURL", "statusCode",
        "headerContentLength", "bytesSent", "elapsedTime", "headerUserAgent",
        "thread"
    ]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)" \[([^\]]+)\]'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    columns = [
        "clientHost", "l", "user", "date", "requestURL", "statusCode",
        "bytesSent", "elapsedTime", "headerUserAgent", "thread"
    ]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)" \[([^\]]+)\]'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    columns = [
        "clientHost", "l", "user", "date", "requestURL", "statusCode",
        "bytesSent", "elapsedTime", "headerUserAgent"
    ]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    columns = [
        "clientHost", "l", "user", "date", "requestURL", "statusCode",
        "bytesSent", "elapsedTime"
    ]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([0-9]+)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)

    columns = [
        "clientHost", "l", "user", "date", "requestURL", "statusCode",
        "bytesSent", "elapsedTime", "misc"
    ]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    else:
        ju._info(
            "Cannot determine the log format for %s . Using the default one." %
            (str(filepath)))
        return (columns, pattern_str)
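
# Usage sketch (illustrative, not part of the original source): feed the
# detected (columns, pattern) pair back over the same file to build dict rows.
# Assumes a readable 'request.log' (or a glob match) under the current tree.
def _demo_parse_request_log(filepath="request.log"):
    col_names, pattern_str = _gen_regex_for_request_logs(filepath)
    rows = []
    if not col_names:
        return rows
    files = [filepath] if os.path.isfile(filepath) else ju._globr(filepath)
    with open(files[0]) as f:
        for line in f:
            m = re.search(pattern_str, line)
            if m:
                rows.append(dict(zip(col_names, m.groups())))
    return rows
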
def _gen_regex_for_app_logs(filepath=""):
    """
    Return a list which contains column names, and regex pattern for nexus.log, clm-server.log, server.log
    :param filepath: A file path or a file name or *simple* pattern used in glob to select files.
    :param checking_line: Based on this line, columns and regex will be decided
    :return: (col_list, pattern_str)
    2020-01-03 00:00:38,357-0600 WARN  [qtp1359575796-407871] anonymous org.sonatype.nexus.proxy.maven.maven2.M2GroupRepository - IOException during parse of metadata UID="oracle:/junit/junit-dep/maven-metadata.xml", will be skipped from aggregation!
    """
    # If filepath is not empty but does not exist, treat it as a glob pattern
    if filepath and not os.path.isfile(filepath):
        files = ju._globr(filepath)
        if not files:
            return ([], "")
        filepath = files[0]

    # Defaults, also used when the format cannot be identified
    columns = ['date_time', 'loglevel', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[^ ]*) +([^ ]+) +(.+)'

    checking_line = None
    for i in range(1, 10):
        checking_line = linecache.getline(filepath, i)
        if re.search(r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[^ ]*)',
                     checking_line):
            break
    if not checking_line:
        ju._info("Could not determine columns and pattern_str. Using default.")
        return (columns, pattern_str)
    ju._debug(checking_line)

    columns = [
        'date_time', 'loglevel', 'thread', 'node', 'user', 'class', 'message'
    ]
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[^ ]*) +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    columns = ['date_time', 'loglevel', 'thread', 'user', 'class', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[^ ]*) +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    return (columns, pattern_str)
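
# Quick self-contained check (illustrative, not part of the original source):
# the 6-column pattern above matches the sample line quoted in the docstring.
def _demo_check_app_log_pattern():
    sample = ('2020-01-03 00:00:38,357-0600 WARN  [qtp1359575796-407871] anonymous '
              'org.sonatype.nexus.proxy.maven.maven2.M2GroupRepository - IOException '
              'during parse of metadata UID="oracle:/junit/junit-dep/'
              'maven-metadata.xml", will be skipped from aggregation!')
    cols = ['date_time', 'loglevel', 'thread', 'user', 'class', 'message']
    ptn = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[^ ]*) +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]+) - (.*)'
    m = re.search(ptn, sample)
    return dict(zip(cols, m.groups())) if m else None
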
Example #3

import json
import linecache
import os
import re

# NOTE: as in Example #1, 'ju' is a project-specific helper module; 'gj' is a
# JSON search helper providing get_json(). Both are assumed to be importable.
def _gen_regex_for_app_logs(filepath=""):
    """
    Return a list which contains column names, and regex pattern for nexus.log, clm-server.log, server.log
    :param filepath: A file path or a file name or *simple* pattern used in glob to select files.
    :param checking_line: Based on this line, columns and regex will be decided
    :return: (col_list, pattern_str)
    NOTE: TODO: gz file such as request-2021-03-02.log.gz won't be recognised.
    """
    # If filepath does not point to an existing file, treat it as a glob pattern
    if not os.path.isfile(filepath):
        files = ju._globr(filepath)
        if not files:
            return ([], "")
        filepath = files[0]

    # Defaults, also used when the format cannot be identified
    columns = ['date_time', 'loglevel', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +(.+)'

    checking_line = None
    for i in range(1, 100):  # 10 was not enough
        checking_line = linecache.getline(filepath, i)
        if re.search(r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d)', checking_line):
            break
    if not checking_line:
        ju._info("Could not determine columns and pattern_str. Using default.")
        return (columns, pattern_str)
    ju._debug(checking_line)

    columns = [
        'date_time', 'loglevel', 'thread', 'node', 'user', 'class', 'message'
    ]
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    columns = ['date_time', 'loglevel', 'thread', 'user', 'class', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    return (columns, pattern_str)
def _save_json(file_regex,
               save_path="",
               search_props=None,
               key_name=None,
               rtn_attrs=None,
               find_all=False):
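    """
    Search JSON file(s) found by 'file_regex' with gj.get_json and optionally save the result.
    :param file_regex: Regex passed to ju._globr to locate file(s)
    :param save_path: When not empty, the matched object is written to this path as JSON
    :return: The matched JSON object, or False when no file or no match is found
    """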
    file_paths = ju._globr(file_regex, useRegex=True)
    if not file_paths:
        ju._info("No file found with regex: %s" % file_regex)
        return False
    js_obj = gj.get_json(file_paths[0],
                         search_props=search_props,
                         key_name=key_name,
                         rtn_attrs=rtn_attrs,
                         find_all=find_all)
    if not js_obj:
        ju._info("No JSON returned by searching with %s and %s" %
                 (str(search_props), file_regex))
        return False
    if not save_path:
        return js_obj
    with open(save_path, 'w') as f:
        f.write(json.dumps(js_obj))
    return js_obj
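
# Usage sketch (illustrative, mirroring the etl() calls below): return the
# 'system-filestores' section from a sysinfo.json found under the current
# directory, without writing it to a file.
def _demo_save_json():
    return _save_json(r"sysinfo\.json", save_path="",
                      search_props="system-filestores")
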
def etl(path="", dist="./_filtered", max_file_size=(1024 * 1024 * 100)):
    """
    Extract data, transform and load (to DB)
    :param path: To specify a zip file
    :param dist: Directory path to save the extracted data (default ./_filtered)
    :param max_file_size: Larger than this size will be skipped (default 100MB)
    :return: void
    """
    if not path:
        maybe_zips = ju._globr("*support*.zip", depth=1)
        if len(maybe_zips) > 0:
            path = maybe_zips[-1]
            ju._info(
                "'path' was not specified and a zip file was found: %s . Using this one..."
                % path)

    cur_dir = os.getcwd()  # chdir to the original path later
    dist = os.path.realpath(dist)
    tmpObj = None
    extracted_dir = None
    if os.path.isfile(path) and path.endswith(".zip"):
        tmpObj = ju._extract_zip(path)
        extracted_dir = tmpObj.name
        os.chdir(extracted_dir)
    elif os.path.isdir(path):
        os.chdir(path)

    try:
        ### Extract ############################################################
        # Jupyter started as a service somehow uses 'sh', so force 'bash'
        ju._system(
            ju._SH_EXECUTABLE +
            " -c '[ ! -s /tmp/log_search.sh ] && curl -s --compressed https://raw.githubusercontent.com/hajimeo/samples/master/bash/log_search.sh -o /tmp/log_search.sh; [ ! -d \"%s\" ] && mkdir \"%s\"'"
            % (dist, dist))
        ju._system(
            ju._SH_EXECUTABLE +
            " -c '%s[ -d \"%s\" ] && . /tmp/log_search.sh && f_request2csv \"\" \"%s\" 2>/dev/null && f_audit2json \"\" \"%s\"'"
            % ("cd %s;" % extracted_dir if extracted_dir else "", dist, dist,
               dist))
        # system-filestores from sysinfo.json
        _save_json(r"sysinfo\.json", "%s/system-filestores.json" % dist,
                   "system-filestores")
        # extracting from DB export.json files
        _save_json(r"config/export\.json", "%s/http_client.json" % dist,
                   "records,@class=http_client", "@class", "connection,proxy")
        _save_json(r"config/export\.json", "%s/db_repo.json" % dist,
                   "records,@class=repository", "@class",
                   "recipe_name,repository_name,online,attributes", True)
        saml_config = _save_json(
            r"config/export\.json", "", "records,@class:saml", "@class",
            "entityId,idpMetadata,mapping,keyStoreBytes,keyStorePassword",
            True)
        if saml_config:
            db_saml_idp_metadata = ""
            from lxml import etree as ET
            if 'idpMetadata' in saml_config:
                t = ET.fromstring(saml_config['idpMetadata'].encode('utf-8'))
                db_saml_idp_metadata += ET.tostring(
                    t, pretty_print=True, encoding='unicode') + "\n"
            if 'mapping' in saml_config:
                db_saml_idp_metadata += saml_config['mapping']
            if len(db_saml_idp_metadata) > 0:
                with open("%s/db_saml_idp_metadata.xml" % dist, 'w') as f:
                    f.write(db_saml_idp_metadata)
        _save_json("security/export\.json", "%s/db_saml_user.json" % dist,
                   "records,@class=saml_user", "@class", "id,status,roles",
                   True)
        # TODO: add more

        ### Transform & Load ###################################################
        # db_xxxxx.json
        _ = ju.load_jsons(src=dist, include_ptn="db_*.json", flatten=True)
        # If audit.json file exists
        _ = ju.json2df(dist + "/audit.json",
                       tablename="t_audit_logs",
                       flatten=True)
        # xxxxx.csv
        _ = ju.load_csvs(src=dist, include_ptn="*.csv")

        # If request.*csv* exists, use it (faster); otherwise fall back to the slower logs2table.
        if not ju.exists("t_request"):
            (col_names,
             line_matching) = _gen_regex_for_request_logs('request.log')
            request_logs = ju.logs2table('request.log',
                                         tablename="t_request",
                                         col_names=col_names,
                                         line_beginning="^.",
                                         line_matching=line_matching,
                                         max_file_size=max_file_size)

        # Loading application log file(s) into database.
        (col_names, line_matching) = _gen_regex_for_app_logs('nexus.log')
        nxrm_logs = ju.logs2table('nexus.log',
                                  tablename="t_nxrm_logs",
                                  col_names=col_names,
                                  line_matching=line_matching,
                                  max_file_size=max_file_size)
        (col_names, line_matching) = _gen_regex_for_app_logs('clm-server.log')
        clm_logs = ju.logs2table('clm-server.log',
                                 tablename="t_iq_logs",
                                 col_names=col_names,
                                 line_matching=line_matching,
                                 max_file_size=max_file_size)

        # Hazelcast health monitor
        if ju.exists("t_log_hazelcast_monitor") is False and bool(nxrm_logs):
            df_hm = ju.q(
                """select date_time, message from t_nxrm_logs where class = 'com.hazelcast.internal.diagnostics.HealthMonitor'"""
            )
            if len(df_hm) > 0:
                (col_names, line_matching) = _gen_regex_for_hazel_health(
                    df_hm['message'][1])
                msg_ext = df_hm['message'].str.extract(line_matching)
                msg_ext.columns = col_names
                # Delete unnecessary column(s), then left join the extracted dataframe, then load into SQLite
                df_hm.drop(columns=['message']).join(msg_ext).to_sql(
                    name="t_log_hazelcast_monitor",
                    con=ju.connect(),
                    chunksize=1000,
                    if_exists='replace',
                    schema=ju._DB_SCHEMA)
                health_monitor = True
                ju._autocomp_inject(tablename='t_log_hazelcast_monitor')

        # Elastic JVM monitor
        if ju.exists("t_log_elastic_jvm_monitor") is False and bool(nxrm_logs):
            df_em = ju.q(
                """select date_time, message from t_nxrm_logs where class = 'org.elasticsearch.monitor.jvm'"""
            )
            if len(df_em) > 0:
                (col_names, line_matching) = _gen_regex_for_elastic_jvm(
                    df_em['message'][1])
                msg_ext = df_em['message'].str.extract(line_matching)
                msg_ext.columns = col_names
                # Delete unnecessary column(s), then left join the extracted dataframe, then load into SQLite
                df_em.drop(columns=['message']).join(msg_ext).to_sql(
                    name="t_log_elastic_jvm_monitor",
                    con=ju.connect(),
                    chunksize=1000,
                    if_exists='replace',
                    schema=ju._DB_SCHEMA)
                health_monitor = True
                ju._autocomp_inject(tablename='t_log_elastic_jvm_monitor')

        # Thread dump
        threads = ju.logs2table(
            filename="threads.txt",
            tablename="t_threads",
            conn=ju.connect(),
            col_names=['thread_name', 'id', 'state', 'stacktrace'],
            line_beginning="^[^ ]",
            line_matching=r'^"?([^"]+)"? id=([^ ]+) state=(\w+)(.*)',
            size_regex=None,
            time_regex=None)
    finally:
        os.chdir(cur_dir)
        if tmpObj:
            tmpObj.cleanup()

    ju.display(ju.desc(), name="Available_Tables")
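
# Usage sketch (illustrative, not part of the original source): run the whole
# pipeline, then peek at one of the loaded tables with the same 'ju' helpers
# used above. An empty 'zip_path' makes etl() auto-detect a '*support*.zip'.
def _demo_etl(zip_path=""):
    etl(path=zip_path)
    if ju.exists("t_request"):
        ju.display(ju.q("SELECT * FROM t_request LIMIT 10"),
                   name="t_request_sample")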