import json
import linecache
import os
import re
# NOTE: 'ju' (this project's jupyter/log helper module) and 'gj' (its get_json
# helper) are assumed to be imported elsewhere in this module.


def _gen_regex_for_request_logs(filepath="request.log"):
    """
    Return a tuple of (column name list, regex pattern string) for request.log
    :param filepath: A file path or a *simple* glob pattern used to select files.
    :return: (col_list, pattern_str)
    """
    if os.path.isfile(filepath) is False:
        files = ju._globr(filepath)
        if bool(files) is False:
            return ([], "")
        filepath = files[0]
    # The first line can be junk (e.g. "** TRUNCATED ** linux x64"), so check the second line
    checking_line = linecache.getline(filepath, 2)
    # @see: samples/bash/log_search.sh:f_request2csv()
    # Known request.log layouts, tried in order; the last one doubles as the fallback default
    candidates = [
        (["clientHost", "l", "user", "date", "requestURL", "statusCode", "headerContentLength",
          "bytesSent", "elapsedTime", "headerUserAgent", "thread"],
         r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)" \[([^\]]+)\]'),
        (["clientHost", "l", "user", "date", "requestURL", "statusCode",
          "bytesSent", "elapsedTime", "headerUserAgent", "thread"],
         r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)" \[([^\]]+)\]'),
        (["clientHost", "l", "user", "date", "requestURL", "statusCode",
          "bytesSent", "elapsedTime", "headerUserAgent"],
         r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)'),
        (["clientHost", "l", "user", "date", "requestURL", "statusCode",
          "bytesSent", "elapsedTime"],
         r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([0-9]+)'),
        (["clientHost", "l", "user", "date", "requestURL", "statusCode",
          "bytesSent", "elapsedTime", "misc"],
         r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)'),
    ]
    for (columns, pattern_str) in candidates:
        if re.search(pattern_str, checking_line):
            return (columns, pattern_str)
    # No candidate matched: fall back to the last one (the loop variables persist)
    ju._info("Can not determine the log format for %s . Using the default one." % str(filepath))
    return (columns, pattern_str)
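
# A minimal, self-contained sketch (illustrative, not part of the original module)
# showing how a (columns, pattern) pair from _gen_regex_for_request_logs() is
# consumed: re.search captures the groups, which are zipped with the column names.
# The sample line is made up; the pattern is the 10-column variant from above.
def _demo_parse_request_line():
    sample = '172.0.0.1 - admin [14/Feb/2021:02:36:03 +0000] "GET /repo HTTP/1.1" 200 1234 12 "curl/7.64.1" [qtp1-123]'
    columns = ["clientHost", "l", "user", "date", "requestURL", "statusCode",
               "bytesSent", "elapsedTime", "headerUserAgent", "thread"]
    pattern_str = r'^([^ ]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) ([^ ]+) "([^"]+)" \[([^\]]+)\]'
    m = re.search(pattern_str, sample)
    # -> {'clientHost': '172.0.0.1', 'user': 'admin', ..., 'thread': 'qtp1-123'}
    return dict(zip(columns, m.groups())) if m else {}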
def _gen_regex_for_app_logs(filepath=""):
    """
    Return a tuple of (column name list, regex pattern string) for nexus.log, clm-server.log, server.log
    :param filepath: A file path, a file name, or a *simple* glob pattern used to select files.
    :return: (col_list, pattern_str)

    Columns and regex are decided from a line such as:
        2020-01-03 00:00:38,357-0600 WARN  [qtp1359575796-407871] anonymous org.sonatype.nexus.proxy.maven.maven2.M2GroupRepository - IOException during parse of metadata UID="oracle:/junit/junit-dep/maven-metadata.xml", will be skipped from aggregation!
    NOTE: TODO: gz files such as request-2021-03-02.log.gz won't be recognised.
    """
    # If filepath is not empty but does not exist, assume it is a glob pattern
    if bool(filepath) and os.path.isfile(filepath) is False:
        files = ju._globr(filepath)
        if bool(files) is False:
            return ([], "")
        filepath = files[0]
    # Defaults, also used when the format can't be identified
    columns = ['date_time', 'loglevel', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +(.+)'
    checking_line = None
    for i in range(1, 100):  # 10 was not enough
        checking_line = linecache.getline(filepath, i)
        if re.search(r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d)', checking_line):
            break
    if bool(checking_line) is False:
        ju._info("Could not determine columns and pattern_str. Using default.")
        return (columns, pattern_str)
    ju._debug(checking_line)
    # 7-column layout (clustered nodes include a 'node' field)
    columns = ['date_time', 'loglevel', 'thread', 'node', 'user', 'class', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    # 6-column layout (no 'node' field)
    columns = ['date_time', 'loglevel', 'thread', 'user', 'class', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]+) - (.*)'
    if re.search(pattern_str, checking_line):
        return (columns, pattern_str)
    return (columns, pattern_str)
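
# A minimal sketch (illustrative, not part of the original module) applying the
# 6-column pattern above to the sample nexus.log line from the docstring.
def _demo_parse_app_log_line():
    sample = ('2020-01-03 00:00:38,357-0600 WARN  [qtp1359575796-407871] anonymous '
              'org.sonatype.nexus.proxy.maven.maven2.M2GroupRepository - IOException during parse of metadata')
    columns = ['date_time', 'loglevel', 'thread', 'user', 'class', 'message']
    pattern_str = r'^(\d\d\d\d-\d\d-\d\d.\d\d:\d\d:\d\d[.,0-9]*)[^ ]* +([^ ]+) +\[([^]]+)\] ([^ ]*) ([^ ]+) - (.*)'
    m = re.search(pattern_str, sample)
    # -> {'date_time': '2020-01-03 00:00:38,357', 'loglevel': 'WARN', ...}
    return dict(zip(columns, m.groups())) if m else {}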
def _save_json(file_regex, save_path="", search_props=None, key_name=None, rtn_attrs=None, find_all=False):
    """
    Search JSON file(s) matching file_regex, extract matching object(s), and optionally save to save_path.
    Returns the extracted object when save_path is empty; returns False when no file or no JSON is found.
    """
    file_paths = ju._globr(file_regex, useRegex=True)
    if bool(file_paths) is False:
        ju._info("No file found by using regex:%s" % file_regex)
        return False
    js_obj = gj.get_json(file_paths[0], search_props=search_props, key_name=key_name,
                         rtn_attrs=rtn_attrs, find_all=find_all)
    if bool(js_obj) is False:
        ju._info("No JSON returned by searching with %s and %s" % (str(search_props), file_regex))
        return False
    if bool(save_path) is False:
        return js_obj
    with open(save_path, 'w') as f:
        f.write(json.dumps(js_obj))
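
# Usage sketch (illustrative, not part of the original module): with an empty
# save_path, _save_json() returns the extracted object instead of writing a file,
# so it also works for in-memory inspection. Requires a sysinfo.json reachable
# via ju._globr for the call to succeed.
def _demo_save_json():
    filestores = _save_json(r"sysinfo\.json", save_path="", search_props="system-filestores")
    if filestores is not False:
        return filestores  # the extracted JSON object (no file written)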
def etl(path="", dist="./_filtered", max_file_size=(1024 * 1024 * 100)):
    """
    Extract data, transform and load (into DB)
    :param path: A zip file path or an already-extracted directory
    :param dist: Directory path to save the extracted data (default ./_filtered)
    :param max_file_size: Files larger than this size will be skipped (default 100MB)
    :return: void
    """
    if bool(path) is False:
        maybe_zips = ju._globr("*support*.zip", depth=1)
        if len(maybe_zips) > 0:
            path = maybe_zips[-1]
            ju._info("'path' is not specified and found zip file: %s . Using this one..." % path)
    cur_dir = os.getcwd()  # chdir back to the original path later
    dist = os.path.realpath(dist)
    tmpObj = None
    extracted_dir = None
    if os.path.isfile(path) and path.endswith(".zip"):
        tmpObj = ju._extract_zip(path)
        extracted_dir = tmpObj.name
        os.chdir(extracted_dir)
    elif os.path.isdir(path):
        os.chdir(path)
    try:
        ### Extract ############################################################
        # Somehow Jupyter started as a service uses 'sh', so forcing 'bash'
        ju._system(ju._SH_EXECUTABLE +
                   " -c '[ ! -s /tmp/log_search.sh ] && curl -s --compressed https://raw.githubusercontent.com/hajimeo/samples/master/bash/log_search.sh -o /tmp/log_search.sh; [ ! -d \"%s\" ] && mkdir \"%s\"'" % (dist, dist))
        ju._system(ju._SH_EXECUTABLE +
                   " -c '%s[ -d \"%s\" ] && . /tmp/log_search.sh && f_request2csv \"\" \"%s\" 2>/dev/null && f_audit2json \"\" \"%s\"'" % (
                       "cd %s;" % extracted_dir if extracted_dir else "", dist, dist, dist))
        # system-filestores from sysinfo.json
        _save_json(r"sysinfo\.json", "%s/system-filestores.json" % dist, "system-filestores")
        # Extracting specific records from DB export.json files
        _save_json(r"config/export\.json", "%s/http_client.json" % dist,
                   "records,@class=http_client", "@class", "connection,proxy")
        _save_json(r"config/export\.json", "%s/db_repo.json" % dist,
                   "records,@class=repository", "@class",
                   "recipe_name,repository_name,online,attributes", True)
        saml_config = _save_json(r"config/export\.json", "", "records,@class:saml", "@class",
                                 "entityId,idpMetadata,mapping,keyStoreBytes,keyStorePassword", True)
        if bool(saml_config):
            db_saml_idp_metadata = ""
            from lxml import etree as ET
            if 'idpMetadata' in saml_config:
                t = ET.fromstring(saml_config['idpMetadata'].encode('utf-8'))
                db_saml_idp_metadata += ET.tostring(t, pretty_print=True, encoding='unicode') + "\n"
            if 'mapping' in saml_config:
                db_saml_idp_metadata += saml_config['mapping']
            if len(db_saml_idp_metadata) > 0:
                with open("%s/db_saml_idp_metadata.xml" % dist, 'w') as f:
                    f.write(db_saml_idp_metadata)
        _save_json(r"security/export\.json", "%s/db_saml_user.json" % dist,
                   "records,@class=saml_user", "@class", "id,status,roles", True)
        # TODO: add more

        ### Transform & Load ###################################################
        # db_xxxxx.json
        _ = ju.load_jsons(src=dist, include_ptn="db_*.json", flatten=True)
        # If an audit.json file exists
        _ = ju.json2df(dist + "/audit.json", tablename="t_audit_logs", flatten=True)
        # xxxxx.csv
        _ = ju.load_csvs(src="./_filtered/", include_ptn="*.csv")
        # If request.*csv* exists, use that (because it's faster); if not, use logs2table, which is slower.
        if ju.exists("t_request") is False:
            (col_names, line_matching) = _gen_regex_for_request_logs('request.log')
            request_logs = ju.logs2table('request.log', tablename="t_request", col_names=col_names,
                                         line_beginning="^.", line_matching=line_matching,
                                         max_file_size=max_file_size)
        # Load application log file(s) into the database.
        (col_names, line_matching) = _gen_regex_for_app_logs('nexus.log')
        nxrm_logs = ju.logs2table('nexus.log', tablename="t_nxrm_logs", col_names=col_names,
                                  line_matching=line_matching, max_file_size=max_file_size)
        (col_names, line_matching) = _gen_regex_for_app_logs('clm-server.log')
        clm_logs = ju.logs2table('clm-server.log', tablename="t_iq_logs", col_names=col_names,
                                 line_matching=line_matching, max_file_size=max_file_size)

        # Hazelcast health monitor
        if ju.exists("t_log_hazelcast_monitor") is False and bool(nxrm_logs):
            df_hm = ju.q("""select date_time, message from t_nxrm_logs
    where class = 'com.hazelcast.internal.diagnostics.HealthMonitor'""")
            if len(df_hm) > 0:
                # Use one sample message to decide the columns and the regex
                (col_names, line_matching) = _gen_regex_for_hazel_health(df_hm['message'][1])
                msg_ext = df_hm['message'].str.extract(line_matching)
                msg_ext.columns = col_names
                # Delete unnecessary column(s), left join the extracted dataframe, then load into SQLite
                df_hm.drop(columns=['message']).join(msg_ext).to_sql(
                    name="t_log_hazelcast_monitor", con=ju.connect(), chunksize=1000,
                    if_exists='replace', schema=ju._DB_SCHEMA)
                health_monitor = True
                ju._autocomp_inject(tablename='t_log_hazelcast_monitor')

        # Elastic JVM monitor
        if ju.exists("t_log_elastic_jvm_monitor") is False and bool(nxrm_logs):
            df_em = ju.q("""select date_time, message from t_nxrm_logs
    where class = 'org.elasticsearch.monitor.jvm'""")
            if len(df_em) > 0:
                # Use one sample message to decide the columns and the regex
                (col_names, line_matching) = _gen_regex_for_elastic_jvm(df_em['message'][1])
                msg_ext = df_em['message'].str.extract(line_matching)
                msg_ext.columns = col_names
                # Delete unnecessary column(s), left join the extracted dataframe, then load into SQLite
                df_em.drop(columns=['message']).join(msg_ext).to_sql(
                    name="t_log_elastic_jvm_monitor", con=ju.connect(), chunksize=1000,
                    if_exists='replace', schema=ju._DB_SCHEMA)
                health_monitor = True
                ju._autocomp_inject(tablename='t_log_elastic_jvm_monitor')

        # Thread dump
        threads = ju.logs2table(filename="threads.txt", tablename="t_threads", conn=ju.connect(),
                                col_names=['thread_name', 'id', 'state', 'stacktrace'],
                                line_beginning="^[^ ]",
                                line_matching=r'^"?([^"]+)"? id=([^ ]+) state=(\w+)(.*)',
                                size_regex=None, time_regex=None)
    finally:
        os.chdir(cur_dir)
        if tmpObj:
            tmpObj.cleanup()
    ju.display(ju.desc(), name="Available_Tables")
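
# Illustrative entry point (an assumption, not part of the original module):
# run the ETL against a support zip found in the current directory, then peek
# at one of the generated tables. t_request exists only if request logs were
# found; the zip filename is just an example of what etl() auto-detects.
if __name__ == '__main__':
    etl()  # or etl(path="./support-20210302.zip", dist="./_filtered")
    if ju.exists("t_request"):
        print(ju.q("select date, requestURL, statusCode from t_request limit 5"))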