def parse_diag(args, transform=_group_uniq):
    """parses the following files to generate a report object:
    - all system.log (GC pause times)
    - all output.log (configuration at runtime from last reboot)
    - all cfstats files (table stats)
    - node_info.json (drive configuration)
    - all blockdev_report (read ahead)
    """
    # find output logs
    node_configs = node_env.initialize_node_configs(args.diag_dir)
    output_logs = diag.find_logs(args.diag_dir, args.output_log_prefix)
    # find system.logs
    system_logs = diag.find_logs(args.diag_dir, args.system_log_prefix)
    warnings = node_env.find_config_in_logs(node_configs, output_logs, system_logs)
    warn_missing(node_configs, output_logs, warnings, "missing output logs")
    warn_missing(node_configs, system_logs, warnings, "missing system logs")
    # find block dev
    node_info_list = diag.find_logs(args.diag_dir, args.node_info_prefix)
    if node_info_list:
        # only set block_dev_results if we find a single node_info.json
        with diag.FileWithProgress(node_info_list[0]) as node_info_json:
            # read all the block dev reports
            if node_info_json.error:
                warnings.append(
                    "unable to read node_info.json with error: '%s'"
                    % node_info_json.error
                )
            block_dev_reports = diag.find_logs(args.diag_dir, args.block_dev_prefix)
            warn_missing(node_configs, block_dev_reports, warnings, "missing blockdev_reports")
            cass_drive_ra = read_ahead.get_cass_drive_read_ahead(
                node_info_json, block_dev_reports
            )
            read_ahead.add_block_dev_to_config(cass_drive_ra, node_configs)
    else:
        warnings.append("unable to read '%s'" % args.node_info_prefix)
    transformed_configs = transform(node_configs)
    for warn in node_env.add_gc_to_configs(transformed_configs, system_logs):
        warnings.append(warn)
    # add cfstats if present
    cfstats_files = diag.find_logs(args.diag_dir, args.cfstats_prefix)
    warn_missing(node_configs, cfstats_files, warnings, "missing cfstats")
    for warn in table_stats.add_stats_to_config(transformed_configs, cfstats_files):
        warnings.append(warn)
    return {
        "diag_dir": args.diag_dir,
        "warnings": warnings,
        "original_configs": node_configs,
        "configs": transformed_configs,
        "system_logs": system_logs,
    }
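# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal illustration of how parse_diag might be invoked. The attribute
# names on the namespace simply mirror the ones parse_diag reads above; the
# prefix values and the "./diag" path are placeholders/assumptions, not a
# confirmed CLI contract, and running it requires the surrounding module's
# imports (diag, node_env, read_ahead, table_stats, etc.) and an extracted
# diag tarball on disk.
if __name__ == "__main__":
    from argparse import Namespace

    example_args = Namespace(
        diag_dir="./diag",                   # placeholder path to an extracted diag tarball
        output_log_prefix="output.log",      # assumed prefix
        system_log_prefix="system.log",      # assumed prefix
        node_info_prefix="node_info.json",   # assumed prefix
        block_dev_prefix="blockdev_report",  # assumed prefix
        cfstats_prefix="cfstats",            # assumed prefix
    )
    report = parse_diag(example_args)
    for warning in report["warnings"]:
        print(warning)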
def analyze(self):
    """analyze log files"""
    error_if_file_not_found = False
    if self.files:
        error_if_file_not_found = True
        target = self.files
    elif self.diag_dir:
        target = diag.find_logs(self.diag_dir, "output.log")
    else:
        self.analyzed = True
        return
    for file in target:
        with diag.FileWithProgress(file) as log:
            if not log.file_desc and error_if_file_not_found:
                raise FileNotFoundError(log.error)
            for event in parser.read_output_log(log):
                if event["event_type"] == "classpath":
                    thisjars = OrderedDefaultDict(int)
                    for jar in event["classpath"].split(":"):
                        j = jar.split("/")[-1]
                        if j.endswith("jar"):
                            # to eliminate dupes within the same file, because java is crazy town
                            if j not in thisjars:
                                thisjars[j] += 1
                                self.jars[j] += 1
        self.files_analyzed += 1
    self.analyzed = True
def analyze(self): """ analyze slow queries """ parser = SlowQueryParser() target = find_logs(self.diag_dir, 'debug.log') if self.files: target = self.files for file in target: log = open(file, 'r') for query in parser.parse(log): if self.start_time and query['date'] < self.start_time: continue if self.end_time and query['date'] > self.end_time: continue if not self.start: self.start = query['date'] self.end = query['date'] if query['date'] > self.end: self.end = query['date'] if query['date'] < self.start: self.start = query['date'] if 'numslow' in query: # pylint: disable=unused-variable for x in range(query['numslow']): self.querytimes[query['date']].append(query['timeslow']) else: self.querytimes[query['date']].append(query['timeslow']) self.queries.append((query['query'], int(query['timeslow']))) if query['cross'] is not None: self.cross += 1 self.analyzed = True
def analyze(self): """analyze slow queries""" parser = SlowQueryParser() target = find_logs(self.diag_dir, "debug.log") if self.files: target = self.files for f in target: with FileWithProgress(f) as log: for query in parser.parse(log): if self.start_time and query["date"] < self.start_time: continue if self.end_time and query["date"] > self.end_time: continue if not self.start: self.start = query["date"] self.end = query["date"] if query["date"] > self.end: self.end = query["date"] if query["date"] < self.start: self.start = query["date"] if "avg" in query: for x in range(query["numslow"]): self.querytimes[query["date"]].append( int(query["time"])) else: self.querytimes[query["date"]].append( int(query["time"])) self.queries.append((query["query"], int(query["time"]))) if "type" in query and query["type"] == "timed_out": self.timedout += 1 * int(query["numslow"]) if query["cross"] is not None: self.cross += 1 self.analyzed = True
def analyze(self): """parses logs for results""" print("bucketgrep version %s" % VERSION) print("search: '%s'" % self.supplied_regex) target = None if self.files: target = self.files elif self.diag_dir: if self.diag_dir == ".": directory_path = os.getcwd() print("from directory '%s':" % directory_path) else: print("from directory '%s':" % self.diag_dir) target = diag.find_logs(self.diag_dir) else: raise Exception("no diag dir and no files specified") for file in target: with diag.FileWithProgress(file) as log: node_name = extract_node_name(file, ignore_missing_nodes=True) self.node_matches[node_name] = OrderedDefaultDict(list) for line in log: # as long as it's a valid log line we want the date, # even if we don't care about the rest of the line so we can set # the last date for any straregex lines that match current_dt = self.valid_log_regex.match(line) if current_dt: dt = date()(current_dt.group("date")) # if the log line is valite we want to set the last_time self.last_time = dt # we now can validate if our search term matches the log line d = self.timeregex.match(line) if d: # normal case, well-formatted log line self.__setdates(dt) if self.start_time and dt < self.start_time: continue if self.end_time and dt > self.end_time: continue self.matches[dt].append(line) self.node_matches[node_name][dt].append(line) self.count += 1 else: m = self.strayregex.match(line) # check for a match in an unformatted line, like a traceback if m: if self.last_time is None: # match, but no previous timestamp to associate with self.unknown += 1 continue self.matches[self.last_time].append(line) self.node_matches[node_name][ self.last_time].append(line) self.count += 1 self.analyzed = True
def parse(args): """read diag tarball""" res = parse_diag(args, lambda n: [calculate(n)]) # use debug logs for statuslogger output on 5.1.17+, 6.0.10+, 6.7.5+ and 6.8+ debug_logs = diag.find_logs(args.diag_dir, args.debug_log_prefix) parsed = OrderedDict() parsed["diag_dir"] = args.diag_dir parsed["warnings"] = res.get("warnings") parsed["configs"] = res.get("original_configs") parsed["summary"] = res.get("configs")[0] parsed["rec_logs"] = res.get("system_logs") + debug_logs return parsed
def parse(args): """read diag tarball""" res = parse_diag(args, lambda n: [calculate(n)]) #use debug logs for statuslogger output on 5.1.17+, 6.0.10+, 6.7.5+ and 6.8+ debug_logs = diag.find_logs(args.diag_dir, args.debug_log_prefix) return { "diag_dir": args.diag_dir, "warnings": res.get("warnings"), "configs": res.get("original_configs"), "summary": res.get("configs")[0], "rec_logs": res.get("system_logs") + debug_logs, }
def analyze(self): """parses logs for results""" target = None if self.files: target = self.files elif self.diag_dir: target = diag.find_logs(self.diag_dir) else: raise Exception("no diag dir and no files specified") for file in target: with diag.FileWithProgress(file) as log: for line in log: # as long as it's a valid log line we want the date, # even if we don't care about the rest of the line so we can set # the last date for any straregex lines that match current_dt = self.valid_log_regex.match(line) if current_dt: dt = date()(current_dt.group("date")) # if the log line is valite we want to set the last_time self.last_time = dt # we now can validate if our search term matches the log line d = self.timeregex.match(line) if d: # normal case, well-formatted log line self.__setdates(dt) if self.start_time and dt < self.start_time: continue if self.end_time and dt > self.end_time: continue self.matches[dt].append(line) self.count += 1 else: m = self.strayregex.match(line) # check for a match in an unformatted line, like a traceback if m: if self.last_time is None: # match, but no previous timestamp to associate with self.unknown += 1 continue self.matches[self.last_time].append(line) self.count += 1 self.analyzed = True
def analyze(self): """ analyze files """ target = None if self.files: target = self.files elif self.diag_dir: target = find_logs(self.diag_dir) else: raise Exception("no diag dir and no files specified") for file in target: node = node_name(file) log = open(file, 'r') for event in parser.read_log(log, gc.capture_line): if event['event_type'] == 'pause': if self.start_time and event['date'] < self.start_time: continue if self.end_time and event['date'] > self.end_time: continue self.__setdates(event['date'], node) self.pauses[node][event['date']].append(event['duration']) self.gc_types[event['gc_type']] += 1 self.analyzed = True
def analyze(self): """ analyze log files """ if self.files: target = self.files elif self.diag_dir: target = find_logs(self.diag_dir, 'output.log') else: self.analyzed = True return # pylint: disable=too-many-nested-blocks for file in target: log = open(file, 'r') for event in parser.read_output_log(log): if event['event_type'] == 'classpath': thisjars = defaultdict(int) for jar in event['classpath'].split(':'): j = jar.split('/')[-1] if j.endswith("jar"): if j not in thisjars: # to eliminate dupes within the same file, because java is crazy town thisjars[j] += 1 self.jars[j] += 1 self.files_analyzed += 1 self.analyzed = True
def analyze(self): """analyze files""" target = None if self.files: target = self.files elif self.diag_dir: target = diag.find_logs(self.diag_dir) else: raise Exception("no diag dir and no files specified") for file in target: node = extract_node_name(file) with diag.FileWithProgress(file) as log: for event in parser.read_log(log, gc.capture_line): if event["event_type"] == "pause": if self.start_time and event["date"] < self.start_time: continue if self.end_time and event["date"] > self.end_time: continue self.__setdates(event["date"], node) self.pauses[node][event["date"]].append( event["duration"]) self.gc_types[event["gc_type"]] += 1 self.analyzed = True
def parse(args): """read diag tarball""" #find output logs node_configs = node_env.initialize_node_configs(args.diag_dir) output_logs = diag.find_logs(args.diag_dir, args.output_log_prefix) #find system.logs system_logs = diag.find_logs(args.diag_dir, args.system_log_prefix) warnings = node_env.find_config_in_logs(node_configs, output_logs, system_logs) warn_missing(node_configs, output_logs, warnings, "missing output logs") warn_missing(node_configs, system_logs, warnings, "missing system logs") #find block dev node_info_list = diag.find_logs(args.diag_dir, args.node_info_prefix) if node_info_list: #only set block_dev_results if we find a single node_info.json with diag.FileWithProgress(node_info_list[0]) as node_info_json: #read all the block dev reports if node_info_json.error: warnings.append(node_info_json.error) block_dev_reports = diag.find_logs(args.diag_dir, args.block_dev_prefix) warn_missing(node_configs, block_dev_reports, warnings, "missing blockdev_reports") cass_drive_ra = read_ahead.get_cass_drive_read_ahead( node_info_json, block_dev_reports) read_ahead.add_block_dev_to_config(cass_drive_ra, node_configs) else: warnings.append("unable to read '%s'" % args.node_info_prefix) summary = [calculate(node_configs)] for warn in node_env.add_gc_to_configs(summary, system_logs): warnings.append(warn) #add cfstats if present cfstats_files = diag.find_logs(args.diag_dir, args.cfstats_prefix) warn_missing(node_configs, cfstats_files, warnings, "missing cfstats") for warn in table_stats.add_stats_to_config(summary, cfstats_files): warnings.append(warn) #use debug logs for statuslogger output on 5.1.17+, 6.0.10+, 6.7.5+ and 6.8+ debug_logs = diag.find_logs(args.diag_dir, args.debug_log_prefix) return { "diag_dir": args.diag_dir, "warnings": warnings, "configs": node_configs, "summary": summary[0], "rec_logs": system_logs + debug_logs, }
def analyze(self): """ analyze log files """ if self.analyzed: return # pylint: disable=too-many-nested-blocks event_filter = UniqEventPerNodeFilter() target = None if self.files: target = self.files elif self.diag_dir: target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix) target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix) target = target_system + target_debug else: raise Exception("no diag dir and no files specified") for file in target: nodename = node_name(file) event_filter.set_node(nodename) node = self.nodes[nodename] if env.DEBUG: print("parsing", file) log = open(file, 'r') statuslogger_fixer = UnknownStatusLoggerWriter() for event in parser.read_system_log(log): statuslogger_fixer.check(event) if self.start and event['date'] < self.start: continue if self.end and event['date'] > self.end: continue self.__setdates(node, statuslogger_fixer.last_event_date) node.lines += 1 if event_filter.is_duplicate(event): node.skipped_lines += 1 continue if env.DEBUG: if 'rule_type' in event: self.rule_types[event['rule_type']] += 1 elif event['event_type'] == 'unknown': self.rule_types['unknown'] += 1 else: self.rule_types['no type'] += 1 if event['event_type'] == 'server_version': if event.get('version'): node.version = event['version'] elif event.get('cassandra_version'): node.cassandra_version = event['cassandra_version'] #skipping solr, spark etc as it maybe too much noise for statuslogger elif event['event_type'] == 'memtable_status': tname = '.'.join([event['keyspace'], event['table']]) if event['ops'] > node.tables[tname].ops: node.tables[tname].ops = event['ops'] try: if event['data'] > node.tables[tname].data: node.tables[tname].data = event['data'] except Exception as e: print(event) raise e elif event['event_type'] == 'pause': node.pauses.append(event['duration']) elif event['event_type'] == 'threadpool_header': node.dumps_analyzed += 1 self.dumps_analyzed += 1 elif event['event_type'] == 'threadpool_status': if re.match(r"TPC/\d+$", event['pool_name']): if not node.version: node.version = "6.x" if 'delayed' in event and event['delayed']: print(event) val = event['delayed'] node.stages['local backpressure'][ event['pool_name']].append(val) else: for pool in [ 'active', 'pending', 'blocked', 'all_time_blocked' ]: if pool in event and event[pool]: if not self.wanted_stages or event[ 'pool_name'].startswith( self.wanted_stages): node.stages[pool][ event['pool_name']].append(event[pool]) self.analyzed = True if env.DEBUG: print(self.rule_types.items())
def analyze(self): """analyze log files""" if self.analyzed: return event_filter = UniqEventPerNodeFilter() target = None if self.files: target = self.files elif self.diag_dir: target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix) target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix) target = target_system + target_debug else: raise Exception("no diag dir and no files specified") for f in target: nodename = extract_node_name(f, ignore_missing_nodes=True) event_filter.set_node(nodename) node = self.nodes[nodename] if env.DEBUG: print("parsing", f) with FileWithProgress(f) as log: statuslogger_fixer = UnknownStatusLoggerWriter() for event in parser.read_system_log(log): statuslogger_fixer.check(event) if self.start and event["date"] < self.start: continue if self.end and event["date"] > self.end: continue self.__setdates(node, statuslogger_fixer.last_event_date) node.lines += 1 if event_filter.is_duplicate(event): node.skipped_lines += 1 continue if env.DEBUG: if "rule_type" in event: self.rule_types[event["rule_type"]] += 1 elif event["event_type"] == "unknown": self.rule_types["unknown"] += 1 else: self.rule_types["no type"] += 1 if event["event_type"] == "server_version": if event.get("version"): node.version = event["version"] if node.version.startswith("6"): node.cassandra_version = "DSE Private Fork" elif event.get("cassandra_version"): node.cassandra_version = event["cassandra_version"] # skipping solr, spark etc as it maybe too much noise for statuslogger elif event["event_type"] == "memtable_status": tname = ".".join([event["keyspace"], event["table"]]) if event["ops"] > node.tables[tname].ops: node.tables[tname].ops = event["ops"] try: if event["data"] > node.tables[tname].data: node.tables[tname].data = event["data"] except Exception as e: print(event) raise e elif event["event_type"] == "pause": node.pauses.append(event["duration"]) elif event["event_type"] == "threadpool_header": node.dumps_analyzed += 1 self.dumps_analyzed += 1 elif event["event_type"] == "threadpool_status": if re.match(r"TPC/\d+$", event["pool_name"]): if not node.version: node.version = "6.x" if "delayed" in event and event["delayed"]: val = event["delayed"] node.stages["local backpressure"][ event["pool_name"]].append(val) else: for pool in [ "active", "pending", "blocked", "all_time_blocked", ]: if pool in event and event[pool]: if not self.wanted_stages or event[ "pool_name"].startswith( self.wanted_stages): node.stages[pool][ event["pool_name"]].append( event[pool]) self.analyzed = True if env.DEBUG: print(self.rule_types.items())