Example #1
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    if args.diag_dir == ".":
        directory_path = os.getcwd()
        print("from directory '%s':" % directory_path)
    else:
        print("from directory '%s':" % args.diag_dir)
    node_stats = OrderedDict()
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            item_ev_stats, bytes_ev_stats = calculate_eviction_stats(
                raw_events, after_time, before_time)
            node = util.extract_node_name(log, True)
            node_stats[node] = OrderedDict([
                ("evictions", (bytes_ev_stats, item_ev_stats)),
                ("start", start_log_time),
                ("end", last_log_time),
            ])
    return OrderedDict([
        ("nodes", node_stats),
        ("after_time", after_time),
        ("before_time", before_time),
    ])
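
Example #1 factors the time-window filtering and stats computation into a `calculate_eviction_stats` helper that is not shown here. A minimal sketch of what that helper might look like, assuming the `_get_stats`, `ItemFCStats`, and `BytesFCStats` helpers that appear inline in Example #3 below:

def calculate_eviction_stats(raw_events, after_time, before_time):
    """sketch only: keep filter_cache events inside the time window and split them
    into item-based and bytes-based eviction stats, mirroring Example #3"""
    filter_cache_events = [
        event for event in raw_events
        if event.get("event_category", "") == "filter_cache"
        and "date" in event
        and after_time < event["date"] < before_time
    ]
    item_ev_stats = _get_stats(filter_cache_events, ItemFCStats, "eviction_items")
    bytes_ev_stats = _get_stats(filter_cache_events, BytesFCStats, "eviction_bytes")
    return item_ev_stats, bytes_ev_stats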
Example #2
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_over_500 = 0
    delayed_counter = 0
    pending_counter = 0
    blocked_counter = 0
    event_filter = diag.UniqEventPerNodeFilter()
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    statuslogger_fixer.check(event)
                    if event.get("event_type") == "unknown":
                        continue
                    if (event.get("event_type") == "pause"
                            and event.get("event_category")
                            == "garbage_collection"):
                        if event.get("duration") > 500:
                            gc_over_500 += 1
                    _status_logger_counter(event, delayed_counter,
                                           pending_counter, blocked_counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_500, delayed_counter,
                    pending_counter, blocked_counter)
    _recs_on_configs(recommendations, parsed["configs"])
    return recommendations
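
Note that `delayed_counter`, `pending_counter`, and `blocked_counter` are plain integers, so `_status_logger_counter` cannot increment them in place for the caller; Example #11 avoids this by passing a single mutable `StatusLoggerCounter` object instead. A rough sketch of that approach, with field names guessed from the local variables above rather than taken from the real sperf code:

class StatusLoggerCounter:
    """hypothetical mutable counter; the real sperf class may differ"""
    def __init__(self):
        self.delayed = 0
        self.pending = 0
        self.blocked = 0


def _status_logger_counter(event, counter):
    """illustrative only: bump the counter fields from a statuslogger event"""
    if event.get("delayed"):
        counter.delayed += event["delayed"]
    if event.get("pending"):
        counter.pending += event["pending"]
    if event.get("blocked"):
        counter.blocked += event["blocked"]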
Example #3
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = OrderedDict()
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            filter_cache_events_all = [
                event for event in raw_events
                if event.get("event_category", "") == "filter_cache"
            ]
            filter_cache_events = [
                event for event in filter_cache_events_all if "date" in event
                and event["date"] > after_time and event["date"] < before_time
            ]
            item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats,
                                             "eviction_items")
            bytes_eviction_stats = _get_stats(filter_cache_events,
                                              BytesFCStats, "eviction_bytes")
            node = util.extract_node_name(log, True)
            node_stats[node] = OrderedDict([
                ("evictions", (bytes_eviction_stats, item_eviction_stats)),
                ("start", start_log_time),
                ("end", last_log_time),
            ])
    return OrderedDict([
        ("nodes", node_stats),
        ("after_time", after_time),
        ("before_time", before_time),
    ])
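
For context, `parse` only relies on a handful of attributes on `args`. A hypothetical invocation (the attribute values below are invented; only the attribute names come from the code above, and the date format matches the one used in the test in Example #4):

from argparse import Namespace

args = Namespace(
    diag_dir="./diag",                    # directory echoed in the report header
    system_log_prefix="system.log",       # prefix handed to diag.find_files
    after="2020-01-01 00:00:00,000",      # lower bound, parsed by dates.date_parse
    before="2021-01-01 00:00:00,000",     # upper bound, parsed by dates.date_parse
)
report = parse(args)
print(list(report["nodes"].keys()))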
Example #4
 def test_calculate_eviction_stats(self):
     lines = [
         "INFO  [RemoteMessageServer query worker - 81] 2020-01-21 11:34:33,033  SolrFilterCache.java:340 - Filter cache org.apache.solr.search.SolrFilterCache$1@7c723229 has reached 8000000 entries of a maximum of 8000000. Evicting oldest entries...",
         "ERROR [RemoteMessageServer query worker - 18] 2020-01-21 11:34:34,475  MessageServer.java:277 - Failed to process request:",
         "INFO  [RemoteMessageServer query worker - 81] 2020-01-21 11:34:35,448  SolrFilterCache.java:356 - ...eviction completed in 1304 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$1@7c723229 usage is now 32441266 bytes across 4000000 entries.",
         "INFO  [LocalMessageServer query worker - 77] 2020-01-21 12:24:23,912  SolrFilterCache.java:340 - Filter cache org.apache.solr.search.SolrFilterCache$1@324b2c16 has reached 3999974 entries of a maximum of 8000000. Evicting oldest entries...",
         "INFO  [LocalMessageServer query worker - 77] 2020-01-21 12:24:23,912  SolrFilterCache.java:356 - ...eviction completed in 1 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$1@324b2c16 usage is now 32005744 bytes across 3999962 entries.",
         "INFO  [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,942  SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917a4 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
         "INFO  [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,950  SolrFilterCache.java:328 - ...eviction completed in 9 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$6@5af917a4 usage is now 114781220 across 159 entries.",
         # new version of logs, after DSP-18693
         "INFO  [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,942  SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917b6 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
         "INFO  [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,950  SolrFilterCache.java:328 - ...eviction completed in 8 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$6@5af917b6 usage is now 114781220 bytes across 159 entries.",
         # eviction event without duration log line
         "INFO  [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,970  SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917c7 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
     ]
     raw_events = parser.read_system_log(lines)
     after_time = dates.date_parse("2020-01-21 00:00:00,000")
     before_time = dates.date_parse("2020-02-21 00:00:00,000")
     item_ev_stats, bytes_ev_stats = calculate_eviction_stats(
         raw_events, after_time, before_time
     )
     assert len(item_ev_stats.values()) == 2
     assert sum([s.duration for s in item_ev_stats.values()]) == 1304 + 1
     assert len(bytes_ev_stats.values()) == 3
     assert sum([s.duration for s in bytes_ev_stats.values()]) == 9 + 8 + 0
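
The assertions above only constrain the shape of the returned stats: each is a mapping whose values expose a `duration` in milliseconds, and an eviction that never logs a completion line contributes a duration of 0. A stand-in that would satisfy those assertions (not the real `ItemFCStats`/`BytesFCStats` classes, and keyed by cache instance purely for illustration):

from collections import OrderedDict, namedtuple

FCStat = namedtuple("FCStat", ["duration"])  # duration in milliseconds

item_ev_stats = OrderedDict([
    ("SolrFilterCache$1@7c723229", FCStat(duration=1304)),
    ("SolrFilterCache$1@324b2c16", FCStat(duration=1)),
])
bytes_ev_stats = OrderedDict([
    ("SolrFilterCache$6@5af917a4", FCStat(duration=9)),
    ("SolrFilterCache$6@5af917b6", FCStat(duration=8)),
    ("SolrFilterCache$6@5af917c7", FCStat(duration=0)),  # start logged, no completion line
])
assert sum(s.duration for s in item_ev_stats.values()) == 1304 + 1
assert sum(s.duration for s in bytes_ev_stats.values()) == 9 + 8 + 0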
Example #5
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = {}
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if event.get('event_type') == 'pause' and event.get(
                        'event_category') == 'garbage_collection':
                    worst_gc = max(event.get('duration'), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get('nodes_list', []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config['worst_gc'] = (worst_gc_per_config, worst_node)
    return warnings
Example #6
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = OrderedDict()
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if (
                    event.get("event_type") == "pause"
                    and event.get("event_category") == "garbage_collection"
                ):
                    worst_gc = max(event.get("duration"), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get("nodes_list", []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config["worst_gc"] = (worst_gc_per_config, worst_node)
    return warnings
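
A hypothetical call showing the shapes `add_gc_to_configs` expects and produces (node names and file paths below are invented; the dict keys match the code above):

configs = [
    {"nodes_list": ["node1", "node2"]},
    {"nodes_list": ["node3"]},
]
system_logs = [
    "diag/node1/system.log",
    "diag/node2/system.log",
    "diag/node3/system.log",
]
warnings = add_gc_to_configs(configs, system_logs)
for config in configs:
    worst_ms, worst_node = config["worst_gc"]  # (longest GC pause in ms, node it came from)
    print(worst_node, worst_ms)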
Example #7
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = {}
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            events = [
                event for event in raw_events
                if event['date'] > after_time and event['date'] < before_time
            ]
            filter_cache_events = [
                event for event in events
                if event['event_category'] == 'filter_cache'
            ]
            item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats,
                                             'eviction_items')
            bytes_eviction_stats = _get_stats(filter_cache_events,
                                              BytesFCStats, 'eviction_bytes')
            node = util.extract_node_name(log, True)
            node_stats[node] = {"evictions" : (bytes_eviction_stats, item_eviction_stats), \
                "start": start_log_time, "end": last_log_time, \
                }
    return {
        "nodes": node_stats,
        "after_time": after_time,
        "before_time": before_time,
    }
Example #8
def parse(args):
    """reads the args used in the command to determine what to parse
    and how to parse it. The returned object should be suitable for a report"""
    files = diag.find_files(args, args.log_prefix)
    queries = []
    for filename in files:
        with diag.FileWithProgress(filename) as log_file:
            events = parser.read_system_log(log_file)
            for event in events:
                if event.get('event_type', '') == 'query_logs' and \
                event.get('event_product', '') == 'solr' and \
                event.get('event_category', '') == 'query_component':
                    queries.append(parse_event(event))
    return Parsed(
        queries=queries,
        top_n_worst=args.top,
        unique_reasons=args.uniquereasons,
        score_threshold=args.scorethreshold,
        )
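
The `Parsed` container is only used through the keyword arguments shown above; a minimal stand-in consistent with that usage (the real type in the codebase may differ) would be:

from collections import namedtuple

Parsed = namedtuple(
    "Parsed",
    ["queries", "top_n_worst", "unique_reasons", "score_threshold"],
)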
Example #9
 def analyze(self):
     """ analyze log files """
     if self.analyzed:
         return
     # pylint: disable=too-many-nested-blocks
     event_filter = UniqEventPerNodeFilter()
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         target_system = find_logs(self.diag_dir,
                                   file_to_find=self.syslog_prefix)
         target_debug = find_logs(self.diag_dir,
                                  file_to_find=self.dbglog_prefix)
         target = target_system + target_debug
     else:
         raise Exception("no diag dir and no files specified")
     for file in target:
         nodename = node_name(file)
         event_filter.set_node(nodename)
         node = self.nodes[nodename]
         if env.DEBUG:
             print("parsing", file)
         log = open(file, 'r')
         statuslogger_fixer = UnknownStatusLoggerWriter()
         for event in parser.read_system_log(log):
             statuslogger_fixer.check(event)
             if self.start and event['date'] < self.start:
                 continue
             if self.end and event['date'] > self.end:
                 continue
             self.__setdates(node, statuslogger_fixer.last_event_date)
             node.lines += 1
             if event_filter.is_duplicate(event):
                 node.skipped_lines += 1
                 continue
             if env.DEBUG:
                 if 'rule_type' in event:
                     self.rule_types[event['rule_type']] += 1
                 elif event['event_type'] == 'unknown':
                     self.rule_types['unknown'] += 1
                 else:
                     self.rule_types['no type'] += 1
             if event['event_type'] == 'server_version':
                 if event.get('version'):
                     node.version = event['version']
                 elif event.get('cassandra_version'):
                     node.cassandra_version = event['cassandra_version']
                 # skipping solr, spark etc. as it may be too much noise for statuslogger
             elif event['event_type'] == 'memtable_status':
                 tname = '.'.join([event['keyspace'], event['table']])
                 if event['ops'] > node.tables[tname].ops:
                     node.tables[tname].ops = event['ops']
                 try:
                     if event['data'] > node.tables[tname].data:
                         node.tables[tname].data = event['data']
                 except Exception as e:
                     print(event)
                     raise e
             elif event['event_type'] == 'pause':
                 node.pauses.append(event['duration'])
             elif event['event_type'] == 'threadpool_header':
                 node.dumps_analyzed += 1
                 self.dumps_analyzed += 1
             elif event['event_type'] == 'threadpool_status':
                 if re.match(r"TPC/\d+$", event['pool_name']):
                     if not node.version:
                         node.version = "6.x"
                     if 'delayed' in event and event['delayed']:
                         print(event)
                         val = event['delayed']
                         node.stages['local backpressure'][
                             event['pool_name']].append(val)
                 else:
                     for pool in [
                             'active', 'pending', 'blocked',
                             'all_time_blocked'
                     ]:
                         if pool in event and event[pool]:
                             if not self.wanted_stages or event[
                                     'pool_name'].startswith(
                                         self.wanted_stages):
                                 node.stages[pool][
                                     event['pool_name']].append(event[pool])
     self.analyzed = True
     if env.DEBUG:
         print(self.rule_types.items())
Example #10
 def analyze(self):
     """analyze log files"""
     if self.analyzed:
         return
     event_filter = UniqEventPerNodeFilter()
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         target_system = find_logs(self.diag_dir,
                                   file_to_find=self.syslog_prefix)
         target_debug = find_logs(self.diag_dir,
                                  file_to_find=self.dbglog_prefix)
         target = target_system + target_debug
     else:
         raise Exception("no diag dir and no files specified")
     for f in target:
         nodename = extract_node_name(f, ignore_missing_nodes=True)
         event_filter.set_node(nodename)
         node = self.nodes[nodename]
         if env.DEBUG:
             print("parsing", f)
         with FileWithProgress(f) as log:
             statuslogger_fixer = UnknownStatusLoggerWriter()
             for event in parser.read_system_log(log):
                 statuslogger_fixer.check(event)
                 if self.start and event["date"] < self.start:
                     continue
                 if self.end and event["date"] > self.end:
                     continue
                 self.__setdates(node, statuslogger_fixer.last_event_date)
                 node.lines += 1
                 if event_filter.is_duplicate(event):
                     node.skipped_lines += 1
                     continue
                 if env.DEBUG:
                     if "rule_type" in event:
                         self.rule_types[event["rule_type"]] += 1
                     elif event["event_type"] == "unknown":
                         self.rule_types["unknown"] += 1
                     else:
                         self.rule_types["no type"] += 1
                 if event["event_type"] == "server_version":
                     if event.get("version"):
                         node.version = event["version"]
                         if node.version.startswith("6"):
                             node.cassandra_version = "DSE Private Fork"
                     elif event.get("cassandra_version"):
                         node.cassandra_version = event["cassandra_version"]
                     # skipping solr, spark etc. as it may be too much noise for statuslogger
                 elif event["event_type"] == "memtable_status":
                     tname = ".".join([event["keyspace"], event["table"]])
                     if event["ops"] > node.tables[tname].ops:
                         node.tables[tname].ops = event["ops"]
                     try:
                         if event["data"] > node.tables[tname].data:
                             node.tables[tname].data = event["data"]
                     except Exception as e:
                         print(event)
                         raise e
                 elif event["event_type"] == "pause":
                     node.pauses.append(event["duration"])
                 elif event["event_type"] == "threadpool_header":
                     node.dumps_analyzed += 1
                     self.dumps_analyzed += 1
                 elif event["event_type"] == "threadpool_status":
                     if re.match(r"TPC/\d+$", event["pool_name"]):
                         if not node.version:
                             node.version = "6.x"
                         if "delayed" in event and event["delayed"]:
                             val = event["delayed"]
                             node.stages["local backpressure"][
                                 event["pool_name"]].append(val)
                     else:
                         for pool in [
                                 "active",
                                 "pending",
                                 "blocked",
                                 "all_time_blocked",
                         ]:
                             if pool in event and event[pool]:
                                 if not self.wanted_stages or event[
                                         "pool_name"].startswith(
                                             self.wanted_stages):
                                     node.stages[pool][
                                         event["pool_name"]].append(
                                             event[pool])
     self.analyzed = True
     if env.DEBUG:
         print(self.rule_types.items())
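
Because `target` concatenates the system and debug logs, the same entry can be read more than once for a node; `UniqEventPerNodeFilter` is what drops those repeats. A hypothetical sketch reconstructed only from how it is called in Examples #9-#11 (`set_node` / `is_duplicate`); the real implementation, and the way it fingerprints events, are assumptions:

class UniqEventPerNodeFilter:
    """sketch only: remember events already seen for the current node"""

    def __init__(self):
        self.node = None
        self.seen = {}

    def set_node(self, node):
        self.node = node
        self.seen.setdefault(node, set())

    def is_duplicate(self, event):
        # fingerprint on the whole event; the real filter may hash the raw log line,
        # but the effect is the same: an entry present in both system.log and
        # debug.log is only counted once per node
        key = tuple(sorted((k, str(v)) for k, v in event.items()))
        if key in self.seen[self.node]:
            return True
        self.seen[self.node].add(key)
        return False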
Example #11
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_target = 0
    pause_target = None
    if "jvm_args" in parsed:
        pause_target = parsed["jvm_args"].get("-XX:MaxGCPauseMillis")
    if pause_target is not None:
        parse_target_array = []
        if len(pause_target.keys()) > 0:
            for value, nodes in pause_target.items():
                try:
                    parsed_target = int(value)
                except (TypeError, ValueError) as e:
                    raise ValueError(
                        "unexpected gc target pause of %s on nodes %s with error %s"
                        % (value, nodes, e))
                if gc_target < parsed_target:
                    gc_target = parsed_target
                parse_target_array.append("(%s: %s)" %
                                          (parsed_target, ",".join(nodes)))
            if len(pause_target.keys()) > 1:
                print(
                    "WARN there are several pause targets so using the highest for recommendations but they may not be accurate: Configuration is as follows %s"
                    % "; ".join(parse_target_array))
    else:
        print(
            "WARN cannot find -XX:MaxGCPauseMillis in the logs setting common default of 500ms"
        )
        gc_target = 500
    tombstone_errors = 0
    tombstone_warns = 0
    gc_over_target = 0
    counter = StatusLoggerCounter()
    solr_index_backoff = OrderedDict()
    solr_index_restore = OrderedDict()
    zero_copy_errors = 0
    drops_remote_only = 0
    rejected = 0
    drop_sums = 0
    drop_types = set()
    event_filter = diag.UniqEventPerNodeFilter()
    bp = BackpressureStats(local_backpressure_active={}, per_core_bp={})
    core_balance = {}
    tpc_event_types = ["6.8", "new"]
    pool_name_pattern = re.compile(r"TPC\/(?P<core>[0-9]+)$")
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                if env.DEBUG:
                    print("parsing", rec_log_file.filepath)
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    # statuslogger_fixer.check(event) HAS to run first before any code below or statuslogger events will get tossed.
                    statuslogger_fixer.check(event)
                    event_type = event.get("event_type")
                    event_category = event.get("event_category")
                    event_product = event.get("event_product")
                    rule_type = event.get("rule_type")
                    if event_type == "unknown":
                        continue
                    if event_type == "pause" and event_category == "garbage_collection":
                        if event.get("duration") > gc_target:
                            gc_over_target += 1
                    elif event_product == "tombstone":
                        if event_type == "scan_error":
                            tombstone_errors += event.get("tombstones")
                        elif (event_type == "tpc_scan_warn"
                              or event_type == "seda_scan_warn"):
                            tombstone_warns += event.get("tombstones")
                    elif (event_type == "threadpool_status"
                          and rule_type in tpc_event_types):
                        pool_name = event.get("pool_name")
                        if env.DEBUG:
                            print("detected pool name is %s" % pool_name)
                        match = pool_name_pattern.match(pool_name)
                        if match:
                            core = int(match.group(1))
                            if env.DEBUG:
                                print("detected core is %i" % core)
                            pending = event.get("pending")
                            if node in core_balance:
                                core_balance[node].append(
                                    PendingCoreMeasurement(core, pending))
                            else:
                                core_balance[node] = [
                                    PendingCoreMeasurement(core, pending)
                                ]
                    elif (event_category == "streaming"
                          and event_product == "zcs"
                          and event_type == "bloom_filter"):
                        zero_copy_errors += 1
                    elif event_type == "core_backpressure":
                        if node in bp.per_core_bp.keys():
                            bp.per_core_bp[node].cores.append(
                                event.get("core_num"))
                            bp.per_core_bp[node].total_bp_events += 1
                        else:
                            bp.per_core_bp[node] = CoreBackpressureStats(
                                cores=[event.get("core_num")],
                                total_bp_events=1)
                    elif event_type == "core_backpressure_local":
                        if node in bp.local_backpressure_active:
                            bp.local_backpressure_active[node] += 1
                        else:
                            bp.local_backpressure_active[node] = 1
                    elif event_category == "indexing":
                        core_name = event.get("core_name")
                        d = event.get("date")
                        if event.get("event_type") == "increase_soft_commit":
                            if core_name in solr_index_backoff:
                                solr_index_backoff[core_name]["count"] += 1
                                solr_index_backoff[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_backoff[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                        elif event_type == "restore_soft_commit":
                            if core_name in solr_index_restore:
                                solr_index_restore[core_name]["count"] += 1
                                solr_index_restore[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_restore[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                    elif event_type == "network_backpressure":
                        rejected += event.get("total_dropped")
                    elif event_type == "drops":
                        local = event.get("localCount")
                        remote = event.get("remoteCount")
                        drop_type = event.get("messageType")
                        drop_types.add(drop_type)
                        drop_sums += local + remote
                        if remote > 0 and local == 0:
                            drops_remote_only += 1
                    _status_logger_counter(event, counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_target, gc_target, counter)
    _recs_on_configs(recommendations, parsed["configs"])
    _recs_on_solr(recommendations, solr_index_backoff, solr_index_restore)
    _recs_on_drops(
        recommendations,
        drops_remote_only,
        sorted(list(drop_types), reverse=True),
        drop_sums,
    )
    _recs_on_zero_copy(recommendations, zero_copy_errors)
    _recs_on_rejects(recommendations, rejected)
    _recs_on_bp(recommendations, bp, gc_over_target)
    _recs_on_core_balance(recommendations, core_balance)
    _recs_on_tombstones(recommendations, tombstone_errors, tombstone_warns)
    return recommendations
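
The small backpressure and core-balance holders in Example #11 are only ever touched through the attributes shown above; hypothetical sketches consistent with those call sites (the real sperf definitions may be namedtuples or carry extra fields):

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class PendingCoreMeasurement:
    core: int     # TPC core number parsed from the pool name
    pending: int  # pending task count reported for that core


@dataclass
class CoreBackpressureStats:
    cores: List[int]          # cores that reported backpressure events
    total_bp_events: int = 0  # number of core_backpressure events seen


@dataclass
class BackpressureStats:
    local_backpressure_active: Dict[str, int] = field(default_factory=dict)
    per_core_bp: Dict[str, CoreBackpressureStats] = field(default_factory=dict)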