def analyze_log(parser):
    """Build the full measurement tree for one Luigi worker run.

    The tree is rooted at a single 'Luigi Worker' node with two children:
    the overall scheduling phase followed by the overall execution phase,
    read sequentially from *parser*.
    """
    worker_root = Measurement('Luigi Worker')
    worker_root.add_child(analyze_overall_scheduling(parser))
    worker_root.add_child(analyze_overall_execution(parser))
    return worker_root
def analyze():
    """Command-line entry point: parse arguments, load or compute a
    measurement tree, and optionally save it and/or render a report.
    """
    # NOTE(review): this function is redefined verbatim at module level just
    # below; that later definition shadows this one, making this copy dead
    # code — confirm and remove the duplicate.
    arg_parser = argparse.ArgumentParser(description='Analyze log files.')
    arg_parser.add_argument(
        '-o', '--output',
        help='Save the results of the analysis to a file that can be read later.')
    arg_parser.add_argument(
        '-r', '--report', choices=['text', 'json', 'html'],
        help='Generate a report in the requested format and stream it to stdout.')
    arg_parser.add_argument(
        '--threshold-percent', type=float, default=None,
        # Filters small entries out of the report only; the saved JSON below
        # passes the same threshold through to json_report.
        help="Don't include entries that accounted for less than this amount of time relative to the total execution"
             " time in the report.")
    # The three input sources are mutually exclusive: a raw log (default),
    # a previously saved result, or a pyinstrument trace.
    group = arg_parser.add_argument_group('Input File')
    group_ex = group.add_mutually_exclusive_group()
    group_ex.add_argument(
        '-l', '--log', default='edx_analytics.log',
        help='A log file to analyze. Defaults to "edx_analytics.log" in the current directory.')
    group_ex.add_argument('-i', '--input', help='Reads a previously saved result file.')
    group_ex.add_argument(
        '-t', '--trace',
        help='Path to an execution trace of the launch-task process captured by pyinstrument and saved as JSON.')
    args = arg_parser.parse_args()
    if args.input:
        root = Measurement.from_json(args.input)
    elif args.trace:
        root = Measurement.from_pyinstrument_trace(args.trace)
    else:
        # NOTE(review): `analyze_log_file` is called here but the visible
        # parsing helper in this file is named `analyze_log` — verify that
        # `analyze_log_file` is defined elsewhere.
        root = analyze_log_file(args.log)
    if args.output:
        # Saved result is compact (pretty=False) and can be re-read via -i.
        json_report(root, args.output, pretty=False, threshold_percent=args.threshold_percent)
    if args.report == 'json':
        json_report(root, threshold_percent=args.threshold_percent)
    elif args.report == 'text':
        text_report(root, threshold_percent=args.threshold_percent)
    elif args.report == 'html':
        html_report(root, threshold_percent=args.threshold_percent)
def analyze():
    """Command-line entry point.

    Parses arguments, obtains a measurement tree from one of three
    mutually-exclusive sources (raw log, saved JSON result, pyinstrument
    trace), optionally saves the tree, and optionally streams a report
    in the requested format to stdout.
    """
    arg_parser = argparse.ArgumentParser(description='Analyze log files.')
    arg_parser.add_argument(
        '-o', '--output',
        help='Save the results of the analysis to a file that can be read later.')
    arg_parser.add_argument(
        '-r', '--report', choices=['text', 'json', 'html'],
        help='Generate a report in the requested format and stream it to stdout.')
    arg_parser.add_argument(
        '--threshold-percent', type=float, default=None,
        help="Don't include entries that accounted for less than this amount of time relative to the total execution"
             " time in the report.")
    # Exactly one input source may be chosen; the raw log is the default.
    input_group = arg_parser.add_argument_group('Input File')
    exclusive = input_group.add_mutually_exclusive_group()
    exclusive.add_argument(
        '-l', '--log', default='edx_analytics.log',
        help='A log file to analyze. Defaults to "edx_analytics.log" in the current directory.')
    exclusive.add_argument('-i', '--input', help='Reads a previously saved result file.')
    exclusive.add_argument(
        '-t', '--trace',
        help='Path to an execution trace of the launch-task process captured by pyinstrument and saved as JSON.')
    args = arg_parser.parse_args()

    if args.input:
        root = Measurement.from_json(args.input)
    elif args.trace:
        root = Measurement.from_pyinstrument_trace(args.trace)
    else:
        root = analyze_log_file(args.log)

    if args.output:
        # Compact save so the result can be reloaded later via --input.
        json_report(root, args.output, pretty=False, threshold_percent=args.threshold_percent)

    # Dispatch the requested stdout report; unknown/absent formats do nothing.
    reporters = {'json': json_report, 'text': text_report, 'html': html_report}
    reporter = reporters.get(args.report)
    if reporter is not None:
        reporter(root, threshold_percent=args.threshold_percent)
def analyze_overall_execution(parser):
    """Measure the task-execution phase of a Luigi worker run.

    Consumes messages from *parser* until a literal 'Done' message, building
    an 'Executing Tasks' measurement with one child per executed task (and
    nested children for any Hadoop jobs seen while a task was running).
    """
    all_execution = Measurement('Executing Tasks')
    # Matches worker state-transition lines; captures the state keyword and
    # the task identifier that follows it.
    pattern = r'.*? Worker Worker.* (?P<state>running|done|failed)\s+(?P<task_id>.*)'
    message = True  # sentinel so the first loop iteration runs
    overall_start_timestamp = None  # timestamp of the first 'running' line
    running_measurement = None      # measurement for the task currently running
    while message:
        # NOTE(review): if next_message() can return None/falsy before 'Done'
        # is seen, the attribute accesses below would raise — presumably the
        # parser always yields a terminal 'Done'; confirm.
        message = parser.next_message()
        if message.content == 'Done':
            if overall_start_timestamp:
                all_execution.set_time_from_range(message.timestamp, overall_start_timestamp)
            return all_execution
        match = re.match(pattern, message.content, (re.MULTILINE | re.DOTALL))
        if not match:
            # Non-transition lines may announce a Hadoop job launch; attach
            # its measurements under the task currently running.
            # NOTE(review): running_measurement is None if a job line appears
            # before any 'running' transition — that would raise; confirm the
            # log format guarantees ordering.
            if 'Running job:' in message.content or 'Starting Job =' in message.content:
                for measurement in analyze_hadoop_job(message, parser):
                    running_measurement.add_child(measurement)
            continue
        task = LuigiTaskDescription.from_string(match.group('task_id'))
        state = match.group('state')
        if state == 'running':
            start_timestamp = message.timestamp
            running_measurement = Measurement('Executing {}'.format(task))
            if not overall_start_timestamp:
                overall_start_timestamp = start_timestamp
        else:
            # 'done' or 'failed': close out the current task's measurement.
            # NOTE(review): start_timestamp is unbound if a done/failed line
            # precedes any 'running' line — confirm logs never do that.
            running_measurement.set_time_from_range(message.timestamp, start_timestamp)
            all_execution.add_child(running_measurement)
def analyze_output_move(parser):
    """Measure time spent moving Hadoop job output out of its temp location.

    Peeks at the next message; if it is not a temp-output line, returns
    None without consuming anything. Otherwise consumes that line and
    measures until the following message's timestamp.
    """
    candidate = parser.peek_message()
    if not re.match(r'.*?Output:.*-temp-', candidate.content):
        return None
    move_started_at = candidate.timestamp
    parser.next_message()
    following = parser.peek_message()
    return Measurement('Moving Job Output', following.timestamp - move_started_at)
def analyze_overall_scheduling(parser):
    """Measure the task-scheduling phase of a Luigi worker run.

    Consumes messages from *parser* until a literal 'Done scheduling tasks'
    message, building a 'Scheduling Tasks' measurement with one child per
    scheduled task.
    """
    all_scheduling = Measurement('Scheduling Tasks')
    # A completeness check marks the start of scheduling for one task.
    start_scheduling_pattern = r'Checking if (?P<task_id>.*?) is complete'
    message = True  # sentinel so the first loop iteration runs
    start_scheduling_timestamp = None  # timestamp of the first check seen
    while message:
        # NOTE(review): if next_message() can return None/falsy before the
        # terminal line is seen, message.content below would raise —
        # presumably the parser always yields 'Done scheduling tasks'; confirm.
        message = parser.next_message()
        if message.content == 'Done scheduling tasks':
            # NOTE(review): if no task check was ever seen,
            # start_scheduling_timestamp is still None here — confirm
            # set_time_from_range tolerates that.
            all_scheduling.set_time_from_range(message.timestamp, start_scheduling_timestamp)
            return all_scheduling
        start_match = re.match(start_scheduling_pattern, message.content, (re.MULTILINE | re.DOTALL))
        if start_match:
            if start_scheduling_timestamp is None:
                start_scheduling_timestamp = message.timestamp
            # Per-task measurement; may be None (filtered tasks).
            measurement = analyze_task_scheduling(message, start_match, parser)
            if measurement:
                all_scheduling.add_child(measurement)
def analyze_hadoop_job(starting_message, parser):
    """Yield measurements for one Hadoop job announced by *starting_message*.

    Generator: consumes messages until the job-completion line, yielding an
    output-move measurement first (streaming jobs only, when present) and
    then the 'Hadoop <job_id>' measurement for the whole job.
    """
    id_match = re.match(r'.*?(?P<job_id>job_\d{12}_\d{4})', starting_message.content)
    job_id = id_match.group('job_id')
    job_started_at = starting_message.timestamp
    current = starting_message
    while current:
        current = parser.next_message()
        body = current.content
        streaming_complete = 'Job complete:' in body
        if streaming_complete or 'Ended Job = ' in body:
            if streaming_complete:
                # Streaming jobs may be followed by an output-move step.
                move = analyze_output_move(parser)
                if move:
                    yield move
            yield Measurement('Hadoop ' + job_id, current.timestamp - job_started_at)
            return
def analyze_task_scheduling(start_message, start_match, parser):
    """Measure how long one task took to schedule.

    Consumes messages until the task's 'Scheduled ...' line or the end of
    the scheduling phase is about to appear (that terminating message is
    peeked at but not consumed). Returns a Measurement, or None for
    UncheckedExternalURL tasks, which are filtered out of the report.
    """
    scheduled_pattern = r'Scheduled (?P<task_id>.*?) \((?P<status>\w+)\)'
    task = LuigiTaskDescription.from_string(start_match.group('task_id'))
    scheduling_began_at = start_message.timestamp
    message = start_message
    while message:
        message = parser.peek_message()
        body = message.content
        if body == 'Done scheduling tasks':
            break
        if re.match(scheduled_pattern, body, (re.MULTILINE | re.DOTALL)):
            break
        parser.next_message()
    if task.name == 'UncheckedExternalURL':
        return None
    return Measurement('Scheduling {}'.format(task), message.timestamp - scheduling_began_at)