def main(home_dir, experiments_dir, subsystem_dir, telemetry_script_dir):
    """
    Home directory: where config info for experiments, etc., lives
    Experiments directory: where experiment implementations live
    Subsystem directory: where subsystem implementations live
    Telemetry script directory: where the telemetry scripts live
    All should be given as absolute paths
    """
    time_str = get_timestamp()

    if not check_file_exists(home_dir, 'config.json'):
        print('Dashboard config (config.json) is missing in {}'.format(home_dir))
        return 1
    dash_config = read_json(home_dir, 'config.json')

    # must expand all tildes in the config to avoid future errors
    for path_field in ['tmp_data_dir', 'setup_dir', 'backup_dir']:
        dash_config[path_field] = os.path.expanduser(dash_config[path_field])

    tmp_data_dir = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + time_str)
    data_archive = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + time_str + '_data.tar.gz')
    setup_dir = dash_config['setup_dir']
    backup_archive = os.path.join(dash_config['backup_dir'],
                                  'dashboard_' + time_str + '.tar.gz')
    idemp_mkdir(tmp_data_dir)
    idemp_mkdir(os.path.dirname(backup_archive))
    idemp_mkdir(setup_dir)

    info = DashboardInfo(home_dir)

    # make a backup of the previous dashboard files if they exist
    if os.path.exists(home_dir):
        subprocess.call(['tar', '-zcf', backup_archive, home_dir])

    # directories whose contents should not change between runs of the dashboard
    persistent_dirs = {info.exp_data, info.exp_configs,
                       info.subsys_configs, info.subsys_output}
    all_dashboard_dirs = info.all_experiment_dirs() + info.all_subsystem_dirs()

    # instantiate necessary dashboard dirs and clean any that should be empty
    for dashboard_dir in all_dashboard_dirs:
        if dashboard_dir not in persistent_dirs:
            subprocess.call(['rm', '-rf', dashboard_dir])
        idemp_mkdir(dashboard_dir)

    randomize_exps = dash_config.get('randomize', True)
    telemetry_rate = dash_config.get('telemetry_rate', 15)
    run_cpu_telemetry = dash_config.get('run_cpu_telemetry', False)
    run_gpu_telemetry = dash_config.get('run_gpu_telemetry', False)

    run_all_experiments(info, experiments_dir, setup_dir,
                        tmp_data_dir, data_archive, time_str,
                        telemetry_script_dir,
                        run_cpu_telemetry=run_cpu_telemetry,
                        run_gpu_telemetry=run_gpu_telemetry,
                        telemetry_interval=telemetry_rate,
                        randomize=randomize_exps)
    run_all_subsystems(info, subsystem_dir, time_str)
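
# How this driver might be wired up as a script. This wrapper is a sketch
# (the argparse flag names and use of sys.exit are assumptions, not part of
# the original dashboard code); it only illustrates that main() expects four
# absolute directory paths.
if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Run the dashboard')
    parser.add_argument('--home-dir', required=True, help='Dashboard home (config) directory')
    parser.add_argument('--experiments-dir', required=True, help='Experiment implementations')
    parser.add_argument('--subsystem-dir', required=True, help='Subsystem implementations')
    parser.add_argument('--telemetry-script-dir', required=True, help='Telemetry scripts')
    args = parser.parse_args()
    sys.exit(main(args.home_dir, args.experiments_dir,
                  args.subsystem_dir, args.telemetry_script_dir))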
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    pass_spec_name_map = {
        '3;FuseOps': 'Op Fusion',
        '3;FoldConstant|FuseOps': '... + Constant Folding',
        '3;EliminateCommonSubexpr|FoldConstant|FuseOps': '... + Common Subexpr Elim',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldConstant|FuseOps': '... + Parallel Conv Comb',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|FoldConstant|FuseOps': '... + Axis Scale Folding',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|FoldConstant|FuseOps': '... + Cast Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|FoldConstant|FuseOps': '... + Op Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|AlterOpLayout|FoldConstant|FuseOps': '... + Op Layout Alteration'
    }

    prereqs, msg = check_prerequisites(info, {
        'pass_comparison': {
            'networks': networks,
            'passes': [parse_combo(combo) for combo in pass_spec_name_map.keys()]
        }
    })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('pass_comparison'))
    raw_data = all_data[-1]

    baseline = '0;'
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # the data entry also carries run metadata, which is not a device key
    del raw_data['timestamp']
    del raw_data['tvm_hash']

    try:
        for (dev, raw_dev_data) in raw_data.items():
            plot_data = OrderedDict([
                (pass_spec_name_map[pass_spec], {
                    network_name_map[network]:
                        raw_dev_data[baseline][network] / raw_dev_data[pass_spec][network]
                    for network in networks})
                for pass_spec in pass_spec_name_map.keys()
            ])
            generate_pass_comparisons(plot_data, output_dir, f'pass-comp-{dev}.png')
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
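
# The pass specs above appear to encode '<opt level>;<pass>|<pass>|...',
# e.g. '3;FoldConstant|FuseOps' is opt level 3 with those two passes, and
# the baseline '0;' is opt level 0 with no passes. A hedged sketch of what
# parse_combo might do (the dashboard's real helper may differ):
def parse_combo(combo):
    opt_level, _, passes = combo.partition(';')
    return (int(opt_level), passes.split('|') if passes else [])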
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']

    conf_fws = ['relay', 'pt', 'tf', 'mxnet', 'nnvm']
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(info, {
        'cnn_comp': {
            'devices': ['gpu'],
            'use_xla': True,
            'networks': networks,
            'frameworks': conf_fws
        }
    })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('cnn_comp'))
    raw_data = all_data[-1]['gpu']

    our_fw = 'Relay'
    other_fws = ['TensorFlow', 'Pytorch', 'MxNet', 'NNVM', 'TF XLA']
    fw_name_map = {fw: fw for fw in other_fws}
    fw_name_map['Pytorch'] = 'PyTorch'

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # each framework's time relative to ours (>1 means ours is faster)
    plot_data = OrderedDict([
        (fw_name_map[fw], {
            network_name_map[network]:
                raw_data[fw][network] / raw_data[our_fw][network]
            for network in networks
        })
        for fw in other_fws
    ])

    try:
        generate_vision_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(info, {
        'relay_opt': {
            'devices': ['gpu'],
            'opt_levels': [0, 1, 2, 3, 4],
            'networks': networks
        }
    })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('relay_opt'))
    raw_data = all_data[-1]['gpu']

    baseline = 'O0'
    opts = ['O1', 'O2', 'O3', 'O4']
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    plot_data = OrderedDict([
        (opt, {
            network_name_map[network]:
                raw_data[baseline][network] / raw_data[opt][network]
            for network in networks
        })
        for opt in opts
    ])

    try:
        generate_opt_comparisons(plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
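
# The plot values above are speedups relative to the O0 baseline: each entry
# divides the baseline's time by the optimized time. A worked example with
# made-up numbers (this raw_data is hypothetical, purely for illustration):
#
#   raw_data = {'O0': {'resnet-18': 10.0}, 'O3': {'resnet-18': 2.5}}
#   raw_data['O0']['resnet-18'] / raw_data['O3']['resnet-18']  # -> 4.0x speedup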
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    idemp_mkdir(output_dir)
    for exp_name in info.all_present_experiments():
        exp_status = info.exp_status_dir(exp_name)
        run_status = validate_json(exp_status, 'run_cpu_telemetry',
                                   'run_gpu_telemetry', filename='run.json')
        if (check_prerequisites(info, {exp_name: {}}) == (True, 'success')
                and run_status.get('success', False)):
            telemetry_folder = info.subsys_telemetry_dir(exp_name)
            if os.path.exists(telemetry_folder):
                exp_graph_folder = os.path.join(telemetry_folder, 'graph')
                cpu_stat = info.exp_cpu_telemetry(exp_name)
                gpu_stat = info.exp_gpu_telemetry(exp_name)
                cpu_data = sort_data(cpu_stat)
                gpu_data = sort_data(gpu_stat)
                graph_folder = info.exp_graph_dir(exp_name)
                website_include_dir = graph_folder
                try:
                    if cpu_data and run_status.get('run_cpu_telemetry', False):
                        visualize(
                            'cpu', process_cpu_telemetry(cpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'cpu_telemetry'),
                            f'Visualizing CPU telemetry for {exp_name}',
                            lambda adapter, title, *rest: f'{adapter}-{title}')
                    if gpu_data and run_status.get('run_gpu_telemetry', False):
                        visualize(
                            'gpu', process_gpu_telemetry(gpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'gpu_telemetry'),
                            f'Visualizing GPU telemetry for {exp_name}',
                            lambda _, title, *rest: title)
                except Exception as e:
                    write_status(
                        output_dir, False,
                        f'Encountered error while generating graphs: {e}')
                    return
                write_status(output_dir, True, 'success')
            else:
                write_status(output_dir, False, 'No telemetry data found')
                return
def main(config_dir, home_dir, out_dir):
    config = read_config(config_dir)
    info = DashboardInfo(home_dir)

    exp_titles = get_exp_titles(info)
    score_titles = get_score_titles(info)

    deadline_config = None
    if info.subsys_config_valid('deadline'):
        deadline_config = info.read_subsys_config('deadline')

    set_up_out_dir(info, out_dir)
    # Switch to the output directory, so we don't need to keep track of
    # separate paths for loading images while the script is running and
    # loading images when viewing the generated webpage.
    os.chdir(out_dir)

    page_prefix = init_page_prefix_template(deadline_config)
    page_body = gen_page_body(exp_titles, score_titles)
    page_suffix = init_page_suffix_template(deadline_config)

    with open(os.path.join(out_dir, 'index.html'), 'w') as f:
        f.write(page_prefix)
        f.write(page_body)
        f.write(page_suffix)

    write_status(out_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    data_dir = os.path.join(output_dir, 'data')
    graph_dir = os.path.join(output_dir, 'graphs')
    idemp_mkdir(data_dir)
    idemp_mkdir(graph_dir)

    timestamp = get_timestamp()

    score_confs = conf['score_confs']
    metrics = set(score_confs.keys())
    metrics = metrics.intersection(set(SCORE_METRICS.keys()))

    if not metrics:
        write_status(output_dir, True, 'No scores to report')
        return 0

    score_data = {}
    score_reports = {}
    for metric in metrics:
        score_metric = SCORE_METRICS[metric](score_confs[metric])
        valid, msg = check_prerequisites(info, score_metric.prereq())
        if not valid:
            write_status(output_dir, False, msg)
            return 1

        score_data_dir = os.path.join(data_dir, metric)
        score_graph_dir = os.path.join(graph_dir, metric)
        idemp_mkdir(score_data_dir)
        idemp_mkdir(score_graph_dir)

        try:
            report = process_score(info, score_metric,
                                   score_data_dir, score_graph_dir, timestamp)
            score_reports[metric] = report
        except Exception as e:
            write_status(
                output_dir, False,
                'Encountered exception while scoring {}:\n{}'.format(
                    metric, render_exception(e)))
            return 1

    report = {'title': 'Metric Scores', 'value': format_scores(score_reports)}
    write_json(output_dir, 'report.json', report)
    write_status(output_dir, True, 'success')
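
# The report.json written above has the same {'title': ..., 'value': ...}
# shape that the subsystem reporter below reads back and forwards to Slack:
#
#   {"title": "Metric Scores", "value": "<whatever format_scores produces>"}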
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']

    prereqs, msg = check_prerequisites(info, {
        'treelstm': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'pt'],
            'relay_methods': ['aot']
        },
        'char_rnn': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'pt'],
            'relay_methods': ['aot'],
            'relay_configs': ['loop']
        },
        'gluon_rnns': {
            'devices': ['cpu'],
            'frameworks': ['relay', 'mxnet'],
            'networks': ['rnn', 'lstm', 'gru'],
            'relay_methods': ['aot']
        }
    })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    raw_data = {}
    for exp in ['treelstm', 'char_rnn', 'gluon_rnns']:
        all_data = sort_data(info.exp_data_dir(exp))
        raw_data[exp] = all_data[-1]

    # each framework's time relative to the Relay AoT result; 0.0 marks
    # workloads the framework was not measured on
    plot_data = OrderedDict([
        ('MxNet', {
            'RNN': raw_data['gluon_rnns']['cpu']['MxNet']['rnn']
                / raw_data['gluon_rnns']['cpu']['Aot']['rnn'],
            'GRU': raw_data['gluon_rnns']['cpu']['MxNet']['gru']
                / raw_data['gluon_rnns']['cpu']['Aot']['gru'],
            'LSTM': raw_data['gluon_rnns']['cpu']['MxNet']['lstm']
                / raw_data['gluon_rnns']['cpu']['Aot']['lstm'],
            'CharRNN': 0.0,
            'TreeLSTM': 0.0,
        }),
        ('PyTorch', {
            'RNN': 0.0,
            'GRU': 0.0,
            'LSTM': 0.0,
            'CharRNN': raw_data['char_rnn']['cpu']['Pytorch']
                / raw_data['char_rnn']['cpu']['Aot'],
            'TreeLSTM': raw_data['treelstm']['cpu']['Pytorch']
                / raw_data['treelstm']['cpu']['Aot'],
        }),
    ])

    try:
        generate_nlp_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    config = read_config(config_dir)
    if 'channel_id' not in config:
        write_status(output_dir, False, 'No channel ID given')
        return 1
    channel = config['channel_id']

    success, msg, client = new_client(config)
    if not success:
        write_status(output_dir, False, msg)
        return 1

    info = DashboardInfo(home_dir)

    failed_subsys = []
    reports = []
    failed_reports = []
    for subsys in info.all_present_subsystems():
        # ignore self
        if subsys == 'subsys_reporter':
            continue
        if not info.subsys_active(subsys):
            continue
        status = info.subsys_stage_status(subsys, 'run')
        if not status['success']:
            failed_subsys.append(failed_subsys_field(subsys, status))
            continue
        report_present = check_file_exists(info.subsys_output_dir(subsys),
                                           'report.json')
        if not report_present:
            continue
        try:
            report = read_json(info.subsys_output_dir(subsys), 'report.json')
            reports.append(
                build_field(title=report['title'], value=report['value']))
        except Exception:
            failed_reports.append(subsys)

    attachments = []
    if reports:
        attachments.append(build_attachment(title='Reports', fields=reports))
    if failed_reports or failed_subsys:
        failure_text = ''
        if failed_reports:
            failure_text = 'Failed to parse reports: {}'.format(
                ', '.join(failed_reports))
        attachments.append(
            build_attachment(title='Errors',
                             text=failure_text,
                             color='#fa0000',
                             fields=failed_subsys))

    if not attachments:
        write_status(output_dir, True, 'Nothing to report')
        return 0

    success, _, msg = post_message(
        client, channel,
        build_message(text='Subsystem Results', attachments=attachments))
    write_status(output_dir, success, msg)
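
# A minimal sketch of the shapes build_field/build_attachment presumably
# produce, based on Slack's legacy message-attachment format. The signatures
# below are guesses inferred from the call sites above, not the dashboard's
# actual implementation.
def build_field(title='', value='', short=False):
    # one entry in an attachment's 'fields' list
    return {'title': title, 'value': value, 'short': short}

def build_attachment(title='', text='', pretext='', color='#000000', fields=None):
    # a single Slack attachment; 'color' renders as the sidebar color
    attachment = {'title': title, 'text': text, 'pretext': pretext, 'color': color}
    if fields is not None:
        attachment['fields'] = fields
    return attachment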
def main(config_dir, home_dir, output_dir):
    config = read_config(config_dir)
    if 'webhook_url' not in config:
        write_status(output_dir, False, 'No webhook URL given')
        return 1
    webhook = config['webhook_url']
    description = ''
    if 'description' in config:
        description = config['description']

    info = DashboardInfo(home_dir)

    inactive_experiments = []    # list of titles
    failed_experiments = []      # list of slack fields
    successful_experiments = []  # list of slack fields
    failed_graphs = []           # list of titles

    for exp_name in info.all_present_experiments():
        stage_statuses = info.exp_stage_statuses(exp_name)
        if not stage_statuses['precheck']['success']:
            failed_experiments.append(
                failed_experiment_field(exp_name, stage_statuses, 'precheck'))
            continue

        exp_conf = info.read_exp_config(exp_name)
        exp_title = exp_name if 'title' not in exp_conf else exp_conf['title']
        notify = exp_conf['notify']
        if not exp_conf['active']:
            inactive_experiments.append(exp_title)
            continue

        failure = False
        for stage in ['setup', 'run', 'analysis', 'summary']:
            if stage not in stage_statuses:
                # setup is the only stage that's optional
                assert stage == 'setup'
                continue
            if not stage_statuses[stage]['success']:
                failed_experiments.append(
                    failed_experiment_field(exp_title, stage_statuses,
                                            stage, notify))
                failure = True
                break
        if failure:
            continue

        # failure to visualize is not as big a deal as failing to
        # run or analyze the experiment, so we only report it but
        # don't fail to report the summary
        if not stage_statuses['visualization']['success']:
            failed_graphs.append(exp_title)

        summary = info.read_exp_summary(exp_name)
        successful_experiments.append(
            build_field(summary['title'], summary['value']))

    # produce messages
    attachments = []
    if successful_experiments:
        attachments.append(
            build_attachment(
                title='Successful benchmarks',
                pretext=description,
                fields=successful_experiments))
    if failed_experiments:
        attachments.append(
            build_attachment(
                color='#fa0000',
                title='Failed benchmarks',
                fields=failed_experiments))
    if inactive_experiments:
        attachments.append(
            build_attachment(
                color='#616161',
                title='Inactive benchmarks',
                text=', '.join(inactive_experiments)))
    if failed_graphs:
        attachments.append(
            build_attachment(
                color='#fa0000',
                title='Failed to Visualize',
                text=', '.join(failed_graphs)))

    success, report = post_message(
        webhook,
        build_message(
            text='Dashboard Results',
            attachments=attachments))
    write_status(output_dir, success, report)
def main(config_dir, home_dir, output_dir):
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    # delete old report so it doesn't hang around if we exit
    # without a new one
    if check_file_exists(output_dir, 'report.json'):
        subprocess.call(['rm', '-f', os.path.join(output_dir, 'report.json')])

    time_window = -1
    if 'time_window' in conf:
        time_window = int(conf['time_window'])
    pings = conf['notify'] if 'notify' in conf else []

    # map: exp -> [(fields w/ high SD, historic mean, SD, current)]
    exp_alerts = {}
    for exp in info.all_present_experiments():
        if not info.exp_active(exp):
            continue

        # not this subsystem's job to report on failures
        stage_statuses = info.exp_stage_statuses(exp)
        if 'run' not in stage_statuses or 'analysis' not in stage_statuses:
            continue
        if not stage_statuses['analysis']['success']:
            continue

        all_data = sort_data(info.exp_data_dir(exp))
        if len(all_data) <= 1:
            continue

        exp_alerts[exp] = []
        most_recent = all_data[-1]
        past_data = all_data[:-1]
        if time_window >= 1:
            past_data = [
                entry for entry in past_data
                if time_difference(most_recent, entry).days <= time_window
            ]

        field_values = traverse_fields(most_recent)
        for fields in itertools.product(*field_values):
            current_stat, _ = gather_stats([most_recent], fields)
            current = current_stat[0]
            past_stats, _ = gather_stats(past_data, fields)
            past_sd = np.std(past_stats)
            past_mean = np.mean(past_stats)
            if abs(current - past_mean) > past_sd:
                exp_alerts[exp].append((fields, past_mean, past_sd, current))

        if not exp_alerts[exp]:
            del exp_alerts[exp]

    if exp_alerts:
        report = {
            'title': 'High SD Alerts',
            'value': format_report(info, exp_alerts, pings)
        }
        write_json(output_dir, 'report.json', report)

    write_status(output_dir, True, 'success')
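
# The alert condition above flags a metric whose newest value deviates from
# the historical mean by more than one standard deviation. A worked example
# with invented numbers:
#
#   past_stats = [10.0, 10.2, 9.8, 10.1]
#   past_mean = np.mean(past_stats)      # 10.025
#   past_sd = np.std(past_stats)         # ~0.148
#   current = 10.5
#   abs(current - past_mean) > past_sd   # 0.475 > 0.148 -> alert fires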
def main(config_dir, home_dir, output_dir):
    config = read_config(config_dir)
    if 'channel_id' not in config:
        write_status(output_dir, False, 'No channel ID given')
        return 1
    slack_channel = config['channel_id']

    success, msg, client = new_client(config)
    if not success:
        write_status(output_dir, False, msg)
        return 1

    description = ''
    if 'description' in config:
        description = config['description']

    info = DashboardInfo(home_dir)

    inactive_experiments = []    # list of titles
    failed_experiments = []      # list of slack fields
    successful_experiments = []  # list of slack fields
    failed_graphs = []           # list of titles

    for exp_name in info.all_present_experiments():
        stage_statuses = info.exp_stage_statuses(exp_name)
        if not stage_statuses['precheck']['success']:
            failed_experiments.append(
                failed_experiment_field(exp_name, stage_statuses, 'precheck'))
            continue

        exp_conf = info.read_exp_config(exp_name)
        exp_status = info.exp_status_dir(exp_name)
        run_status = validate_json(exp_status, 'time_delta', filename='run.json')

        exp_title = exp_name if 'title' not in exp_conf else exp_conf['title']
        notify = exp_conf['notify']
        if not exp_conf['active']:
            inactive_experiments.append(exp_title)
            continue

        failure = False
        for stage in ['setup', 'run', 'analysis', 'summary']:
            if stage not in stage_statuses:
                # setup is the only stage that's optional
                assert stage == 'setup'
                continue
            if not stage_statuses[stage]['success']:
                failed_experiments.append(
                    failed_experiment_field(
                        exp_title, stage_statuses, stage,
                        duration=run_status.get('time_delta'),
                        notify=notify))
                failure = True
                break
        if failure:
            continue

        # failure to visualize is not as big a deal as failing to
        # run or analyze the experiment, so we only report it but
        # don't fail to report the summary
        if not stage_statuses['visualization']['success']:
            failed_graphs.append(exp_title)

        summary = info.read_exp_summary(exp_name)
        successful_experiments.append(
            build_field(
                summary['title'],
                attach_duration(summary['value'],
                                run_status.get('time_delta'))))

    # produce messages
    attachments = []
    if successful_experiments:
        attachments.append(
            build_attachment(title='Successful benchmarks',
                             fields=successful_experiments))
    if failed_experiments:
        attachments.append(
            build_attachment(color='#fa0000',
                             title='Failed benchmarks',
                             fields=failed_experiments))
    if inactive_experiments:
        attachments.append(
            build_attachment(color='#616161',
                             title='Inactive benchmarks',
                             text=', '.join(inactive_experiments)))
    if failed_graphs:
        attachments.append(
            build_attachment(color='#fa0000',
                             title='Failed to Visualize',
                             text=', '.join(failed_graphs)))

    success, _, report = post_message(
        client, slack_channel,
        build_message(text='*Dashboard Results*{}'.format(
            '\n' + description if description != '' else ''),
            attachments=attachments))

    if config.get('report_images', False):
        success, msg = send_graphs(config, info, client, output_dir)
        if not success:
            write_status(output_dir, False, msg)
            return 1

    write_status(output_dir, success, report)