예제 #1
0
def main(home_dir, experiments_dir, subsystem_dir, telemetry_script_dir):
    """
    Set up directories, back up prior state, and run every experiment
    and subsystem for one dashboard pass.

    Home directory: Where config info for experiments, etc., is
    Experiments directory: Where experiment implementations are
    Both should be given as absolute directories
    """
    timestamp = get_timestamp()

    # bail out early when there is no dashboard config to work from
    if not check_file_exists(home_dir, 'config.json'):
        print('Dashboard config (config.json) is missing in {}'.format(home_dir))
        return 1
    dash_config = read_json(home_dir, 'config.json')

    # expand tildes up front so later path joins never see '~'
    for field in ('tmp_data_dir', 'setup_dir', 'backup_dir'):
        dash_config[field] = os.path.expanduser(dash_config[field])

    tmp_data_dir = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + timestamp)
    data_archive = os.path.join(dash_config['tmp_data_dir'],
                                'benchmarks_' + timestamp + '_data.tar.gz')
    setup_dir = dash_config['setup_dir']
    backup_archive = os.path.join(dash_config['backup_dir'],
                                  'dashboard_' + timestamp + '.tar.gz')
    for needed_dir in (tmp_data_dir, os.path.dirname(backup_archive), setup_dir):
        idemp_mkdir(needed_dir)

    info = DashboardInfo(home_dir)

    # snapshot the previous dashboard state before we touch anything
    if os.path.exists(home_dir):
        subprocess.call(['tar', '-zcf', backup_archive, home_dir])

    # directories whose contents should survive between dashboard runs
    persistent_dirs = {info.exp_data,
                       info.exp_configs,
                       info.subsys_configs,
                       info.subsys_output}

    # wipe everything non-persistent, then ensure every dir exists
    for dashboard_dir in info.all_experiment_dirs() + info.all_subsystem_dirs():
        if dashboard_dir not in persistent_dirs:
            subprocess.call(['rm', '-rf', dashboard_dir])
        idemp_mkdir(dashboard_dir)

    # experiment order is randomized unless the config says otherwise
    randomize_exps = dash_config.get('randomize', True)
    telemetry_rate = dash_config.get('telemetry_rate', 15)
    run_cpu_telemetry = dash_config.get('run_cpu_telemetry', False)
    run_gpu_telemetry = dash_config.get('run_gpu_telemetry', False)
    run_all_experiments(info, experiments_dir, setup_dir,
                        tmp_data_dir, data_archive,
                        timestamp, telemetry_script_dir,
                        run_cpu_telemetry=run_cpu_telemetry,
                        run_gpu_telemetry=run_gpu_telemetry,
                        telemetry_interval=telemetry_rate,
                        randomize=randomize_exps)

    run_all_subsystems(info, subsystem_dir, timestamp)
예제 #2
0
def main(config_dir, home_dir, output_dir):
    """Graph per-device speedups of cumulative Relay pass combinations.

    Reads the latest 'pass_comparison' run, normalizes each pass
    combination's time against the no-optimization baseline ('0;'),
    and writes one pass-comp-<device>.png per device. Writes a status
    file to output_dir in every exit path.
    """
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    # spec format: '<opt level>;<pass names separated by |>'
    pass_spec_name_map = {
        '3;FuseOps': 'Op Fusion',
        '3;FoldConstant|FuseOps': '... + Constant Folding',
        '3;EliminateCommonSubexpr|FoldConstant|FuseOps': '... + Common Subexpr Elim',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldConstant|FuseOps': '... + Parallel Conv Comb',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|FoldConstant|FuseOps': '... + Axis Scale Folding',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|FoldConstant|FuseOps': '... + Cast Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|FoldConstant|FuseOps': '... + Op Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|AlterOpLayout|FoldConstant|FuseOps': '... + Op Layout Alteration'
    }

    prereqs, msg = check_prerequisites(info, {
        'pass_comparison': {
            'networks': networks,
            'passes': [
                parse_combo(combo) for combo in pass_spec_name_map.keys()
            ]
        }
    })
    # BUG FIX: the prerequisite result was previously ignored, so missing
    # data crashed the script instead of being reported as a failure
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('pass_comparison'))
    raw_data = all_data[-1]

    # opt level 0 with no passes is the normalization baseline
    baseline = '0;'

    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # strip metadata so only per-device measurements remain
    del raw_data['timestamp']
    del raw_data['tvm_hash']

    try:
        for (dev, raw_dev_data) in raw_data.items():
            # speedup = baseline time / optimized time, per network
            plot_data = OrderedDict([
                (pass_spec_name_map[pass_spec], {
                    network_name_map[network]:
                    raw_dev_data[baseline][network] / raw_dev_data[pass_spec][network]
                    for network in networks})
                for pass_spec in pass_spec_name_map.keys()
            ])
            generate_pass_comparisons(plot_data, output_dir, f'pass-comp-{dev}.png')
    except Exception as e:
        write_status(output_dir, False, 'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
예제 #3
0
def main(config_dir, home_dir, output_dir):
    """Graph GPU speedups of Relay over other CNN frameworks.

    Normalizes each other framework's latest GPU times against Relay's
    and plots the comparison. Writes a status file to output_dir in
    every exit path.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    # display name for "our" framework, overridable in the config
    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']

    conf_fws = ['relay', 'pt', 'tf', 'mxnet', 'nnvm']
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'cnn_comp': {
                'devices': ['gpu'],
                'use_xla': True,
                'networks': networks,
                'frameworks': conf_fws
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    all_data = sort_data(info.exp_data_dir('cnn_comp'))
    raw_data = all_data[-1]['gpu']

    our_fw = 'Relay'
    other_fws = ['TensorFlow', 'Pytorch', 'MxNet', 'NNVM', 'TF XLA']
    # display-name fixups ('Pytorch' -> 'PyTorch'); others pass through
    fw_name_map = {fw: fw for fw in other_fws}
    fw_name_map['Pytorch'] = 'PyTorch'

    # (a redundant redefinition of `networks`, identical to the one
    # above, was removed here)
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # speedup = other framework's time / Relay's time, per network
    plot_data = OrderedDict([(fw_name_map[fw], {
        network_name_map[network]:
        raw_data[fw][network] / raw_data[our_fw][network]
        for network in networks
    }) for fw in other_fws])

    try:
        generate_vision_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
예제 #4
0
def main(config_dir, home_dir, output_dir):
    """Graph GPU speedups of Relay opt levels O1-O4 relative to O0.

    Uses the most recent 'relay_opt' run; writes a status file to
    output_dir in every exit path.
    """
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'relay_opt': {
                'devices': ['gpu'],
                'opt_levels': [0, 1, 2, 3, 4],
                'networks': networks
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    # most recent run, GPU measurements only
    raw_data = sort_data(info.exp_data_dir('relay_opt'))[-1]['gpu']

    # O0 is the normalization baseline for the higher opt levels
    baseline = 'O0'

    display_names = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }

    # speedup = O0 time / On time, per network, in opt-level order
    plot_data = OrderedDict()
    for opt in ('O1', 'O2', 'O3', 'O4'):
        plot_data[opt] = {
            display_names[net]: raw_data[baseline][net] / raw_data[opt][net]
            for net in networks
        }

    try:
        generate_opt_comparisons(plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
예제 #5
0
파일: run.py 프로젝트: uwsampl/relay-bench
def main(config_dir, home_dir, output_dir):
    """Generate CPU/GPU telemetry graphs for each experiment.

    For every present experiment whose prerequisites are satisfied and
    whose run stage succeeded, visualizes the most recent telemetry
    samples into the experiment's graph directory and records a status
    in output_dir.
    """
    info = DashboardInfo(home_dir)
    idemp_mkdir(output_dir)
    for exp_name in info.all_present_experiments():
        exp_status = info.exp_status_dir(exp_name)
        # run.json records whether CPU/GPU telemetry collection was enabled
        run_status = validate_json(exp_status,
                                   'run_cpu_telemetry',
                                   'run_gpu_telemetry',
                                   filename='run.json')
        # only graph experiments that both passed prereqs and ran successfully
        if check_prerequisites(
                info, {exp_name: {}}) == (True, 'success') and run_status.get(
                    'success', False):
            telemetry_folder = info.subsys_telemetry_dir(exp_name)
            if os.path.exists(telemetry_folder):
                exp_graph_folder = os.path.join(telemetry_folder, 'graph')
                cpu_stat = info.exp_cpu_telemetry(exp_name)
                gpu_stat = info.exp_gpu_telemetry(exp_name)
                # sort_data orders runs so the most recent entry is last
                cpu_data = sort_data(cpu_stat)
                gpu_data = sort_data(gpu_stat)
                graph_folder = info.exp_graph_dir(exp_name)
                website_include_dir = os.path.join(graph_folder)
                try:
                    if cpu_data and run_status.get('run_cpu_telemetry', False):
                        # final lambda picks the output filename scheme:
                        # CPU graphs are named '<adapter>-<title>'
                        visualize(
                            'cpu', process_cpu_telemetry(cpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'cpu_telemetry'),
                            f'Visualizing CPU telemetry for {exp_name}',
                            lambda adapter, title, *rest: f'{adapter}-{title}')

                    if gpu_data and run_status.get('run_gpu_telemetry', False):
                        # GPU graphs are named by title alone
                        visualize(
                            'gpu', process_gpu_telemetry(gpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'gpu_telemetry'),
                            f'Visualizing GPU telemetry for {exp_name}',
                            lambda _, title, *rest: title)
                except Exception as e:
                    write_status(
                        output_dir, False,
                        f'Encountered err while generating graphs: {e}')
                    return
                # NOTE(review): status is written inside the loop, so each
                # experiment overwrites the previous one's status and the
                # last processed experiment wins — confirm this is intended
                write_status(output_dir, True, 'success')
            else:
                write_status(output_dir, False, 'No telemetry data found')
                return
예제 #6
0
def main(config_dir, home_dir, out_dir):
    """Render the dashboard landing page (index.html) into out_dir."""
    config = read_config(config_dir)
    info = DashboardInfo(home_dir)
    exp_titles = get_exp_titles(info)
    score_titles = get_score_titles(info)

    # the deadline banner is optional; only load its config when valid
    deadline_config = (info.read_subsys_config('deadline')
                       if info.subsys_config_valid('deadline') else None)

    set_up_out_dir(info, out_dir)
    # Switch to the output directory, so we don't need to keep track of
    # separate paths for loading images while the script is running and loading
    # images when viewing the generated webpage.
    os.chdir(out_dir)

    # assemble the page from prefix, body, and suffix sections in order
    sections = (init_page_prefix_template(deadline_config),
                gen_page_body(exp_titles, score_titles),
                init_page_suffix_template(deadline_config))
    with open(os.path.join(out_dir, 'index.html'), 'w') as f:
        for section in sections:
            f.write(section)
    write_status(out_dir, True, 'success')
예제 #7
0
def main(config_dir, home_dir, output_dir):
    """Score all configured metrics and write a combined report.

    Writes per-metric data and graphs under output_dir, then a
    report.json summarizing the scores. Writes a status file in every
    exit path.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    data_dir = os.path.join(output_dir, 'data')
    graph_dir = os.path.join(output_dir, 'graphs')
    idemp_mkdir(data_dir)
    idemp_mkdir(graph_dir)

    timestamp = get_timestamp()

    score_confs = conf['score_confs']
    # only score metrics that are both configured and implemented
    metrics = set(score_confs.keys()) & set(SCORE_METRICS.keys())
    if not metrics:
        write_status(output_dir, True, 'No scores to report')
        return 0

    score_reports = {}
    for metric in metrics:
        scorer = SCORE_METRICS[metric](score_confs[metric])
        valid, msg = check_prerequisites(info, scorer.prereq())
        if not valid:
            write_status(output_dir, False, msg)
            return 1

        metric_data_dir = os.path.join(data_dir, metric)
        metric_graph_dir = os.path.join(graph_dir, metric)
        idemp_mkdir(metric_data_dir)
        idemp_mkdir(metric_graph_dir)

        try:
            score_reports[metric] = process_score(
                info, scorer, metric_data_dir, metric_graph_dir, timestamp)
        except Exception as e:
            write_status(
                output_dir, False,
                'Encountered exception while scoring {}:\n{}'.format(
                    metric, render_exception(e)))
            return 1

    report = {'title': 'Metric Scores', 'value': format_scores(score_reports)}
    write_json(output_dir, 'report.json', report)
    write_status(output_dir, True, 'success')
예제 #8
0
def main(config_dir, home_dir, output_dir):
    """Graph CPU speedups of Relay AoT over MxNet and PyTorch on NLP models.

    Compares the latest treelstm, char_rnn, and gluon_rnns runs against
    the Relay ahead-of-time baseline. Writes a status file to output_dir
    in every exit path.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    # display name for "our" framework, overridable in the config
    our_name = conf.get('our_name', 'Relay')

    prereqs, msg = check_prerequisites(
        info, {
            'treelstm': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot']
            },
            'char_rnn': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot'],
                'relay_configs': ['loop']
            },
            'gluon_rnns': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'mxnet'],
                'networks': ['rnn', 'lstm', 'gru'],
                'relay_methods': ['aot']
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1

    # most recent run of each experiment
    raw_data = {
        exp: sort_data(info.exp_data_dir(exp))[-1]
        for exp in ('treelstm', 'char_rnn', 'gluon_rnns')
    }

    gluon = raw_data['gluon_rnns']['cpu']
    char_rnn = raw_data['char_rnn']['cpu']
    treelstm = raw_data['treelstm']['cpu']

    # speedup = baseline framework time / our AoT time;
    # 0.0 marks model/framework combinations a baseline doesn't cover
    mxnet_row = {
        'RNN': gluon['MxNet']['rnn'] / gluon['Aot']['rnn'],
        'GRU': gluon['MxNet']['gru'] / gluon['Aot']['gru'],
        'LSTM': gluon['MxNet']['lstm'] / gluon['Aot']['lstm'],
        'CharRNN': 0.0,
        'TreeLSTM': 0.0,
    }
    pytorch_row = {
        'RNN': 0.0,
        'GRU': 0.0,
        'LSTM': 0.0,
        'CharRNN': char_rnn['Pytorch'] / char_rnn['Aot'],
        'TreeLSTM': treelstm['Pytorch'] / treelstm['Aot'],
    }
    plot_data = OrderedDict([('MxNet', mxnet_row), ('PyTorch', pytorch_row)])

    try:
        generate_nlp_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1

    write_status(output_dir, True, 'success')
예제 #9
0
파일: run.py 프로젝트: uwsampl/relay-bench
def main(config_dir, home_dir, output_dir):
    """Collect subsystem reports and post them to a Slack channel.

    Skips this reporter itself and inactive subsystems, gathers each
    remaining subsystem's report.json (if any), and posts 'Reports' and
    'Errors' attachments. Writes the posting status to output_dir.
    """
    config = read_config(config_dir)
    if 'channel_id' not in config:
        write_status(output_dir, False, 'No channel token given')
        return 1

    channel = config['channel_id']

    success, msg, client = new_client(config)

    if not success:
        write_status(output_dir, False, msg)
        return 1

    info = DashboardInfo(home_dir)

    failed_subsys = []    # slack fields for subsystems whose run stage failed
    reports = []          # slack fields built from each report.json
    failed_reports = []   # names of subsystems whose report.json was unreadable

    for subsys in info.all_present_subsystems():
        # ignore self
        if subsys == 'subsys_reporter':
            continue

        if not info.subsys_active(subsys):
            continue

        status = info.subsys_stage_status(subsys, 'run')
        if not status['success']:
            failed_subsys.append(failed_subsys_field(subsys, status))
            continue

        # subsystems are not required to produce a report
        report_present = check_file_exists(info.subsys_output_dir(subsys),
                                           'report.json')
        if not report_present:
            continue

        try:
            report = read_json(info.subsys_output_dir(subsys), 'report.json')
            reports.append(
                build_field(title=report['title'], value=report['value']))
        except Exception:
            # BUG FIX: previously appended the undefined name `subsys_name`,
            # raising NameError instead of recording the unparseable report
            failed_reports.append(subsys)

    attachments = []
    if reports:
        attachments.append(build_attachment(title='Reports', fields=reports))
    if failed_reports or failed_subsys:
        failure_text = ''
        if failed_reports:
            failure_text = 'Failed to parse reports: {}'.format(
                ', '.join(failed_reports))
        attachments.append(
            build_attachment(title='Errors',
                             text=failure_text,
                             color='#fa0000',
                             fields=failed_subsys))

    if not attachments:
        write_status(output_dir, True, 'Nothing to report')
        return 0

    success, _, msg = post_message(
        client, channel,
        build_message(text='Subsystem Results', attachments=attachments))
    write_status(output_dir, success, msg)
예제 #10
0
def main(config_dir, home_dir, output_dir):
    """Post dashboard experiment results to a Slack incoming webhook.

    Categorizes every present experiment as successful, failed, or
    inactive (plus visualization-only failures), builds one Slack
    attachment per category, and posts a single message. Writes the
    posting status to output_dir.
    """
    config = read_config(config_dir)
    if 'webhook_url' not in config:
        write_status(output_dir, False, 'No webhook URL given')
        return 1

    webhook = config['webhook_url']
    # optional free-text shown as the pretext of the successes attachment
    description = ''
    if 'description' in config:
        description = config['description']

    info = DashboardInfo(home_dir)

    inactive_experiments = []     # list of titles
    failed_experiments = []       # list of slack fields
    successful_experiments = []   # list of slack fields
    failed_graphs = []            # list of titles

    for exp_name in info.all_present_experiments():
        stage_statuses = info.exp_stage_statuses(exp_name)
        # precheck failure means the config itself is bad; report and move on
        if not stage_statuses['precheck']['success']:
            failed_experiments.append(
                failed_experiment_field(exp_name, stage_statuses, 'precheck'))
            continue

        exp_conf = info.read_exp_config(exp_name)

        # prefer a human-readable title from the config when present
        exp_title = exp_name if 'title' not in exp_conf else exp_conf['title']
        notify = exp_conf['notify']
        if not exp_conf['active']:
            inactive_experiments.append(exp_title)
            continue

        # report only the first failed stage, in pipeline order
        failure = False
        for stage in ['setup', 'run', 'analysis', 'summary']:
            if stage not in stage_statuses:
                # setup is the only stage that's optional
                assert stage == 'setup'
                continue
            if not stage_statuses[stage]['success']:
                failed_experiments.append(
                    failed_experiment_field(exp_title, stage_statuses,
                                            stage, notify))
                failure = True
                break

        if failure:
            continue

        # failure to visualize is not as big a deal as failing to
        # run or analyze the experiment, so we only report it but
        # don't fail to report the summary
        if not stage_statuses['visualization']['success']:
            failed_graphs.append(exp_title)

        summary = info.read_exp_summary(exp_name)
        successful_experiments.append(
            build_field(summary['title'], summary['value']))

    # produce messages
    attachments = []
    if successful_experiments:
        attachments.append(
            build_attachment(
                title='Successful benchmarks',
                pretext=description,
                fields=successful_experiments))
    if failed_experiments:
        attachments.append(
            build_attachment(
                color='#fa0000',
                title='Failed benchmarks',
                fields=failed_experiments))
    if inactive_experiments:
        attachments.append(
            build_attachment(
                color='#616161',
                title='Inactive benchmarks',
                text=', '.join(inactive_experiments)))
    if failed_graphs:
        attachments.append(
            build_attachment(
                color='#fa0000',
                title='Failed to Visualize',
                text=', '.join(failed_graphs)))

    success, report = post_message(
        webhook,
        build_message(
            text='Dashboard Results',
            attachments=attachments))
    write_status(output_dir, success, report)
예제 #11
0
def main(config_dir, home_dir, output_dir):
    """Report experiment fields whose latest value deviates by more than
    one standard deviation from the historical mean.

    Optionally limits history to a configured time window (days) and
    writes a report.json only when at least one alert fires.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)

    # delete old report so it doesn't hang around if we exit
    # without a new one
    if check_file_exists(output_dir, 'report.json'):
        subprocess.call(['rm', '-f', os.path.join(output_dir, 'report.json')])

    # -1 means "no window": compare against all historical entries
    time_window = int(conf['time_window']) if 'time_window' in conf else -1
    pings = conf.get('notify', [])

    # map: exp -> [(fields w/ high SD, historic mean, SD, current)]
    exp_alerts = {}
    for exp in info.all_present_experiments():
        if not info.exp_active(exp):
            continue

        # not this subsystem's job to report on failures
        statuses = info.exp_stage_statuses(exp)
        if 'run' not in statuses or 'analysis' not in statuses:
            continue
        if not statuses['analysis']['success']:
            continue

        all_data = sort_data(info.exp_data_dir(exp))
        # need at least one historical entry to compare against
        if len(all_data) <= 1:
            continue

        most_recent, past_data = all_data[-1], all_data[:-1]
        if time_window >= 1:
            past_data = [
                entry for entry in past_data
                if time_difference(most_recent, entry).days <= time_window
            ]

        alerts = []
        for fields in itertools.product(*traverse_fields(most_recent)):
            current_stat, _ = gather_stats([most_recent], fields)
            current = current_stat[0]
            past_stats, _ = gather_stats(past_data, fields)

            sd = np.std(past_stats)
            mean = np.mean(past_stats)
            # alert when the new value falls outside one SD of history
            if abs(current - mean) > sd:
                alerts.append((fields, mean, sd, current))

        if alerts:
            exp_alerts[exp] = alerts

    if exp_alerts:
        write_json(output_dir, 'report.json', {
            'title': 'High SD Alerts',
            'value': format_report(info, exp_alerts, pings)
        })

    write_status(output_dir, True, 'success')
예제 #12
0
def main(config_dir, home_dir, output_dir):
    """Post full dashboard results to Slack via the bot client.

    Builds attachments for successful, failed, and inactive experiments
    plus visualization failures (with per-experiment durations when
    available), posts them to the configured channel, optionally uploads
    graphs, and writes the final status to output_dir.
    """
    config = read_config(config_dir)
    if 'channel_id' not in config:
        write_status(output_dir, False, 'No channel token given')
        return 1

    success, msg, client = new_client(config)
    if not success:
        write_status(output_dir, False, msg)
        return 1

    slack_channel = config['channel_id']
    # optional free-text appended below the message title
    description = config.get('description', '')

    # BUG FIX: DashboardInfo was previously constructed twice; one
    # instance suffices
    info = DashboardInfo(home_dir)

    inactive_experiments = []  # list of titles
    failed_experiments = []  # list of slack fields
    successful_experiments = []  # list of slack fields
    failed_graphs = []  # list of titles

    for exp_name in info.all_present_experiments():
        stage_statuses = info.exp_stage_statuses(exp_name)
        # precheck failure means the config itself is bad; report and move on
        if not stage_statuses['precheck']['success']:
            failed_experiments.append(
                failed_experiment_field(exp_name, stage_statuses, 'precheck'))
            continue

        exp_conf = info.read_exp_config(exp_name)
        exp_status = info.exp_status_dir(exp_name)
        # run.json carries the experiment's wall-clock duration, if recorded
        run_status = validate_json(exp_status,
                                   'time_delta',
                                   filename='run.json')

        # prefer a human-readable title from the config when present
        exp_title = exp_name if 'title' not in exp_conf else exp_conf['title']
        notify = exp_conf['notify']
        if not exp_conf['active']:
            inactive_experiments.append(exp_title)
            continue

        # report only the first failed stage, in pipeline order
        failure = False
        for stage in ['setup', 'run', 'analysis', 'summary']:
            if stage not in stage_statuses:
                # setup is the only stage that's optional
                assert stage == 'setup'
                continue
            if not stage_statuses[stage]['success']:
                failed_experiments.append(
                    failed_experiment_field(
                        exp_title,
                        stage_statuses,
                        stage,
                        duration=run_status.get('time_delta'),
                        notify=notify))
                failure = True
                break

        if failure:
            continue

        # failure to visualize is not as big a deal as failing to
        # run or analyze the experiment, so we only report it but
        # don't fail to report the summary
        if not stage_statuses['visualization']['success']:
            failed_graphs.append(exp_title)

        summary = info.read_exp_summary(exp_name)
        successful_experiments.append(
            build_field(
                summary['title'],
                attach_duration(summary['value'],
                                run_status.get('time_delta'))))

    # produce messages
    attachments = []
    if successful_experiments:
        attachments.append(
            build_attachment(title='Successful benchmarks',
                             fields=successful_experiments))
    if failed_experiments:
        attachments.append(
            build_attachment(color='#fa0000',
                             title='Failed benchmarks',
                             fields=failed_experiments))
    if inactive_experiments:
        attachments.append(
            build_attachment(color='#616161',
                             title='Inactive benchmarks',
                             text=', '.join(inactive_experiments)))
    if failed_graphs:
        attachments.append(
            build_attachment(color='#fa0000',
                             title='Failed to Visualize',
                             text=', '.join(failed_graphs)))

    success, _, report = post_message(
        client, slack_channel,
        build_message(text='*Dashboard Results*{}'.format(
            '\n' + description if description != '' else ''),
                      attachments=attachments))
    if config.get('report_images', False):
        success, msg = send_graphs(config, info, client, output_dir)
        if not success:
            write_status(output_dir, False, msg)
            return 1

    write_status(output_dir, success, report)