def main(data_dir, config_dir, output_dir):
    """Summarize the most recent benchmark data and write it to the output dir.

    Validates the config, takes the latest entry from the data directory,
    and builds a text summary of per-(model, target, device) statistics for
    simulated targets and mean times for physical targets. Writes a summary
    and a status file to output_dir. Returns 1 on failure, None on success
    (matching the other main() entry points in this file).
    """
    config, msg = validate(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1
    # Guard the data processing like the sibling mains do, so a malformed
    # data file still produces a failure status instead of a bare crash.
    try:
        all_data = sort_data(data_dir)
        most_recent = all_data[-1]
        # drop bookkeeping fields (e.g. timestamps) so only model entries remain
        most_recent = {k: v for (k, v) in most_recent.items()
                       if k not in METADATA_KEYS}
        summary = ''
        for (model, targets) in most_recent.items():
            # simulated target summary: full per-stat breakdown
            sim_targets = {target: targets[target]
                           for target in targets if target in SIM_TARGETS}
            for (target, devices) in sim_targets.items():
                for (device, stats) in devices.items():
                    summary += '_Stats on ({}, {}, {}) & _\n'.format(
                        model, target.upper(), device.upper())
                    for (stat, val) in stats.items():
                        # Decimal keeps the scientific-notation rendering exact
                        summary += '{}: {:.2E}\n'.format(stat, Decimal(val))
            # physical target summary: just the mean time per device
            phys_targets = {target: v for (target, v) in targets.items()
                            if target in PHYS_TARGETS}
            for (target, devices) in phys_targets.items():
                for (device, mean_time) in devices.items():
                    summary += 'Time on ({}, {}, {}): {:.2f}\n'.format(
                        model, target.upper(), device.upper(), mean_time)
        write_summary(output_dir, config['title'], summary)
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
def write_generic_summary(data_dir, output_dir, title, devices,
                          networks=None, use_networks=False):
    """
    Given a data directory and output directory, this function writes
    a generic summary assuming that the data has a field keyed by device
    (cpu/gpu) and optionally by network. It writes a summary and status
    to the output dir.
    """
    try:
        history = sort_data(data_dir)
        latest = history[-1]
        summary = (summary_by_dev_and_network(latest, devices, networks)
                   if use_networks
                   else summary_by_dev(latest, devices))
        write_summary(output_dir, title, summary)
        write_status(output_dir, True, 'success')
        # TODO do something about comparisons to previous days
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
def main(config_dir, home_dir, output_dir):
    """Generate CPU/GPU telemetry graphs for each experiment that ran.

    For every present experiment with satisfied prerequisites and a
    successful telemetry run, visualizes the most recent CPU and GPU
    telemetry data into the experiment's graph folder. Writes a status
    to output_dir; NOTE(review): status is written (and possibly
    overwritten) per experiment inside the loop, and the function
    returns early on the first graphing error or missing telemetry dir.
    """
    info = DashboardInfo(home_dir)
    idemp_mkdir(output_dir)
    for exp_name in info.all_present_experiments():
        exp_status = info.exp_status_dir(exp_name)
        # run.json records whether the telemetry runs succeeded
        run_status = validate_json(exp_status, 'run_cpu_telemetry',
                                   'run_gpu_telemetry', filename='run.json')
        if check_prerequisites(
                info, {exp_name: {}}) == (True, 'success') and run_status.get(
                    'success', False):
            telemetry_folder = info.subsys_telemetry_dir(exp_name)
            if os.path.exists(telemetry_folder):
                exp_graph_folder = os.path.join(telemetry_folder, 'graph')
                cpu_stat = info.exp_cpu_telemetry(exp_name)
                gpu_stat = info.exp_gpu_telemetry(exp_name)
                cpu_data = sort_data(cpu_stat)
                gpu_data = sort_data(gpu_stat)
                graph_folder = info.exp_graph_dir(exp_name)
                # single-arg join: effectively just graph_folder itself
                website_include_dir = os.path.join(graph_folder)
                try:
                    # only graph the most recent entry ([-1]) for each device
                    if cpu_data and run_status.get('run_cpu_telemetry', False):
                        visualize(
                            'cpu', process_cpu_telemetry(cpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'cpu_telemetry'),
                            f'Visualizing CPU telemetry for {exp_name}',
                            # CPU graph titles are prefixed with the adapter name
                            lambda adapter, title, *rest: f'{adapter}-{title}')
                    if gpu_data and run_status.get('run_gpu_telemetry', False):
                        visualize(
                            'gpu', process_gpu_telemetry(gpu_data[-1]),
                            exp_graph_folder,
                            os.path.join(website_include_dir, 'gpu_telemetry'),
                            f'Visualizing GPU telemetry for {exp_name}',
                            # GPU graph titles use the title alone
                            lambda _, title, *rest: title)
                except Exception as e:
                    # abort on the first graphing failure
                    write_status(
                        output_dir, False,
                        f'Encountered err while generating graphs: {e}')
                    return
                write_status(output_dir, True, 'success')
            else:
                write_status(output_dir, False, 'No telemetry data found')
                return
def main(config_dir, home_dir, output_dir):
    """Graph per-network speedup of successive Relay pass combinations.

    Reads the latest 'pass_comparison' experiment data, normalizes each
    pass combination's time against the no-pass baseline ('0;'), and
    renders one comparison plot per device. Writes a status to output_dir
    and returns 1 on failure.
    """
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    # maps raw pass-spec keys in the data to human-readable plot labels;
    # each entry adds one pass on top of the previous combination
    pass_spec_name_map = {
        '3;FuseOps': 'Op Fusion',
        '3;FoldConstant|FuseOps': '... + Constant Folding',
        '3;EliminateCommonSubexpr|FoldConstant|FuseOps': '... + Common Subexpr Elim',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldConstant|FuseOps': '... + Parallel Conv Comb',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|FoldConstant|FuseOps': '... + Axis Scale Folding',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|FoldConstant|FuseOps': '... + Cast Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|FoldConstant|FuseOps': '... + Op Canonicalization',
        '3;EliminateCommonSubexpr|CombineParallelConv2D|FoldScaleAxis|CanonicalizeCast|CanonicalizeOps|AlterOpLayout|FoldConstant|FuseOps': '... + Op Layout Alteration'
    }
    prereqs, msg = check_prerequisites(info, {
        'pass_comparison': {
            'networks': networks,
            'passes': [
                parse_combo(combo) for combo in pass_spec_name_map.keys()
            ]
        }
    })
    # BUG FIX: the prerequisite result was previously computed but never
    # checked; bail out with a failure status like the sibling mains do.
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1
    all_data = sort_data(info.exp_data_dir('pass_comparison'))
    raw_data = all_data[-1]
    baseline = '0;'  # key for the no-optimization baseline timings
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }
    # strip bookkeeping fields so only per-device data remains
    del raw_data['timestamp']
    del raw_data['tvm_hash']
    try:
        for (dev, raw_dev_data) in raw_data.items():
            # speedup = baseline time / optimized time, per network
            plot_data = OrderedDict([
                (pass_spec_name_map[pass_spec], {
                    network_name_map[network]:
                    raw_dev_data[baseline][network] /
                    raw_dev_data[pass_spec][network]
                    for network in networks})
                for pass_spec in pass_spec_name_map.keys()
            ])
            generate_pass_comparisons(plot_data, output_dir,
                                      f'pass-comp-{dev}.png')
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    """Graph CNN inference speedups of our framework over other frameworks.

    Reads the latest 'cnn_comp' GPU data and plots, per network, each other
    framework's time divided by ours (values > 1 mean we are faster).
    Writes a status to output_dir and returns 1 on failure.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = conf.get('our_name', 'Relay')  # display name for our framework
    conf_fws = ['relay', 'pt', 'tf', 'mxnet', 'nnvm']
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'cnn_comp': {
                'devices': ['gpu'],
                'use_xla': True,
                'networks': networks,
                'frameworks': conf_fws
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1
    all_data = sort_data(info.exp_data_dir('cnn_comp'))
    raw_data = all_data[-1]['gpu']
    our_fw = 'Relay'
    other_fws = ['TensorFlow', 'Pytorch', 'MxNet', 'NNVM', 'TF XLA']
    # map data keys to display names; only 'Pytorch' needs respelling
    fw_name_map = {fw: fw for fw in other_fws}
    fw_name_map['Pytorch'] = 'PyTorch'
    # (removed a redundant duplicate assignment of `networks` here)
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }
    try:
        # FIX: build plot_data inside the guard — a missing framework or
        # network key would previously crash without writing a status.
        plot_data = OrderedDict([(fw_name_map[fw], {
            network_name_map[network]:
            raw_data[fw][network] / raw_data[our_fw][network]
            for network in networks
        }) for fw in other_fws])
        generate_vision_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def prepare_exp_data_pages(info, out_dir):
    """Dump each analyzed experiment's full data history as a JSON page.

    Skips experiments whose analysis stage is absent or failed; otherwise
    writes out_dir/<exp>.json containing the data newest-first.
    """
    idemp_mkdir(out_dir)
    for exp in info.all_present_experiments():
        statuses = info.exp_stage_statuses(exp)
        analysis = statuses.get('analysis')
        if analysis is None or not analysis['success']:
            continue
        history = sort_data(info.exp_data_dir(exp))
        page_path = os.path.join(out_dir, '{}.json'.format(exp))
        # customize the formatting here so that it's at
        # least somewhat human-readable
        with open(page_path, 'w') as out_file:
            json.dump(list(reversed(history)), out_file, indent=1)
def process_score(info, score_metric, data_dir, graph_dir, timestamp):
    """Compute a score metric, persist it, graph it, and return its text.

    Writes the timestamped score data to data_dir, then attempts to render
    the score graph plus longitudinal graphs. Graphing failures are logged
    and ignored; the score text is returned regardless.
    """
    data = score_metric.compute_score(info)
    data['timestamp'] = timestamp
    write_json(data_dir, 'data_{}.json'.format(timestamp), data)
    # graphs failing is not a fatal error, just an inconvenience
    try:
        score_metric.score_graph(data, graph_dir)
        all_data = sort_data(data_dir)
        score_metric.longitudinal_graphs(all_data, graph_dir)
    except Exception as e:
        print(render_exception(e))
    # FIX: previously this return lived in a `finally:` block, which silently
    # swallowed BaseExceptions (KeyboardInterrupt, SystemExit) and any error
    # raised by the except handler itself. A plain return after the
    # try/except keeps the "always return the score text" behavior without
    # masking those.
    return score_metric.score_text(data)
def main(data_dir, config_dir, output_dir):
    """Render a graph from the most recent trials data.

    Validates the trials config, graphs the latest data entry, and writes
    the resulting status to output_dir. Always closes the active matplotlib
    figure on the way out. Returns 1 on failure.
    """
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1
        latest = sort_data(data_dir)[-1]
        success, msg = render_graph(config, latest, output_dir)
        write_status(output_dir, success, msg)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered: ' + render_exception(e))
        return 1
    finally:
        # release the figure even on early return / exception
        plt.close()
def main(data_dir, config_dir, output_dir):
    """Write a text summary of the most recent Pareto-curve trial data.

    Validates the trials config, summarizes the latest data entry, and
    writes summary plus status to output_dir. Returns 1 on failure.
    """
    try:
        config, msg = validate_trials_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1
        latest = sort_data(data_dir)[-1]
        write_summary(output_dir, 'Pareto Curve Trial',
                      summarize(config, latest))
        write_status(output_dir, True, 'success')
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered: ' + render_exception(e))
        return 1
def main(data_dir, config_dir, output_dir):
    """Produce ARM/VTA comparison graphs plus longitudinal graphs.

    Validates the config, then graphs both the full data history and the
    most recent entry. Writes a status to output_dir; returns 1 on failure.
    """
    config, msg = validate(config_dir)
    if config is None:
        write_status(output_dir, False, msg)
        return 1
    # read in data, output graphs of most recent data, and output longitudinal graphs
    history = sort_data(data_dir)
    latest = history[-1]
    try:
        generate_longitudinal_comparisons(history, output_dir)
        generate_arm_vta_comparisons(latest, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    """Graph per-network speedups of Relay optimization levels over O0.

    Reads the latest 'relay_opt' GPU data, normalizes O1-O4 times against
    the O0 baseline, and renders the comparison plot. Writes a status to
    output_dir; returns 1 on failure.
    """
    info = DashboardInfo(home_dir)
    networks = ['resnet-18', 'mobilenet', 'nature-dqn', 'vgg-16']
    prereqs, msg = check_prerequisites(
        info, {
            'relay_opt': {
                'devices': ['gpu'],
                'opt_levels': [0, 1, 2, 3, 4],
                'networks': networks
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1
    raw_data = sort_data(info.exp_data_dir('relay_opt'))[-1]['gpu']
    baseline = 'O0'
    network_name_map = {
        'resnet-18': 'ResNet-18',
        'mobilenet': 'MobileNet V2',
        'nature-dqn': 'DQN',
        'vgg-16': 'VGG-16'
    }
    # speedup = baseline time / optimized time, per opt level and network
    plot_data = OrderedDict()
    for opt in ['O1', 'O2', 'O3', 'O4']:
        plot_data[opt] = {
            network_name_map[net]: raw_data[baseline][net] / raw_data[opt][net]
            for net in networks
        }
    try:
        generate_opt_comparisons(plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(data_dir, config_dir, output_dir):
    """Generate longitudinal (all-time and two-week) and individual graphs.

    Validates the config, partitions the data history into the full record
    and the trailing fortnight, and renders the three graph sets. Writes a
    status to output_dir; returns 1 on failure.
    """
    try:
        config, msg = validate_config(config_dir)
        if config is None:
            write_status(output_dir, False, msg)
            return 1
        history = sort_data(data_dir)
        latest = history[-1]
        # keep only entries within a fortnight of the newest entry
        fortnight = [entry for entry in history
                     if time_difference(latest, entry).days < 14]
        generate_longitudinal_comparisons(history, output_dir, 'all_time')
        generate_longitudinal_comparisons(fortnight, output_dir, 'two_weeks')
        generate_individual_comparisons(config, latest, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def main(config_dir, home_dir, output_dir):
    """Graph NLP workload speedups of our AoT-compiled Relay over MxNet/PyTorch.

    Pulls the latest CPU data for the treelstm, char_rnn, and gluon_rnns
    experiments, computes framework-time / our-time ratios per workload
    (0.0 marks combinations a framework was not measured on), and renders
    the comparison plot. Writes a status to output_dir; returns 1 on failure.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    our_name = 'Relay'
    if 'our_name' in conf:
        our_name = conf['our_name']
    prereqs, msg = check_prerequisites(
        info, {
            'treelstm': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot']
            },
            'char_rnn': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'pt'],
                'relay_methods': ['aot'],
                'relay_configs': ['loop']
            },
            'gluon_rnns': {
                'devices': ['cpu'],
                'frameworks': ['relay', 'mxnet'],
                'networks': ['rnn', 'lstm', 'gru'],
                'relay_methods': ['aot']
            }
        })
    if not prereqs:
        write_status(output_dir, False, msg)
        return 1
    raw_data = {
        exp: sort_data(info.exp_data_dir(exp))[-1]
        for exp in ['treelstm', 'char_rnn', 'gluon_rnns']
    }
    # local aliases for the CPU sub-dicts keep the ratio table readable
    gluon_cpu = raw_data['gluon_rnns']['cpu']
    char_cpu = raw_data['char_rnn']['cpu']
    tree_cpu = raw_data['treelstm']['cpu']
    plot_data = OrderedDict([
        ('MxNet', {
            'RNN': gluon_cpu['MxNet']['rnn'] / gluon_cpu['Aot']['rnn'],
            'GRU': gluon_cpu['MxNet']['gru'] / gluon_cpu['Aot']['gru'],
            'LSTM': gluon_cpu['MxNet']['lstm'] / gluon_cpu['Aot']['lstm'],
            'CharRNN': 0.0,
            'TreeLSTM': 0.0,
        }),
        ('PyTorch', {
            'RNN': 0.0,
            'GRU': 0.0,
            'LSTM': 0.0,
            'CharRNN': char_cpu['Pytorch'] / char_cpu['Aot'],
            'TreeLSTM': tree_cpu['Pytorch'] / tree_cpu['Aot'],
        }),
    ])
    try:
        generate_nlp_comparisons(our_name, plot_data, output_dir)
    except Exception as e:
        write_status(output_dir, False,
                     'Exception encountered:\n' + render_exception(e))
        return 1
    write_status(output_dir, True, 'success')
def latest_data(info, exp, dev):
    """Return the most recent data entry for experiment `exp` on device `dev`."""
    history = sort_data(info.exp_data_dir(exp))
    newest = history[-1]
    return newest[dev]
def main(config_dir, home_dir, output_dir):
    """Report experiment stats whose latest value deviates beyond one SD.

    For each active, successfully analyzed experiment with at least two
    data entries, compares the most recent value of every stat field
    against the mean/SD of past entries (optionally windowed to the last
    `time_window` days from the config) and collects alerts for values
    more than one SD from the historic mean. Writes report.json (only if
    there are alerts) and a status to output_dir.
    """
    info = DashboardInfo(home_dir)
    conf = read_config(config_dir)
    # delete old report so it doesn't hang around if we exit
    # without a new one
    if check_file_exists(output_dir, 'report.json'):
        # FIX: use os.remove instead of shelling out to `rm -f` — portable
        # and avoids spawning a subprocess for a single file deletion. The
        # FileNotFoundError guard mirrors rm's -f (ignore a racing delete).
        try:
            os.remove(os.path.join(output_dir, 'report.json'))
        except FileNotFoundError:
            pass
    time_window = -1  # -1 means "no window": use the entire history
    if 'time_window' in conf:
        time_window = int(conf['time_window'])
    pings = conf['notify'] if 'notify' in conf else []
    # map: exp -> [(fields w/ high SD, historic mean, SD, current)]
    exp_alerts = {}
    for exp in info.all_present_experiments():
        if not info.exp_active(exp):
            continue
        # not this subsystem's job to report on failures
        stage_statuses = info.exp_stage_statuses(exp)
        if 'run' not in stage_statuses or 'analysis' not in stage_statuses:
            continue
        if not stage_statuses['analysis']['success']:
            continue
        all_data = sort_data(info.exp_data_dir(exp))
        if len(all_data) <= 1:
            # need at least one past entry to compare against
            continue
        exp_alerts[exp] = []
        most_recent = all_data[-1]
        past_data = all_data[:-1]
        if time_window >= 1:
            past_data = [
                entry for entry in past_data
                if time_difference(most_recent, entry).days <= time_window
            ]
        field_values = traverse_fields(most_recent)
        for fields in itertools.product(*field_values):
            current_stat, _ = gather_stats([most_recent], fields)
            current = current_stat[0]
            past_stats, _ = gather_stats(past_data, fields)
            past_sd = np.std(past_stats)
            past_mean = np.mean(past_stats)
            # alert when the newest value strays more than one SD from
            # the historic mean
            if abs(current - past_mean) > past_sd:
                exp_alerts[exp].append((fields, past_mean, past_sd, current))
        if not exp_alerts[exp]:
            del exp_alerts[exp]
    if exp_alerts:
        report = {
            'title': 'High SD Alerts',
            'value': format_report(info, exp_alerts, pings)
        }
        write_json(output_dir, 'report.json', report)
    write_status(output_dir, True, 'success')