# Evaluation scripts for the OpenCL CPU/GPU device-mapping task, concatenated
# in this excerpt: a DeepTune (LSTM) model, a decision-tree baseline on
# Grewe et al.-style features, and static-mapping baselines.
import os
import pickle
import random
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# NOTE: project-specific names used below (LANGUAGES, DATASET_DIR, ATTRIBUTES_DIR,
# SEED, mapping_metrics, json_io, Dictionary, LanguagePairDataset, load_mmap_dataset,
# TruncateDataset, DeepTuneLoss, DeepTuneEncoder, DecisionTree, StaticMapping,
# data_utils, collate, move_to_cuda) are assumed to come from the surrounding
# ncc project; their imports are not shown in this excerpt.


# DeepTune (LSTM) evaluation with 10-fold cross-validation.
def cli_main():
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # get first visible device as default
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset, truncation_length=MAX_SOURCE_POSITIONS, truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')
        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # build toy dataset for 10-fold cross validation
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # deeptune model
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64, rnn_dropout=0.,
                                    rnn_num_layers=2, aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate)
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # test accuracy
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate)
            predictions, ground_truth = [], []
            for sample in test_dataloader:
                if use_cuda:
                    sample = move_to_cuda(sample)
                hybrid_out, _ = model(**sample['net_input'])
                predictions.append(hybrid_out.max(dim=-1)[1])
                ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)

            accuracy = (predictions == ground_truth).tolist()
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                    zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer

    performance = pd.DataFrame(data, index=range(1, len(data) + 1), columns=[
        "Model", "Platform", "Benchmark", "Benchmark Suite",
        "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup",
    ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
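
# --- Hedged mini-example (not from the original sources) ---
# The speedup metric above is the baseline mapping's runtime (CPU on AMD,
# GPU on NVIDIA) divided by the runtime of the device the model predicted.
# `_speedup_example` is a self-contained illustration; the function name and
# numbers are invented for demonstration only.
def _speedup_example():
    runtime_cpus = np.array([2.0, 4.0])  # per-kernel CPU runtimes
    runtime_gpus = np.array([1.0, 8.0])  # per-kernel GPU runtimes
    predictions = [1, 0]                 # 1 = map to GPU, 0 = map to CPU
    baseline = runtime_cpus              # CPU is the baseline on AMD
    predicted = np.array([(runtime_cpus if p == 0 else runtime_gpus)[i]
                          for i, p in enumerate(predictions)])
    return baseline / predicted          # -> array([2., 1.])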
# Decision-tree baseline on Grewe et al.-style handcrafted features,
# with 10-fold cross-validation.
def cli_main():
    data = []
    for i, platform in enumerate(LANGUAGES):

        def get_attr(attr):
            oracle_file = os.path.join(ATTRIBUTES_DIR, platform, f'train.{attr}')
            with open(oracle_file, 'r') as reader:
                out = [json_io.json_loads(line) for line in reader]
            return np.asarray(out)

        def get_src_tokens():
            transfer = get_attr("transfer")
            comp = get_attr("comp")
            mem = get_attr("mem")
            coalesced = get_attr("coalesced")
            localmem = get_attr("localmem")
            wgsize = get_attr("wgsize")
            return np.array([
                transfer / (comp + mem),  # F1: communication-computation ratio
                coalesced / mem,          # F2: coalesced memory access ratio
                localmem / mem * wgsize,  # F3: local memory usage x work-group size
                comp / mem,               # F4: computation-memory ratio
            ]).T

        platform_name = mapping_metrics.platform2str(platform)
        devices = get_attr('oracle')
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        # optimal mappings
        src_tokens = get_src_tokens()
        ground_truth = np.array([1 if x == "GPU" else 0 for x in devices])

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_tokens, ground_truth)):
            # decision tree model
            model = DecisionTree.build_model()
            # train
            model.fit(src_tokens=src_tokens[train_ids], ground_truth=ground_truth[train_ids])
            # accuracy
            predictions = model(src_tokens=src_tokens[test_ids])
            gt = ground_truth[test_ids]
            correct = (predictions == gt)
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [(runtime_cpus if pred == 0 else runtime_gpus)[idx]
                             for idx, pred in zip(test_ids, predictions)]
            speedup = gt_runtimes / pred_runtimes
            # record results
            for benchmark_, o_, p_, correct_, p_speedup_ in \
                    zip(benchmarks[test_ids], ground_truth[test_ids], predictions, correct, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": correct_,
                    "Speedup": p_speedup_,
                })
            del model

    performance = pd.DataFrame(data, index=range(1, len(data) + 1), columns=[
        "Model", "Platform", "Benchmark", "Benchmark Suite",
        "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup",
    ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
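
# --- Hedged sketch (not from the original sources) ---
# DecisionTree.build_model() is project-specific and its definition is not
# shown in this excerpt. A minimal stand-in with the same call surface, built
# on scikit-learn's DecisionTreeClassifier, might look like the class below;
# the name, hyper-parameters, and keyword arguments are assumptions, not the
# project's actual implementation.
from sklearn.tree import DecisionTreeClassifier

class SketchDecisionTree:
    def __init__(self, clf):
        self.clf = clf

    @classmethod
    def build_model(cls):
        # depth/leaf limits are illustrative choices for this baseline
        return cls(DecisionTreeClassifier(random_state=204, splitter='best',
                                          max_depth=5, min_samples_leaf=5))

    def fit(self, src_tokens, ground_truth):
        self.clf.fit(src_tokens, ground_truth)

    def __call__(self, src_tokens):
        return self.clf.predict(src_tokens)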
# Static-mapping baseline evaluated on the full dataset (no cross-validation).
def cli_main():
    data = []
    for i, platform in enumerate(['amd', 'nvidia']):

        def get_attr(attr):
            oracle_file = os.path.join(ATTRIBUTES_DIR, f'{platform}.{attr}')
            with open(oracle_file, 'r') as reader:
                out = [json_io.json_loads(line) for line in reader]
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        devices = get_attr('oracle')
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        # static mapping model
        model = StaticMapping.build_model(devices)
        # optimal mappings
        src_tokens = torch.from_numpy(np.zeros(len(devices)))
        ground_truth = torch.from_numpy(np.array([1 if x == 1 else 0 for x in devices]))

        predictions = model(src_tokens)
        accuracy = (predictions == ground_truth).tolist()
        # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
        gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)
        pred_runtimes = [(runtime_cpus if pred == 0 else runtime_gpus)[idx]
                         for idx, pred in enumerate(predictions)]
        speedup = gt_runtimes / pred_runtimes

        # record results
        for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                zip(benchmarks, ground_truth, predictions, accuracy, speedup):
            data.append({
                "Model": model.__class__.__name__,
                "Platform": platform_name,
                'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                "Oracle Mapping": o_,
                "Predicted Mapping": p_,
                "Accuracy": accuracy_,
                "Speedup": p_speedup_,
            })

    performance = pd.DataFrame(data, index=range(1, len(data) + 1), columns=[
        "Model", "Platform", "Benchmark", "Benchmark Suite",
        "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup",
    ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    print(out)
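
# --- Hedged sketch (not from the original sources) ---
# StaticMapping.build_model(devices) is likewise project-specific. The usual
# static-mapping baseline predicts one fixed device (the majority class in
# the oracle labels) for every kernel; a minimal stand-in under that
# assumption could be the class below. The name and the handling of both the
# numeric and "GPU"-string oracle encodings seen in these scripts are
# assumptions.
class SketchStaticMapping:
    def __init__(self, label):
        self.label = label  # 0 = CPU, 1 = GPU

    @classmethod
    def build_model(cls, devices):
        labels, counts = np.unique(np.asarray(devices), return_counts=True)
        majority = labels[np.argmax(counts)]
        return cls(1 if majority in (1, "GPU") else 0)

    def __call__(self, src_tokens):
        # same prediction for every input, regardless of features
        return torch.full((len(src_tokens),), self.label, dtype=torch.long)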
# Static-mapping baseline evaluated with 10-fold cross-validation.
def cli_main():
    data = []
    for i, platform in enumerate(LANGUAGES):

        def get_attr(attr):
            oracle_file = os.path.join(ATTRIBUTES_DIR, platform, f'train.{attr}')
            with open(oracle_file, 'r') as reader:
                out = [json_io.json_loads(line) for line in reader]
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        devices = get_attr('oracle')
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        # static mapping model
        model = StaticMapping.build_model(devices)
        # optimal mappings
        features = np.zeros(len(devices))
        ground_truth = np.array([1 if x == "GPU" else 0 for x in devices])

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(features, ground_truth)):
            # no training

            # accuracy
            src_tokens = torch.from_numpy(features[test_ids])
            predictions = model(src_tokens)
            gt = torch.from_numpy(ground_truth[test_ids])
            accuracy = (predictions == gt).tolist()
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [(runtime_cpus if pred == 0 else runtime_gpus)[idx]
                             for idx, pred in zip(test_ids, predictions)]
            speedup = gt_runtimes / pred_runtimes
            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                    zip(benchmarks[test_ids], ground_truth[test_ids], predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })

    performance = pd.DataFrame(data, index=range(1, len(data) + 1), columns=[
        "Model", "Platform", "Benchmark", "Benchmark Suite",
        "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup",
    ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
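
# --- Hedged note (not from the original sources) ---
# Each cli_main() above belongs to a separate evaluation script. The usual
# entry-point guard for such a script (assumed here; it is not part of the
# excerpt) would be:
if __name__ == '__main__':
    cli_main()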