def single_main(args, init_distributed=False):
    """Run single-process training: build task/model/criterion, restore the latest
    checkpoint, then train until max epoch/update is reached, the LR decays below
    ``min_lr``, or early stopping triggers.

    Args:
        args: nested config dict (sections: 'dataset', 'common',
            'distributed_training', 'checkpoint', 'optimization', 'model', 'task').
        init_distributed: when True, initialize distributed training and store the
            resulting rank back into ``args``.
    """
    # Batching must be bounded either by token count or by sentence count.
    assert args['dataset']['max_tokens'] is not None or args['dataset']['max_sentences'] is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # 0. Initialize CUDA and distributed training.
    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])
    set_seed.set_seed(args['common']['seed'])
    if init_distributed:
        args['distributed_training'][
            'distributed_rank'] = distributed_utils.distributed_init(args)

    # Verify checkpoint directory (master process only).
    if distributed_utils.is_master(args):
        save_dir = args['checkpoint']['save_dir']
        checkpoint_utils.verify_checkpoint_directory(save_dir)
        # NOTE: this removes any pre-existing *.pt checkpoints in save_dir,
        # i.e. training always starts from a clean checkpoint directory.
        PathManager.rm(os.path.join(
            save_dir, '*.pt'))  # this code will remove pre-trained models

    # 1. Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # 2. Load valid dataset (we load training data below, based on the latest checkpoint).
    task.load_dataset(args['dataset']['valid_subset'], combine=False, epoch=1)

    # 3. Build model and criterion.
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    LOGGER.info(model)
    LOGGER.info('model {}, criterion {}'.format(args['model']['arch'],
                                                criterion.__class__.__name__))
    LOGGER.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # 4. Build trainer.
    trainer = Trainer(args, task, model, criterion)
    LOGGER.info('training on {} GPUs'.format(
        args['distributed_training']['distributed_world_size']))
    LOGGER.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args['dataset']['max_tokens'],
            args['dataset']['max_sentences'],
        ))

    # 5. Load the latest checkpoint if one is available and restore the
    #    corresponding train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer, combine=False)

    # 6. Train until the learning rate gets too small.
    max_epoch = args['optimization']['max_epoch'] or math.inf
    max_update = args['optimization']['max_update'] or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args['dataset']['valid_subset'].split(',')
    while (lr > args['optimization']['min_lr'] and
           epoch_itr.next_epoch_idx <= max_epoch and
           trainer.get_num_updates() < max_update):
        # Train for one epoch.
        train(args, trainer, task, epoch_itr)
        # Validate only every `validate_interval` epochs (unless disabled).
        if not args['dataset']['disable_validation'] and epoch_itr.epoch % args[
            'dataset']['validate_interval'] == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]
        # Only use first validation loss to update the learning rate.
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])
        # Save checkpoint every `save_interval` epochs.
        if epoch_itr.epoch % args['checkpoint']['save_interval'] == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
        # Early stop when validation has not improved for `patience` runs.
        if should_stop_early(args, valid_losses[0]):
            LOGGER.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args['checkpoint']['patience']))
            break
        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            combine=False,  # TODO to be checked
            # Sharded data: reload the dataset when `data` lists multiple paths
            # (os.pathsep-separated); get train iterator for next epoch.
            load_dataset=(os.pathsep in args['task']['data']),
        )
    train_meter.stop()
    LOGGER.info('done training in {:.1f} seconds'.format(train_meter.sum))
def cli_main():
    """Train and evaluate the DeepTune heterogeneous-mapping model.

    For each platform in ``LANGUAGES``, loads the mmap'd mapping dataset and runs
    stratified 10-fold cross-validation: trains a fresh ``DeepTuneEncoder`` per
    fold, evaluates CPU-vs-GPU mapping accuracy and speedup over the static
    baseline (CPU on AMD, GPU on NVIDIA), and prints results aggregated per
    benchmark suite and per platform.
    """
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Honor CUDA_VISIBLE_DEVICES (fixed typo: was 'CUDA_VISIBALE_DEVICES',
        # so the lookup always fell through to the default). Take the first
        # listed device; split(',') handles both "2" and "2,3" and keeps
        # multi-digit ids intact (the old `[0]` indexing truncated "10" to "1").
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')
    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            # Load a pickled per-sample attribute array (e.g. runtimes) for this platform.
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset,
                                      truncation_length=MAX_SOURCE_POSITIONS,
                                      truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        # Auxiliary scalar features fed alongside the token sequence.
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')
        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # Labels for stratification; src_data is a placeholder (StratifiedKFold
        # splits on indices + labels only).
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation.
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # Fresh DeepTune model per fold.
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64,
                                    rnn_dropout=0., rnn_num_layers=2,
                                    aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

            model.train()
            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,  # batch by sentence count only
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate, )
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # Test accuracy. eval() + no_grad(): inference only — rnn_dropout is 0,
            # so eval() does not change outputs; no_grad() skips autograd bookkeeping.
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,  # batch by sentence count only
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate, )
            predictions, ground_truth = [], []
            model.eval()
            with torch.no_grad():
                for sample in test_dataloader:
                    if use_cuda:
                        sample = move_to_cuda(sample)
                    hybrid_out, _ = model(**sample['net_input'])
                    predictions.append(hybrid_out.max(dim=-1)[1])
                    ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)

            accuracy = (predictions == ground_truth).tolist()
            # Runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA).
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            # Runtime of the device each prediction chose (0 -> CPU, else GPU).
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # Record per-sample results for this fold.
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer

    performance = pd.DataFrame(
        data, index=range(1, len(data) + 1), columns=[
            "Model", "Platform", "Benchmark", "Benchmark Suite",
            "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup"
        ])
    # Select only numeric columns before mean(): including the string column
    # 'Platform' raises TypeError on modern pandas (older pandas silently
    # dropped it, so the printed output is unchanged).
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)