def main():
    """Train a single-stage GAN and periodically snapshot generated images."""
    args = get_args()
    # Hyper-parameter setup
    input_dim = args.dim            # size of the input noise vector
    image_size = args.size          # generated image size
    batch = args.batch              # samples per gradient update
    epochs = args.epoch             # passes over the dataset
    save_freq = args.save           # snapshot interval (in epochs)
    input_dirname = to_dirname(args.input)    # directory to read images from
    output_dirname = to_dirname(args.output)  # directory to write results to

    # Build the models
    G = build_generator(input_dim=input_dim, output_size=image_size)
    D = build_discriminator(input_size=image_size)

    # Compile the models
    D.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5, beta_1=0.1))
    GAN = build_GAN(G, D)
    # These particular values happen to work well -- honestly a bit of black magic.
    GAN.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-4, beta_1=0.5))

    # Persist the architectures
    save_model(G, 'G_model.json')
    save_model(D, 'D_model.json')

    # Load the dataset
    images = load_images(name=input_dirname, size=image_size)

    # Training loop
    for epoch in range(epochs):
        print('Epoch: ' + str(epoch + 1) + '/' + str(epochs))
        train(G, D, GAN, sets=images, batch=batch)
        if (epoch + 1) % save_freq == 0:
            # Take a snapshot at a fixed interval
            results = generate(G, batch=batch)
            save_images(results, name=output_dirname + str(epoch + 1))
            G.save_weights('G_weights.hdf5')
            D.save_weights('D_weights.hdf5')
def main():
    """Mammoth entry point: parse CLI args, optionally load tuned
    hyper-parameters, then dispatch to continual or GCL training."""
    lecun_fix()
    parser = ArgumentParser(description='mammoth', allow_abbrev=False)
    parser.add_argument('--model', type=str, required=True,
                        help='Model name.', choices=get_all_models())
    parser.add_argument('--load_best_args', action='store_true',
                        help='Loads the best arguments for each method, '
                             'dataset and memory buffer.')
    add_management_args(parser)
    args = parser.parse_known_args()[0]
    mod = importlib.import_module('models.' + args.model)

    if args.load_best_args:
        parser.add_argument('--dataset', type=str, required=True,
                            choices=DATASET_NAMES,
                            help='Which dataset to perform experiments on.')
        if hasattr(mod, 'Buffer'):
            parser.add_argument('--buffer_size', type=int, required=True,
                                help='The size of the memory buffer.')
        args = parser.parse_args()
        # 'joint' reuses the hyper-parameters tuned for plain SGD.
        lookup = 'sgd' if args.model == 'joint' else args.model
        best = best_args[args.dataset][lookup]
        # joint training on mnist-360 is handled by the GCL variant.
        if args.model == 'joint' and args.dataset == 'mnist-360':
            args.model = 'joint_gcl'
        # Buffer-based methods index the table by buffer size; others use -1.
        best = best[args.buffer_size] if hasattr(args, 'buffer_size') else best[-1]
        for key, value in best.items():
            setattr(args, key, value)
    else:
        parser = getattr(mod, 'get_parser')()
        args = parser.parse_args()

    if args.seed is not None:
        set_random_seed(args.seed)
    if args.model == 'mer':
        setattr(args, 'batch_size', 1)

    dataset = get_dataset(args)
    backbone = dataset.get_backbone()
    loss = dataset.get_loss()
    model = get_model(args, backbone, loss, dataset.get_transform())

    if isinstance(dataset, ContinualDataset):
        train(model, dataset, args)
    else:
        # General-continual-learning path; only 'joint_gcl' may define end_task.
        assert not hasattr(model, 'end_task') or model.NAME == 'joint_gcl'
        ctrain(args)
def main():
    """Train a two-stage GAN (base generator + upsampler) with periodic snapshots."""
    args = get_args()
    # Hyper-parameter setup
    input_dim = args.dim        # size of the input noise vector
    image_size2x = args.size    # stage-2 (full-resolution) image size
    image_size = (image_size2x[0] // 2, image_size2x[1] // 2)  # stage-1 size
    batch = args.batch          # samples per gradient update
    epochs = args.epoch         # passes over the dataset
    save_freq = args.save       # snapshot interval (in epochs)
    input_dirname = to_dirname(args.input)    # directory to read images from
    output_dirname = to_dirname(args.output)  # directory to write results to

    # Stage 1: noise -> low-resolution image
    G1 = build_generator(input_dim=input_dim, output_size=image_size)
    D1 = build_discriminator(input_size=image_size)
    D1.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5, beta_1=0.1))
    GAN1 = build_GAN(G1, D1)
    GAN1.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-4, beta_1=0.5))

    # Stage 2: low-resolution image -> upsampled image
    G2 = build_upsampler(input_size=image_size)
    D2 = build_discriminator(input_size=image_size2x)
    D2.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5, beta_1=0.1))
    GAN2 = build_GAN(G2, D2)
    GAN2.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-4, beta_1=0.5))

    # Persist the architectures
    save_model(G1, 'G1_model.json')
    save_model(D1, 'D1_model.json')
    save_model(G2, 'G2_model.json')
    save_model(D2, 'D2_model.json')

    # Load the dataset at both resolutions
    images = load_images(name=input_dirname, size=image_size)
    images2x = load_images(name=input_dirname, size=image_size2x)

    # Training loop
    for epoch in range(epochs):
        print('Epoch: ' + str(epoch + 1) + '/' + str(epochs) + ' - Stage: 1')
        train(G1, D1, GAN1, sets=images, batch=batch)
        print('Epoch: ' + str(epoch + 1) + '/' + str(epochs) + ' - Stage: 2')
        train_with_images(G1, G2, D2, GAN2, sets=images2x, batch=batch)
        if (epoch + 1) % save_freq == 0:
            # Take a snapshot at a fixed interval: stage-1 output feeds stage 2.
            noise = np.random.uniform(0, 1, (batch, input_dim))
            results1 = generate(G1, source=noise)
            save_images(results1, name=output_dirname + 'stage1/' + str(epoch + 1))
            results2 = generate(G2, source=results1 / 255)
            save_images(results2, name=output_dirname + 'stage2/' + str(epoch + 1))
            G1.save_weights('G1_weights.hdf5')
            D1.save_weights('D1_weights.hdf5')
            G2.save_weights('G2_weights.hdf5')
            D2.save_weights('D2_weights.hdf5')
def test_trainloop():
    """Smoke-test: a single training step over a tiny sequence dataset runs cleanly."""
    args = SimpleNamespace(wdw=0.01, training_steps=1, rs=0, optimizer='ADAM',
                           lr=0.01, half_life=1, device=torch.device('cpu'),
                           num_warmup_steps=0)
    shape = [256, 256]
    dataset = DatasetImpl(path=test_path / 'data/seq', shape=shape,
                          augmentation=False, collapse_length=1,
                          is_raw=True, max_seq_length=1)
    loader = torch.utils.data.DataLoader(dataset, collate_fn=collate_wrapper,
                                         batch_size=2, pin_memory=True,
                                         shuffle=False)
    model_args = SimpleNamespace(flownet_path=test_path.parent / 'EV_FlowNet',
                                 mish=False, sp=None, prefix_length=0,
                                 suffix_length=0, max_sequence_length=1,
                                 dynamic_sample_length=False,
                                 event_representation_depth=9)
    model = init_model(model_args, device=args.device)
    optimizer, scheduler = construct_train_tools(args, model)
    # Loss pyramid: four resolutions, coarsest first.
    resolutions = [tuple(side // 2 ** level for side in shape) for level in range(4)]
    evaluator = Losses(resolutions[::-1], 2, args.device)
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = torch.utils.tensorboard.SummaryWriter(log_dir=tmpdir)
        train(model=model, device=args.device, loader=loader,
              optimizer=optimizer, num_steps=args.training_steps,
              scheduler=scheduler, logger=logger, evaluator=evaluator,
              timers=FakeTimer())
        # Drop the writer (and give it a moment) before the temp dir is removed.
        del logger
        time.sleep(1)
def train(epoch):
    """Run one training epoch and report loss, accuracy and elapsed time.

    Relies on module-level ``model``, ``train_loader``, ``optimizer`` and
    ``criterion`` (defined elsewhere in this file -- TODO confirm).

    Args:
        epoch: 1-based epoch index, forwarded to ``train_utils.train``.

    Returns:
        tuple: ``(train_loss, train_accuracy)`` for this epoch.
    """
    since = time.time()
    trn_loss, trn_err = train_utils.train(model, train_loader, optimizer,
                                          criterion, epoch)
    # Fix: removed the commented-out duplicate print and unified the stale
    # %-style formatting with the str.format style used below (same output).
    print('Epoch: {:d}, Train - Loss: {:.4f}, Acc: {:.4f}'.format(
        epoch, trn_loss, 1 - trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                              time_elapsed % 60))
    return trn_loss, 1 - trn_err
def main():
    """Train the model from scratch 20 times and print the per-run test errors.

    Fix: the original used Python 2 ``print`` statements, which are a
    SyntaxError under Python 3; converted to ``print()`` calls.
    """
    error_list = []
    for _ in range(20):
        # Fresh data and a fresh model each run so results are independent.
        (X_train, y_train), (X_test, y_test), num_classes = get_mnist()
        model = create_model()
        test_error, history = train(model, (X_train, y_train),
                                    (X_test, y_test), epochs=10)
        error_list.append(test_error)
    print("20 runs over..")
    print(error_list)
def main(epochs, cpu, cudnn_flag, visdom_port, visdom_freq, temp_dir, seed,
         no_bias_decay, label_smoothing, temperature):
    """Train a metric-learning model, validate each epoch, keep the best
    checkpoint (by cosine Recall@1) and log results to sacred/visdom.

    Returns:
        float: best validation cosine Recall@1.

    Fix: ``print('Validation ...')),`` carried a stray trailing comma that
    turned the print and the following ``pprint`` into one throwaway tuple
    expression (a Python-2 leftover); now two plain statements.
    """
    device = torch.device(
        'cuda:0' if torch.cuda.is_available() and not cpu else 'cpu')
    callback = VisdomLogger(port=visdom_port) if visdom_port else None
    if cudnn_flag == 'deterministic':
        setattr(cudnn, cudnn_flag, True)

    # Seed before each stochastic stage so runs are reproducible.
    torch.manual_seed(seed)
    loaders, recall_ks = get_loaders()
    torch.manual_seed(seed)
    model = get_model(num_classes=loaders.num_classes)
    class_loss = SmoothCrossEntropy(epsilon=label_smoothing,
                                    temperature=temperature)
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Optionally exempt 1-D parameters (biases, norm scales) from weight decay.
    parameters = []
    if no_bias_decay:
        parameters.append(
            {'params': [par for par in model.parameters() if par.dim() != 1]})
        parameters.append({
            'params': [par for par in model.parameters() if par.dim() == 1],
            'weight_decay': 0
        })
    else:
        parameters.append({'params': model.parameters()})
    optimizer, scheduler = get_optimizer_scheduler(
        parameters=parameters, loader_length=len(loaders.train))

    # setup partial function to simplify call
    eval_function = partial(evaluate, model=model, recall=recall_ks,
                            query_loader=loaders.query,
                            gallery_loader=loaders.gallery)

    # setup best validation logger
    metrics = eval_function()
    if callback is not None:
        callback.scalars(
            ['l2', 'cosine'], 0,
            [metrics.recall['l2'][1], metrics.recall['cosine'][1]],
            title='Val Recall@1')
    pprint(metrics.recall)
    best_val = (0, metrics.recall, deepcopy(model.state_dict()))

    torch.manual_seed(seed)
    for epoch in range(epochs):
        if cudnn_flag == 'benchmark':
            setattr(cudnn, cudnn_flag, True)
        train(model=model, loader=loaders.train, class_loss=class_loss,
              optimizer=optimizer, scheduler=scheduler, epoch=epoch,
              callback=callback, freq=visdom_freq, ex=ex)

        # validation (benchmark mode off for deterministic evaluation)
        if cudnn_flag == 'benchmark':
            setattr(cudnn, cudnn_flag, False)
        metrics = eval_function()
        print('Validation [{:03d}]'.format(epoch))
        pprint(metrics.recall)

        ex.log_scalar('val.recall_l2@1', metrics.recall['l2'][1],
                      step=epoch + 1)
        ex.log_scalar('val.recall_cosine@1', metrics.recall['cosine'][1],
                      step=epoch + 1)
        if callback is not None:
            callback.scalars(
                ['l2', 'cosine'], epoch + 1,
                [metrics.recall['l2'][1], metrics.recall['cosine'][1]],
                title='Val Recall')

        # save model dict if the chosen validation metric is better
        if metrics.recall['cosine'][1] >= best_val[1]['cosine'][1]:
            best_val = (epoch + 1, metrics.recall,
                        deepcopy(model.state_dict()))

    # logging
    ex.info['recall'] = best_val[1]

    # saving
    save_name = os.path.join(
        temp_dir, '{}_{}.pt'.format(ex.current_run.config['model']['arch'],
                                    ex.current_run.config['dataset']['name']))
    torch.save(state_dict_to_cpu(best_val[2]), save_name)
    ex.add_artifact(save_name)
    if callback is not None:
        save_name = os.path.join(temp_dir, 'visdom_data.pt')
        callback.save(save_name)
        ex.add_artifact(save_name)

    return best_val[1]['cosine'][1]
def main(args):
    """Train/evaluate BERT or CharacterBERT on classification or sequence
    labelling.

    Fix: ``best_val_metric`` / ``best_val_epoch`` were only bound inside the
    ``args.do_train`` branch but written to the performance file in the
    ``args.do_predict`` branch, raising NameError for predict-only runs; they
    are now initialized up front. Also fixed the "lenght" typo in a log line.
    """
    # --------------------------------- DATA ---------------------------------
    # Tokenizer
    logging.disable(logging.INFO)
    try:
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', args.embedding),
            do_lower_case=args.do_lower_case)
    except OSError:
        # For CharacterBert models use BertTokenizer.basic_tokenizer for
        # tokenization and CharacterIndexer for indexing
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', 'bert-base-uncased'),
            do_lower_case=args.do_lower_case)
        tokenizer = tokenizer.basic_tokenizer
        characters_indexer = CharacterIndexer()
    logging.disable(logging.NOTSET)
    tokenization_function = tokenizer.tokenize

    # Pre-processing: basic tokenization (both), wordpieces (BERT only)
    data = {}
    for split in ['train', 'test']:
        if args.task == 'classification':
            func = load_classification_dataset
        elif args.task == 'sequence_labelling':
            func = load_sequence_labelling_dataset
        else:
            raise NotImplementedError
        data[split] = func(step=split, do_lower_case=args.do_lower_case)
        retokenize(data[split], tokenization_function)

    logging.info('Splitting training data into train / validation sets...')
    n_val = int(args.validation_ratio * len(data['train']))
    data['validation'] = data['train'][:n_val]
    data['train'] = data['train'][n_val:]
    logging.info('New number of training sequences: %d', len(data['train']))
    logging.info('New number of validation sequences: %d', len(data['validation']))

    # Count target labels or classes
    if args.task == 'classification':
        counter_all = Counter([
            example.label
            for example in data['train'] + data['validation'] + data['test']
        ])
        counter = Counter([example.label for example in data['train']])
        # Maximum sequence length is either 512 or max token length + 3
        max_seq_length = min(
            512,
            3 + max(map(len, [
                e.tokens_a if e.tokens_b is None else e.tokens_a + e.tokens_b
                for e in data['train'] + data['validation'] + data['test']
            ])))
    elif args.task == 'sequence_labelling':
        counter_all = Counter([
            label
            for example in data['train'] + data['validation'] + data['test']
            for label in example.label_sequence
        ])
        counter = Counter([
            label for example in data['train']
            for label in example.label_sequence
        ])
        # Maximum sequence length is either 512 or max token length + 5
        max_seq_length = min(
            512,
            5 + max(map(len, [
                e.token_sequence
                for e in data['train'] + data['validation'] + data['test']
            ])))
    else:
        raise NotImplementedError

    labels = sorted(counter_all.keys())
    num_labels = len(labels)
    logging.info("Goal: predict the following labels")
    for i, label in enumerate(labels):
        logging.info("* %s: %s (count: %s)", label, i, counter[label])

    # Input features: list[token indices] (BERT) or
    # list[list[character indices]] (CharacterBERT)
    pad_token_id = None
    if 'character' not in args.embedding:
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_label_id = None
    if args.task == 'sequence_labelling':
        pad_token_label_id = CrossEntropyLoss().ignore_index

    dataset = {}
    logging.info("Maximum sequence length: %s", max_seq_length)
    for split in data:
        dataset[split] = build_features(
            args,
            split=split,
            tokenizer=tokenizer
            if 'character' not in args.embedding else characters_indexer,
            examples=data[split],
            labels=labels,
            pad_token_id=pad_token_id,
            pad_token_label_id=pad_token_label_id,
            max_seq_length=max_seq_length)
    del data  # Not used anymore

    # --------------------------------- MODEL ---------------------------------
    if args.task == 'classification':
        model = BertForSequenceClassification
    elif args.task == 'sequence_labelling':
        model = BertForTokenClassification
    else:
        raise NotImplementedError

    logging.info('Loading `%s` model...', args.embedding)
    logging.disable(logging.INFO)
    config = BertConfig.from_pretrained(
        os.path.join('pretrained-models', args.embedding),
        num_labels=num_labels)
    if 'character' not in args.embedding:
        model = model.from_pretrained(
            os.path.join('pretrained-models', args.embedding), config=config)
    else:
        # CharacterBERT: build the head from config, then swap in the encoder.
        model = model(config=config)
        model.bert = CharacterBertModel.from_pretrained(
            os.path.join('pretrained-models', args.embedding), config=config)
    logging.disable(logging.NOTSET)

    model.to(args.device)
    logging.info('Model:\n%s', model)

    # ------------------------------ TRAIN / EVAL ------------------------------
    logging.info('Using the following arguments for training:')
    for k, v in vars(args).items():
        logging.info("* %s: %s", k, v)

    # Defined up front so predict-only runs do not hit a NameError below.
    best_val_metric, best_val_epoch = None, None

    if args.do_train:
        global_step, train_loss, best_val_metric, best_val_epoch = train(
            args=args,
            dataset=dataset,
            model=model,
            tokenizer=tokenizer,
            labels=labels,
            pad_token_label_id=pad_token_label_id)
        logging.info("global_step = %s, average training loss = %s",
                     global_step, train_loss)
        logging.info("Best performance: Epoch=%d, Value=%s",
                     best_val_epoch, best_val_metric)

    if args.do_predict:
        # Load best model
        if args.task == 'classification':
            model = BertForSequenceClassification
        elif args.task == 'sequence_labelling':
            model = BertForTokenClassification
        else:
            raise NotImplementedError

        logging.disable(logging.INFO)
        if 'character' not in args.embedding:
            model = model.from_pretrained(args.output_dir)
        else:
            state_dict = torch.load(
                os.path.join(args.output_dir, 'pytorch_model.bin'),
                map_location='cpu')
            model = model(config=config)
            model.bert = CharacterBertModel(config=config)
            model.load_state_dict(state_dict, strict=True)
        logging.disable(logging.NOTSET)
        model.to(args.device)

        # Compute predictions and metrics
        results, _ = evaluate(args=args,
                              eval_dataset=dataset["test"],
                              model=model,
                              labels=labels,
                              pad_token_label_id=pad_token_label_id)

        # Save metrics
        with open(os.path.join(args.output_dir,
                               'performance_on_test_set.txt'), 'w') as f:
            f.write(f'best validation score: {best_val_metric}\n')
            f.write(f'best validation epoch: {best_val_epoch}\n')
            f.write('--- Performance on test set ---\n')
            for k, v in results.items():
                f.write(f'{k}: {v}\n')
def main():
    """Generate a Universal Adversarial Perturbation (UAP) against a frozen,
    pre-trained target network, then evaluate and checkpoint the generator.
    """
    args = parse_arguments()
    # Seed every RNG that matters for reproducibility.
    random.seed(args.pretrained_seed)
    torch.manual_seed(args.pretrained_seed)
    if args.use_cuda:
        torch.cuda.manual_seed_all(args.pretrained_seed)
        cudnn.benchmark = True
    # get the result path to store the results
    result_path = get_result_path(dataset_name=args.dataset,
                                  network_arch=args.pretrained_arch,
                                  random_seed=args.pretrained_seed,
                                  result_subfolder=args.result_subfolder,
                                  postfix=args.postfix)
    # Init logger: everything below is mirrored to result_path/log.txt.
    log_file_name = os.path.join(result_path, 'log.txt')
    print("Log file: {}".format(log_file_name))
    log = open(log_file_name, 'w')
    print_log('save path : {}'.format(result_path), log)
    # Dump every CLI argument plus environment versions for the record.
    state = {k: v for k, v in args._get_kwargs()}
    for key, value in state.items():
        print_log("{} : {}".format(key, value), log)
    print_log("Random Seed: {}".format(args.pretrained_seed), log)
    print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("Torch version : {}".format(torch.__version__), log)
    print_log("Cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Test loader comes from the dataset the target network was trained on.
    _, pretrained_data_test = get_data(args.pretrained_dataset,
                                       args.pretrained_dataset)
    pretrained_data_test_loader = torch.utils.data.DataLoader(
        pretrained_data_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    ##### Dataloader for training ####
    num_classes, (mean, std), input_size, num_channels = get_data_specs(
        args.pretrained_dataset)
    data_train, _ = get_data(args.dataset, args.pretrained_dataset)
    data_train_loader = torch.utils.data.DataLoader(data_train,
                                                    batch_size=args.batch_size,
                                                    shuffle=True,
                                                    num_workers=args.workers,
                                                    pin_memory=True)

    ####################################
    # Init model, criterion, and optimizer
    print_log("=> Creating model '{}'".format(args.pretrained_arch), log)
    # get a path for loading the model to be attacked
    model_path = get_model_path(dataset_name=args.pretrained_dataset,
                                network_arch=args.pretrained_arch,
                                random_seed=args.pretrained_seed)
    model_weights_path = os.path.join(model_path, "checkpoint.pth.tar")
    target_network = get_network(args.pretrained_arch,
                                 input_size=input_size,
                                 num_classes=num_classes,
                                 finetune=False)
    print_log("=> Network :\n {}".format(target_network), log)
    target_network = torch.nn.DataParallel(
        target_network, device_ids=list(range(args.ngpu)))
    # Set the target model into evaluation mode
    target_network.eval()
    # Imagenet models use the pretrained pytorch weights; anything else loads
    # the locally-trained checkpoint.
    if args.pretrained_dataset != "imagenet":
        network_data = torch.load(model_weights_path)
        target_network.load_state_dict(network_data['state_dict'])
    # Set all weights to not trainable — only the UAP generator learns.
    set_parameter_requires_grad(target_network, requires_grad=False)

    non_trainale_params = get_num_non_trainable_parameters(target_network)
    trainale_params = get_num_trainable_parameters(target_network)
    total_params = get_num_parameters(target_network)
    print_log("Target Network Trainable parameters: {}".format(trainale_params), log)
    print_log("Target Network Non Trainable parameters: {}".format(non_trainale_params), log)
    print_log("Target Network Total # parameters: {}".format(total_params), log)

    print_log("=> Inserting Generator", log)
    generator = UAP(shape=(input_size, input_size),
                    num_channels=num_channels,
                    mean=mean,
                    std=std,
                    use_cuda=args.use_cuda)
    print_log("=> Generator :\n {}".format(generator), log)
    non_trainale_params = get_num_non_trainable_parameters(generator)
    trainale_params = get_num_trainable_parameters(generator)
    total_params = get_num_parameters(generator)
    print_log("Generator Trainable parameters: {}".format(trainale_params), log)
    print_log("Generator Non Trainable parameters: {}".format(non_trainale_params), log)
    print_log("Generator Total # parameters: {}".format(total_params), log)

    # Compose generator -> target model so a forward pass classifies the
    # perturbed input.
    perturbed_net = nn.Sequential(OrderedDict([('generator', generator),
                                               ('target_model', target_network)]))
    perturbed_net = torch.nn.DataParallel(perturbed_net,
                                          device_ids=list(range(args.ngpu)))
    non_trainale_params = get_num_non_trainable_parameters(perturbed_net)
    trainale_params = get_num_trainable_parameters(perturbed_net)
    total_params = get_num_parameters(perturbed_net)
    print_log("Perturbed Net Trainable parameters: {}".format(trainale_params), log)
    print_log("Perturbed Net Non Trainable parameters: {}".format(non_trainale_params), log)
    print_log("Perturbed Net Total # parameters: {}".format(total_params), log)

    # Set the target model into evaluation mode; only the generator trains.
    perturbed_net.module.target_model.eval()
    perturbed_net.module.generator.train()

    # Select the attack objective.
    if args.loss_function == "ce":
        criterion = torch.nn.CrossEntropyLoss()
    elif args.loss_function == "neg_ce":
        criterion = NegativeCrossEntropy()
    elif args.loss_function == "logit":
        criterion = LogitLoss(num_classes=num_classes, use_cuda=args.use_cuda)
    elif args.loss_function == "bounded_logit":
        criterion = BoundedLogitLoss(num_classes=num_classes,
                                     confidence=args.confidence,
                                     use_cuda=args.use_cuda)
    elif args.loss_function == "bounded_logit_fixed_ref":
        criterion = BoundedLogitLossFixedRef(num_classes=num_classes,
                                             confidence=args.confidence,
                                             use_cuda=args.use_cuda)
    elif args.loss_function == "bounded_logit_neg":
        criterion = BoundedLogitLoss_neg(num_classes=num_classes,
                                         confidence=args.confidence,
                                         use_cuda=args.use_cuda)
    else:
        raise ValueError

    if args.use_cuda:
        target_network.cuda()
        generator.cuda()
        perturbed_net.cuda()
        criterion.cuda()

    optimizer = torch.optim.Adam(perturbed_net.parameters(),
                                 lr=state['learning_rate'])

    # Measure the time needed for the UAP generation
    start = time.time()
    train(data_loader=data_train_loader,
          model=perturbed_net,
          criterion=criterion,
          optimizer=optimizer,
          epsilon=args.epsilon,
          num_iterations=args.num_iterations,
          targeted=args.targeted,
          target_class=args.target_class,
          log=log,
          print_freq=args.print_freq,
          use_cuda=args.use_cuda)
    end = time.time()
    print_log("Time needed for UAP generation: {}".format(end - start), log)

    # evaluate
    print_log("Final evaluation:", log)
    metrics_evaluate(data_loader=pretrained_data_test_loader,
                     target_model=target_network,
                     perturbed_model=perturbed_net,
                     targeted=args.targeted,
                     target_class=args.target_class,
                     log=log,
                     use_cuda=args.use_cuda)

    # Only the generator's weights are checkpointed, not the frozen target.
    save_checkpoint({
        'arch': args.pretrained_arch,
        # 'state_dict' : perturbed_net.state_dict(),
        'state_dict': perturbed_net.module.generator.state_dict(),
        'optimizer': optimizer.state_dict(),
        'args': copy.deepcopy(args),
    }, result_path, 'checkpoint.pth.tar')
    log.close()
def main():
    """Entry point for (re)starting a training run: restores the latest
    checkpoint when available, then trains for ``args.training_steps`` steps
    with periodic validation/serialization hooks.
    """
    # torch.autograd.set_detect_anomaly(True)
    args = parse_args(sys.argv[1:])
    device = torch.device(args.device)
    if device.type == 'cuda':
        torch.cuda.set_device(device)
    if args.timers:
        timers = SynchronizedWallClockTimer()
    else:
        timers = FakeTimer()
    model = init_model(args, device)
    serializer = Serializer(args.model, args.num_checkpoints,
                            args.permanent_interval)
    # Force a fresh start when no checkpoints exist yet.
    args.do_not_continue = (args.do_not_continue
                            or len(serializer.list_known_steps()) == 0)
    last_step = (0 if args.do_not_continue
                 else serializer.list_known_steps()[-1])
    # Optimizer/scheduler are rebuilt as if `last_step` steps already passed.
    optimizer, scheduler = construct_train_tools(args, model,
                                                 passed_steps=last_step)
    losses = init_losses(args.shape, args.bs, model, device,
                         sequence_length=args.prefix_length
                         + args.suffix_length + 1,
                         timers=timers)
    # allow only manual flush
    logger = SummaryWriter(str(args.log_path), max_queue=100000000,
                           flush_secs=100000000)
    periodic_hooks, hooks = create_hooks(args, model, optimizer, losses,
                                         logger, serializer)
    if not args.do_not_continue:
        # Resume: restore weights/optimizer and the samples-passed counter
        # (falls back to step * batch-size for old checkpoints).
        global_step, state = serializer.load_checkpoint(model, last_step,
                                                        optimizer=optimizer,
                                                        device=device)
        samples_passed = state.pop('samples_passed', global_step * args.bs)
    else:
        global_step = 0
        samples_passed = 0
        # NOTE(review): initial snapshot is taken only for fresh runs here —
        # confirm this placement against the original formatting.
        hooks['serialization'](global_step, samples_passed)
    # sample_idx lets the loader skip data already consumed before resuming.
    loader = get_dataloader(get_trainset_params(args),
                            sample_idx=samples_passed,
                            process_only_once=False)
    if not args.skip_validation:
        hooks['validation'](global_step, samples_passed)
    with Profiler(args.profiling, args.model/'profiling'), \
            GPUMonitor(args.log_path):
        train(model, device, loader, optimizer, args.training_steps,
              scheduler=scheduler, evaluator=losses, logger=logger,
              weights=args.loss_weights, is_raw=args.is_raw,
              accumulation_steps=args.accum_step, timers=timers,
              hooks=periodic_hooks, init_step=global_step,
              init_samples_passed=samples_passed,
              max_events_per_batch=args.max_events_per_batch)
    # Final snapshot + validation at the very last step.
    samples = samples_passed + (args.training_steps - global_step) * args.bs
    hooks['serialization'](args.training_steps, samples)
    if not args.skip_validation:
        hooks['validation'](args.training_steps, samples)
def main():
    """Mammoth entry point: parse args, optionally load tuned
    hyper-parameters, and launch continual or offline joint training."""
    # Give the process more CPU threads when several GPUs are available.
    n_gpus = torch.cuda.device_count()
    torch.set_num_threads(6 * n_gpus if n_gpus > 1 else 2)

    parser = ArgumentParser(description='mammoth', allow_abbrev=False)
    parser.add_argument('--model', type=str, required=True,
                        help='Model name.', choices=get_all_models())
    parser.add_argument('--load_best_args', action='store_true',
                        help='Loads the best arguments for each method, '
                             'dataset and memory buffer.')
    add_management_args(parser)
    args = parser.parse_known_args()[0]
    mod = importlib.import_module('models.' + args.model)

    if args.load_best_args:
        parser.add_argument('--dataset', type=str, required=True,
                            choices=DATASET_NAMES,
                            help='Which dataset to perform experiments on.')
        if hasattr(mod, 'Buffer'):
            parser.add_argument('--buffer_size', type=int, required=True,
                                help='The size of the memory buffer.')
        args = parser.parse_args()
        # 'joint' reuses the hyper-parameters tuned for plain SGD.
        lookup = 'sgd' if args.model == 'joint' else args.model
        best = best_args[args.dataset][lookup]
        # Buffer-based methods index the table by buffer size; others use -1.
        best = best[args.buffer_size] if hasattr(args, 'buffer_size') else best[-1]
        for key, value in best.items():
            setattr(args, key, value)
    else:
        parser = getattr(mod, 'get_parser')()
        args = parser.parse_args()

    if args.seed is not None:
        set_random_seed(args.seed)

    # 'joint' on seq-core50 goes through a dedicated offline routine.
    off_joint = args.model == 'joint' and args.dataset == 'seq-core50'
    if off_joint:
        args.dataset = 'seq-core50j'
        args.model = 'sgd'

    dataset = get_dataset(args)  # continual learning
    backbone = dataset.get_backbone()
    loss = dataset.get_loss()
    model = get_model(args, backbone, loss, dataset.get_transform())

    if off_joint:
        print('BEGIN JOINT TRAINING')
        jtrain(model, dataset, args)
    else:
        print('BEGIN CONTINUAL TRAINING')
        train(model, dataset, args)
def main():
    """Run an 'ocilfast' experiment with hard-coded per-dataset
    hyper-parameters (no CLI flags beyond the defaults)."""
    parser = ArgumentParser(description='mammoth', allow_abbrev=False)
    args = parser.parse_known_args()[0]

    args.model = 'ocilfast'
    args.seed = None
    args.validation = True
    args.img_dir = 'img/test'  # directory where result images are stored
    args.print_file = open('../' + args.img_dir + '/result.txt', mode='w')
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    """
    # seq-tinyimagenet
    args.dataset = 'seq-tinyimg'
    args.lr = 2e-3
    args.batch_size = 32
    args.buffer_size = 0
    args.minibatch_size = 32
    args.n_epochs = 100
    args.nu = 0.7
    args.eta = 0.04
    args.eps = 1
    args.embedding_dim = 250
    args.weight_decay = 1e-2
    args.margin = 1
    args.r = 0.01
    args.nf = 32
    """

    # seq-cifar10
    args.dataset = 'seq-cifar10'
    args.lr = 1e-3
    args.batch_size = 32
    args.buffer_size = 0
    args.minibatch_size = 32
    args.n_epochs = 50
    args.nu = 0.7
    args.eta = 0.8
    args.eps = 1
    args.embedding_dim = 250
    args.weight_decay = 1e-2
    args.margin = 1
    args.r = 0.1
    args.nf = 32

    # seq-mnist
    # args.dataset = 'seq-mnist'
    # args.buffer_size = 0
    # args.lr = 1e-3
    # args.batch_size = 128
    # args.minibatch_size = 128
    # args.n_epochs = 10
    #
    # args.nu = 0.8
    # args.eta = 1
    # args.eps = 0.1
    # args.embedding_dim = 150
    # args.weight_decay = 0
    # args.margin = 5
    # args.r = 0.1  # radius
    # args.nf = 32

    if args.seed is not None:
        set_random_seed(args.seed)
    train(args)
checkpoint = torch.load(args.resume) start_epoch = checkpoint["epoch"] model.load_state_dict(checkpoint["state_dict"]) optimizer.load_state_dict(checkpoint["optimizer"]) del checkpoint for epoch in range(start_epoch, args.epochs + 1): since = time.time() ### Train ### if epoch == args.ft_start: print("Now replacing data loader with fine-tuned data loader.") train_loader = loaders["fine_tune"] trn_loss, trn_err = train_utils.train(model, train_loader, optimizer, criterion, epoch=epoch, writer=writer) writer.add_scalar("train/loss", trn_loss, epoch) writer.add_scalar("train/error", trn_err, epoch) writer.add_scalar("params/mu0", model.aug.Mu[0], epoch) writer.add_scalar("params/mu2", model.aug.Mu[1], epoch) writer.add_scalar("params/mu3", model.aug.Mu[2], epoch) Sigma = model.aug.Sigma cov = (Sigma @ Sigma.T).detach().cpu() writer.add_scalar("params/cov00", cov[0, 0], epoch) writer.add_scalar("params/cov11", cov[1, 1], epoch) writer.add_scalar("params/cov22", cov[2, 2], epoch) print( "Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}".format( epoch, trn_loss, 1 - trn_err )
import sys

from models.squeezenet_model import create_squeezenet_model
from models.own_model import create_own_model
from utils.training import train

# Select the architecture: '-s' picks SqueezeNet, anything else the custom net.
if len(sys.argv) > 1 and sys.argv[1] == '-s':
    print("Using Squeezenet model")
    build_model = create_squeezenet_model
else:
    print("Using own model")
    build_model = create_own_model

model = train(build_model())
# init CNN if args.pretrain: model = init_pretrained_model(args) elif args.model_name == 'customized': model = init_model_scratch(args) elif args.model_name == 'cifar10_wider': model = init_cifar10_wider(args) elif args.model_name == 'cifar10_deeper': model = init_cifar10_deeper(args) elif args.model_name == 'cifar2_deeper': model = init_cifar2_new(args) else: raise ValueError('can not find matched model, try the following: customized/vgg16/vgg19/inception/xception/resnet50') model.summary() # training pretrain_model, tensorboard_dir, checkpoint_path, csv_path, time_path = train(model, X_train, y_train, args) if args.show_plot: plot_training(args, csv_path) ### evaluate on test data print('--------------------') print('predicting test dateset') X_test, y_test = data_pipeline(args.test_dir, preprocess_input) y_test = to_categorical(y_test) saved_model = models.load_model(checkpoint_path) score = model.evaluate(X_test, y_test) print(list(zip(model.metrics_names, score)))
def main():
    """Parse CLI arguments, train the ShallowCNN with early stopping, and
    write a submission CSV from the trained model.
    """
    parser = argparse.ArgumentParser(description='Shallow-CNN Training')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default:64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default:1000)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train for (default:100)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='lr',
                        help='learning rate for optimizer (default:0.001)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--early-stopping', type=int, default=10, metavar='N',
                        help='Patience for early stopping (default:10)')
    parser.add_argument(
        '--data-dir', type=str, default='../data', metavar='path/to/dir',
        help='path to directory containing data (default:../data)')
    parser.add_argument(
        '--train-size', type=float, default=0.85, metavar='pct',
        help='fraction of dataset to use for training (default:0.85)')
    parser.add_argument(
        '--test-size', type=float, default=0.15, metavar='pct',
        help='fraction of dataset to use for testing (default:0.15)')
    parser.add_argument(
        '--dropout-rate', type=float, default=0.5, metavar='pct',
        help='dropout rate after convolution layers (default:0.5)')
    parser.add_argument('--conv1-width', type=int, default=10, metavar='w',
                        help='Width of 1st convolution kernel (default:10)')
    parser.add_argument(
        '--n_channels', type=int, default=30, metavar='N',
        help='Number of channels ouput by convolution layers (default:30)')
    parser.add_argument(
        '--max-pool-kernel-size', type=int, default=25, metavar='w',
        help='Width of max-pool kernel after convolution (default:25)')
    parser.add_argument('--max-pool-stride', type=int, default=5, metavar='N',
                        help='stride along 2nd axis for max-pool (default:5)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--checkpoint', type=str, default='checkpoint.pt', metavar='path/to/file',
        help='file to save checkpoints (default:checkpoint.pt)')
    # TODO add arg to save everything to specific folder

    # Time id used for saving files (log + submission share this prefix)
    time_id = int(time())

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    torch.manual_seed(SEED)

    # Load the datsets
    print('loading datasets')
    train_set = RobotNavDataset(args.data_dir)
    submission_set = SubmissionDataset(args.data_dir)

    # BUG FIX: the split previously hard-coded 0.8/0.2 — silently ignoring
    # --train-size/--test-size — and applied floor() to both sizes, so they
    # could sum to less than len(train_set), which
    # torch.utils.data.random_split rejects. Honor --train-size and give the
    # remainder to the test split so the lengths always sum exactly.
    train_size = floor(args.train_size * len(train_set))
    test_size = len(train_set) - train_size
    train_subset, test_subset = data.random_split(train_set,
                                                  (train_size, test_size))
    train_loader = torch.utils.data.DataLoader(train_subset,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    # Don't think we actually need shuffle here...
    test_loader = torch.utils.data.DataLoader(test_subset,
                                              batch_size=args.test_batch_size)

    # Initialize objects
    print('creating model')
    model = ShallowCNN(n_channels=args.n_channels,
                       conv1_width=args.conv1_width,
                       max_pool_kernel_size=args.max_pool_kernel_size,
                       max_pool_stride=args.max_pool_stride,
                       dropout_rate=args.dropout_rate)
    model.double()  # TODO: look into if this is actually needed...
    early_stopper = EarlyStopping(patience=args.early_stopping,
                                  check_file=args.checkpoint)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    logfile = '{}.log'.format(time_id)
    logger = setup_logger(logfile=logfile, console_out=True)
    loss_func = F.nll_loss

    # Train the model; the early stopper both tracks patience and saves
    # checkpoints of the best model.
    print('training model')
    for epoch in range(1, args.epochs + 1):
        train(model, train_loader, optimizer, loss_func, epoch,
              log_interval=args.log_interval, log_func=logger.info)
        test_loss = test(model, test_loader, loss_func, log_func=logger.info)
        # Early stopper will handle saving the checkpoints
        if early_stopper(test_loss, model):
            break

    print('creating submission')
    make_submission(model, submission_set.data,
                    'submission-{}.csv'.format(time_id))
# Training loop for the DBSN model: periodically dumps the learned
# alpha/beta parameters, trains one epoch, and evaluates at eval_freq.
for epoch in range(start_epoch, args.epochs + 1):
    # BUG FIX: was `epoch % 50 is 1` — `is` compares object identity, which
    # is implementation-defined for ints and a SyntaxWarning on Python 3.8+.
    if epoch % 50 == 1 and dbsn:
        # Dump current mixture weights (softmaxed alphas) and exp(betas).
        print(
            torch.cat([F.softmax(alphas, 1), betas.exp()], 1).data.cpu().numpy())
    since = time.time()

    ### Train ###
    # One epoch past ft_start, switch to the fine-tuning data loader.
    if epoch == args.ft_start + 1:
        logging.info('Now replacing data loader with fine-tuned data loader.')
        train_loader = loaders['fine_tune']
    trn_loss, trn_err = train_utils.train(model, train_loader, optimizer,
                                          criterion, biOptimizer, epoch,
                                          ngpus, dbsn)
    logging.info('Epoch {:d} Train - Loss: {:.4f}, Acc: {:.4f}'.format(
        epoch, trn_loss, 1 - trn_err))
    # time_elapsed = time.time() - since
    # logging.info('Train Time {:.0f}m {:.0f}s'.format(
    #     time_elapsed // 60, time_elapsed % 60))

    # BUG FIX: was `epoch % args.eval_freq is 0` — same identity-comparison
    # problem as above; use equality.
    if epoch % args.eval_freq == 0:
        ### Test ###
        val_loss, val_err, val_iou = train_utils.test(model, loaders['val'],
                                                      criterion, alphas, betas,
                                                      biOptimizer, ngpus, dbsn)
        logging.info('Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
            val_loss, 1 - val_err, val_iou))
def experiment_vae(args, train_loader, val_loader, test_loader, model, optimizer, dir, model_name='vae'):
    """Run one VAE experiment: train with loss-based early stopping, reload
    the best checkpoint, evaluate it on the test set, and persist every
    metric history to disk.

    Note: `dir` shadows the builtin but is kept for interface compatibility;
    `model_name` is unused here — file names come from args.model_name.
    """
    from utils.training import train_vae as train
    from utils.evaluation import evaluate_vae as evaluate

    # SAVING: persist the experiment configuration up front.
    torch.save(args, dir + args.model_name + '.config')

    # best_model = model
    best_loss = 100000.  # sentinel "infinity" for the best validation loss
    e = 0                # epochs since the last validation improvement
    train_loss_history = []
    train_re_history = []   # reconstruction-error history
    train_kl_history = []   # KL-divergence history
    val_loss_history = []
    val_re_history = []
    val_kl_history = []
    time_history = []

    for epoch in range(1, args.epochs + 1):
        time_start = time.time()
        model, train_loss_epoch, train_re_epoch, train_kl_epoch = train(epoch, args, train_loader, model,
                                                                        optimizer)
        val_loss_epoch, val_re_epoch, val_kl_epoch = evaluate(args, model, train_loader, val_loader, epoch,
                                                              dir, mode='validation')
        time_end = time.time()
        time_elapsed = time_end - time_start

        # appending history
        train_loss_history.append(train_loss_epoch), train_re_history.append(train_re_epoch), train_kl_history.append(
            train_kl_epoch)
        val_loss_history.append(val_loss_epoch), val_re_history.append(val_re_epoch), val_kl_history.append(
            val_kl_epoch)
        time_history.append(time_elapsed)

        # printing results
        print('Epoch: {}/{}, Time elapsed: {:.2f}s\n'
              '* Train loss: {:.2f} (RE: {:.2f}, KL: {:.2f})\n'
              'o Val. loss: {:.2f} (RE: {:.2f}, KL: {:.2f})\n'
              '--> Early stopping: {}/{} (BEST: {:.2f})\n'.format(
            epoch, args.epochs, time_elapsed,
            train_loss_epoch, train_re_epoch, train_kl_epoch,
            val_loss_epoch, val_re_epoch, val_kl_epoch,
            e, args.early_stopping_epochs, best_loss
        ))

        # early-stopping: checkpoint whenever validation loss improves;
        # otherwise grow the patience counter.
        if val_loss_epoch < best_loss:
            e = 0
            best_loss = val_loss_epoch
            # best_model = model
            print('->model saved<-')
            torch.save(model, dir + args.model_name + '.model')
        else:
            e += 1
            # During warmup the patience counter is reset, so early stopping
            # cannot trigger before args.warmup epochs.
            if epoch < args.warmup:
                e = 0
            if e > args.early_stopping_epochs:
                break

        # NaN: abort training if the validation loss diverged.
        if math.isnan(val_loss_epoch):
            break

    # FINAL EVALUATION: reload the best checkpoint saved above and score it
    # on the test set (epoch=9999 marks the final-evaluation pass).
    best_model = torch.load(dir + args.model_name + '.model')
    test_loss, test_re, test_kl, test_log_likelihood, train_log_likelihood, test_elbo, train_elbo = evaluate(args,
                                                                                                             best_model,
                                                                                                             train_loader,
                                                                                                             test_loader,
                                                                                                             9999, dir,
                                                                                                             mode='test')

    print('FINAL EVALUATION ON TEST SET\n'
          'LogL (TEST): {:.2f}\n'
          'LogL (TRAIN): {:.2f}\n'
          'ELBO (TEST): {:.2f}\n'
          'ELBO (TRAIN): {:.2f}\n'
          'Loss: {:.2f}\n'
          'RE: {:.2f}\n'
          'KL: {:.2f}'.format(
        test_log_likelihood,
        train_log_likelihood,
        test_elbo,
        train_elbo,
        test_loss,
        test_re,
        test_kl
    ))

    # NOTE(review): this log file is opened relative to the CWD, unlike the
    # other artifacts which go under `dir` — confirm that is intentional.
    with open('vae_experiment_log.txt', 'a') as f:
        print('FINAL EVALUATION ON TEST SET\n'
              'LogL (TEST): {:.2f}\n'
              'LogL (TRAIN): {:.2f}\n'
              'ELBO (TEST): {:.2f}\n'
              'ELBO (TRAIN): {:.2f}\n'
              'Loss: {:.2f}\n'
              'RE: {:.2f}\n'
              'KL: {:.2f}'.format(
            test_log_likelihood,
            train_log_likelihood,
            test_elbo,
            train_elbo,
            test_loss,
            test_re,
            test_kl
        ), file=f)

    # SAVING: persist all per-epoch histories and final test metrics.
    torch.save(train_loss_history, dir + args.model_name + '.train_loss')
    torch.save(train_re_history, dir + args.model_name + '.train_re')
    torch.save(train_kl_history, dir + args.model_name + '.train_kl')
    torch.save(val_loss_history, dir + args.model_name + '.val_loss')
    torch.save(val_re_history, dir + args.model_name + '.val_re')
    torch.save(val_kl_history, dir + args.model_name + '.val_kl')
    torch.save(test_log_likelihood, dir + args.model_name + '.test_log_likelihood')
    torch.save(test_loss, dir + args.model_name + '.test_loss')
    torch.save(test_re, dir + args.model_name + '.test_re')
    torch.save(test_kl, dir + args.model_name + '.test_kl')
def experiment_vae(args, train_loader, val_loader, test_loader, model, optimizer, dir, model_name='vae'):
    """Run a VAE experiment that additionally tracks Fisher information (FI),
    mutual information (MI) and PSNR, with loss-based early stopping, an
    optional Fisher-information-ratio analysis of the hidden layers, and
    persistence of every metric history.

    Note: `dir` shadows the builtin but is kept for interface compatibility;
    `model_name` is unused here — file names come from args.model_name.
    """
    from utils.training import train_vae as train
    from utils.evaluation import evaluate_vae as evaluate

    # SAVING: persist the experiment configuration up front.
    torch.save(args, dir + args.model_name + '.config')

    # best_model = model
    best_loss = 100000.  # sentinel "infinity" for the best validation loss
    e = 0                # epochs since the last validation improvement
    train_loss_history = []
    train_re_history = []
    train_kl_history = []
    train_fi_history = []
    train_mi_history = []
    train_psnr_history = []
    val_loss_history = []
    val_re_history = []
    val_kl_history = []
    val_fi_history = []
    val_mi_history = []
    val_psnr_history = []
    time_history = []

    # Decay the learning rate by 10x every 10000 scheduler steps (one per epoch).
    scheduler = StepLR(optimizer, step_size=10000, gamma=0.1)

    for epoch in range(1, args.epochs + 1):
        scheduler.step()
        time_start = time.time()
        model, train_loss_epoch, train_re_epoch, train_kl_epoch, train_fi_epoch, train_mi_epoch, train_psnr = train(
            epoch, args, train_loader, model, optimizer)
        val_loss_epoch, val_re_epoch, val_kl_epoch, val_fi_epoch, val_mi_epoch, val_psnr = evaluate(
            args, model, train_loader, val_loader, epoch, dir, mode='validation')
        time_end = time.time()
        time_elapsed = time_end - time_start

        # appending history
        train_loss_history.append(train_loss_epoch), train_re_history.append(train_re_epoch), train_kl_history.append(
            train_kl_epoch), train_fi_history.append(train_fi_epoch), train_mi_history.append(
            train_mi_epoch), train_psnr_history.append(train_psnr)
        val_loss_history.append(val_loss_epoch), val_re_history.append(val_re_epoch), val_kl_history.append(
            val_kl_epoch), val_fi_history.append(val_fi_epoch), val_mi_history.append(
            val_mi_epoch), val_psnr_history.append(val_psnr)
        time_history.append(time_elapsed)

        # printing results
        print('Epoch: {}/{}, Time elapsed: {:.2f}s\n'
              '* Train loss: {:.2f} (RE: {:.2f}, KL: {:.2f}, FI: {:.2f}, MI: {:.2f}, PSNR: {:.2f})\n'
              'o Val. loss: {:.2f} (RE: {:.2f}, KL: {:.2f}, FI: {:.2f}, MI: {:.2f}, PSNR: {:.2f})\n'
              '--> Early stopping: {}/{} (BEST: {:.2f})\n'.format(
            epoch, args.epochs, time_elapsed,
            train_loss_epoch, train_re_epoch, train_kl_epoch, train_fi_epoch, train_mi_epoch, train_psnr,
            val_loss_epoch, val_re_epoch, val_kl_epoch, val_fi_epoch, val_mi_epoch, val_psnr,
            e, args.early_stopping_epochs, best_loss
        ))

        # early-stopping: checkpoint on validation-loss improvement.
        if val_loss_epoch < best_loss:
            e = 0
            best_loss = val_loss_epoch
            # best_model = model
            print('->model saved<-')
            torch.save(model, dir + args.model_name + '.model')
        else:
            e += 1
            # Warmup resets patience so early stopping cannot fire early.
            if epoch < args.warmup:
                e = 0
            if e > args.early_stopping_epochs:
                break

        # NaN: abort training if the validation loss diverged.
        if math.isnan(val_loss_epoch):
            break

    # FINAL EVALUATION
    # BUG FIX: the checkpoint is written above as `dir + args.model_name +
    # '.model'`, but was reloaded here as `dir + '/' + args.model_name +
    # '.model'` — a different path whenever `dir` does not end with '/'.
    # Load from exactly the path that was saved.
    best_model = torch.load(dir + args.model_name + '.model')
    test_loss, test_re, test_kl, test_fi, test_mi, test_log_likelihood, train_log_likelihood, test_elbo, train_elbo, test_psnr = evaluate(
        args, best_model, train_loader, test_loader, 9999, dir, mode='test')

    # Optional Fisher-information-ratio analysis of selected child modules.
    # (was `if args.fisher is True:` — truthiness is the idiomatic check)
    if args.fisher:
        fishier = []
        hid = list(model.named_children())
        for i in hid:
            if i[0] == 'p_x_layers':
                # print(i)
                temp = 0
                cnt_number_layers = 0
                for j in range(1, len(i[1])):
                    # print("----------------------")
                    # print(i[1][j])
                    temp += torch.mm(torch.mm(i[1][j].h.weight.grad, torch.t(i[1][j].h.weight)),
                                     torch.t(torch.mm(i[1][j].h.weight.grad,
                                                      torch.t(i[1][j].h.weight)))).abs().sum().data[0]
                    cnt_number_layers += 1
                fishier.append((i[0], temp / cnt_number_layers))
            if i[0] == 'pixelcnn':
                temp = 0
                cnt_number_layers = 0
                for j in range(len(i[1])):
                    if isinstance(i[1][j], MaskedConv2d):
                        g = i[1][j].weight.grad.view(i[1][j].weight.grad.size()[0], -1)
                        h = i[1][j].weight.view(i[1][j].weight.size()[0], -1)
                        temp += torch.mm(torch.mm(g, torch.t(h)),
                                         torch.t(torch.mm(g, torch.t(h)))).abs().sum().data[0]
                        cnt_number_layers += 1
                fishier.append((i[0], temp / cnt_number_layers))
            if i[0] == 'q_z_layers':
                # print(i)
                # Reversible ("rev*") encoders keep their layers in .stack;
                # others are indexed directly.
                if args.model_name[:3] == 'rev':
                    cnt_number_layers = 0
                    temp = 0
                    for j in range(1, len(i[1].stack)):
                        # print("----------------------")
                        # print(i[1][j])
                        for l in i[1].stack[j].bottleneck_block.children():
                            if isinstance(l, torch.nn.modules.conv.Conv2d):
                                g = l.weight.grad.view(l.weight.grad.size()[0], -1)
                                h = l.weight.view(l.weight.size()[0], -1)
                                temp += torch.mm(torch.mm(g, torch.t(h)),
                                                 torch.t(torch.mm(g, torch.t(h)))).abs().sum().data[0]
                                cnt_number_layers += 1
                    FI_linear = torch.mm(torch.mm(i[1].linear.weight.grad, torch.t(i[1].linear.weight)),
                                         torch.t(torch.mm(i[1].linear.weight.grad,
                                                          torch.t(i[1].linear.weight)))).abs().sum().data[0]
                    temp += FI_linear
                    fishier.append((i[0], temp / (cnt_number_layers + 1)))
                else:
                    temp = 0
                    cnt_number_layers = 0
                    for j in range(1, len(i[1])):
                        # print("----------------------")
                        # print(i[1][j])
                        temp += torch.mm(torch.mm(i[1][j].h.weight.grad, torch.t(i[1][j].h.weight)),
                                         torch.t(torch.mm(i[1][j].h.weight.grad,
                                                          torch.t(i[1][j].h.weight)))).abs().sum().data[0]
                        cnt_number_layers += 1
                    fishier.append((i[0], temp / cnt_number_layers))
        with open(dir + args.model_name + '_' + args.model_signature + '_fishier.txt', 'a') as o:
            o.write("Fishier Information Ratio in Hidden Layers\n")
            o.write(str(args) + '\n')
            for i in fishier:
                o.write(i[0] + ": " + str(i[1]) + "\n")

    print('FINAL EVALUATION ON TEST SET\n'
          'LogL (TEST): {:.2f}\n'
          'LogL (TRAIN): {:.2f}\n'
          'ELBO (TEST): {:.2f}\n'
          'ELBO (TRAIN): {:.2f}\n'
          'Loss: {:.2f}\n'
          'RE: {:.2f}\n'
          'KL: {:.2f}\n'
          'FI: {:.2f}\n'
          'MI: {:.2f}\n'
          'PSNR: {:.2f}'.format(
        test_log_likelihood, train_log_likelihood, test_elbo, train_elbo,
        test_loss, test_re, test_kl, test_fi, test_mi, test_psnr
    ))

    with open(dir + 'vae_experiment_log.txt', 'a') as f:
        print('FINAL EVALUATION ON TEST SET\n'
              'LogL (TEST): {:.2f}\n'
              'LogL (TRAIN): {:.2f}\n'
              'ELBO (TEST): {:.2f}\n'
              'ELBO (TRAIN): {:.2f}\n'
              'Loss: {:.2f}\n'
              'RE: {:.2f}\n'
              'KL: {:.2f}\n'
              'FI: {:.2f}\n'
              'MI: {:.2f}\n'
              'PSNR: {:.2f}'.format(
            test_log_likelihood, train_log_likelihood, test_elbo, train_elbo,
            test_loss, test_re, test_kl, test_fi, test_mi, test_psnr
        ), file=f)

    # Plot MI and FI
    #if args.MI is True:
    plot_info_evolution(val_mi_history, dir, 'MI')
    #if args.FI is True:
    plot_info_evolution(val_fi_history, dir, 'FI')

    # SAVING: persist all per-epoch histories and final test metrics.
    torch.save(train_loss_history, dir + args.model_name + '.train_loss')
    torch.save(train_re_history, dir + args.model_name + '.train_re')
    torch.save(train_kl_history, dir + args.model_name + '.train_kl')
    torch.save(train_mi_history, dir + args.model_name + '.train_mi')
    torch.save(train_fi_history, dir + args.model_name + '.train_fi')
    torch.save(train_psnr_history, dir + args.model_name + '.train_psnr')
    torch.save(val_loss_history, dir + args.model_name + '.val_loss')
    torch.save(val_re_history, dir + args.model_name + '.val_re')
    torch.save(val_kl_history, dir + args.model_name + '.val_kl')
    torch.save(val_mi_history, dir + args.model_name + '.val_mi')
    torch.save(val_fi_history, dir + args.model_name + '.val_fi')
    torch.save(val_psnr_history, dir + args.model_name + '.val_psnr')
    torch.save(test_log_likelihood, dir + args.model_name + '.test_log_likelihood')
    torch.save(test_loss, dir + args.model_name + '.test_loss')
    torch.save(test_re, dir + args.model_name + '.test_re')
    torch.save(test_kl, dir + args.model_name + '.test_kl')
    torch.save(test_fi, dir + args.model_name + '.test_fi')
    torch.save(test_mi, dir + args.model_name + '.test_mi')
    torch.save(test_psnr, dir + args.model_name + '.test_psnr')
    return dropout_probabilities


# NOTE(review): the statements below follow the function ending above; their
# enclosing scope (if any) is not visible in this chunk.
since = time.time()
# Accumulator persisted per run; epoch_data is presumably filled later in the
# loop body that continues past this chunk — TODO confirm.
training_data = {
    'mode': mode,
    'model_id': model_id,
    'dropout_data': dropout_data,
    'epoch_data': []
}
for epoch in range(1, N_EPOCHS + 1):
    ### Train ###
    trn_loss, reg_loss, trn_err = train_utils.train(
        model, train_loader, optimizer, criterion, epoch, gpu_id, True)
    # NOTE(review): the label says "IOU" but the variable is trn_err —
    # confirm which quantity train_utils.train actually returns.
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Reg - Loss: {:.4f}, IOU: {:.4f}'.format(
        epoch, trn_loss, reg_loss, trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, gpu_id, 1,
                                         epoch=epoch)
    print('Val - Loss: {:.4f} | IOU: {:.4f}'.format(val_loss, val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(
        time_elapsed // 60, time_elapsed % 60))

    ### Adjust Lr ###
def main(data_h5, output_dir, train_model, stage_training, filename_st1,
         filename_st2, learning_rate, batch_size, epochs, optim_value, use_gpu):
    """Train the age-estimation network (train_model=True) or load a saved
    checkpoint and report test MAE for apparent and real age.

    Raises:
        ValueError: if optim_value is neither 'SGD' nor 'Adam'.
    """
    # if train_model is True start training
    if train_model:
        model = load_model()
        train_set, train_real_labels, train_app_labels = load_data.load_data_set(
            data_h5, 'train')
        valid_set, valid_real_labels, valid_app_labels = load_data.load_data_set(
            data_h5, 'valid')
        train_all_extra_labels = load_data.load_data_extra_set(
            data_h5, 'train')
        valid_all_extra_labels = load_data.load_data_extra_set(
            data_h5, 'valid')
        train_data = load_data.dataSet(train_set, train_real_labels,
                                       train_app_labels, train_all_extra_labels)
        train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
                                  shuffle=True, num_workers=4)
        valid_data = load_data.dataSet(valid_set, valid_real_labels,
                                       valid_app_labels, valid_all_extra_labels)
        valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size // 2,
                                  num_workers=4)
        criterion = nn.MSELoss()
        if use_gpu:
            cudnn.benchmark = True
            torch.cuda.set_device(0)
            model.cuda()
            criterion.cuda()
        if optim_value == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                                  momentum=0.9, weight_decay=0.0001)
        elif optim_value == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                                   betas=[0.9, 0.999], eps=1e-08,
                                   weight_decay=0.0, amsgrad=False)
        else:
            # BUG FIX: previously this branch only printed a warning and fell
            # through, leaving `optimizer` unbound and crashing with a
            # NameError inside training.train(). Fail fast instead.
            raise ValueError(
                "optim parameter is not set: expected 'SGD' or 'Adam', "
                "got {!r}".format(optim_value))
        display_interval = 1
        best_valid_auc = 0.0
        best_valid_epoch = 0
        training.train(train_loader, valid_loader, model, criterion, optimizer,
                       epochs, learning_rate, stage_training, use_gpu,
                       batch_size, display_interval, best_valid_auc,
                       best_valid_epoch, filename_st1, filename_st2, output_dir)
    else:
        print("Load pre-trained model (and predicting ages.)")
        print(">> Warning-model generated from stage:", stage_training)
        print(">> Results shown in the paper are based on 2 stages training")
        model = vgg16_real_app.vgg16_bn()
        # Pick the checkpoint matching the requested training stage.
        if stage_training == '1':
            checkpoint = torch.load(
                os.path.join(output_dir, 'best_models', filename_st1))
        else:
            checkpoint = torch.load(
                os.path.join(output_dir, 'best_models', filename_st2))
        test_set, test_real_labels, test_app_labels = load_data.load_data_set(
            data_h5, 'test')
        test_all_extra_labels = load_data.load_data_extra_set(data_h5, 'test')
        cudnn.benchmark = True
        model.load_state_dict(checkpoint)
        model.cuda()
        pred_out1 = []  # apparent-age predictions
        pred_out2 = []  # real-age predictions
        real_label = []
        app_label = []
        # Reorder images from HWC to CHW for the network.
        test_sets = np.transpose(test_set, (0, 3, 1, 2))
        for data, real, app, extra in zip(test_sets, test_real_labels,
                                          test_app_labels,
                                          test_all_extra_labels):
            output1, output2 = utils.predict(model, data, extra, use_gpu)
            pred_out1.extend(output1.cpu().detach().numpy()[0])
            pred_out2.extend(output2.cpu().detach().numpy()[0])
            real_label.append(float(real))
            app_label.append(float(app))
        mae_app = utils.evalute(app_label, pred_out1, 100)
        mae_real = utils.evalute(real_label, pred_out2, 100)
        print("The test mae_app is: {}, mae_real is: {}.".format(
            mae_app, mae_real))
*model_cfg.args, num_classes=num_classes, use_aleatoric=args.loss == 'aleatoric', **model_cfg.kwargs) swag_model.to(args.device) swag_model.load_state_dict(checkpoint['state_dict']) for epoch in range(start_epoch, args.epochs + 1): since = time.time() ### Train ### if epoch == args.ft_start: print('Now replacing data loader with fine-tuned data loader.') train_loader = loaders['fine_tune'] trn_loss, trn_err = train_utils.train(model, train_loader, optimizer, criterion) print('Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}'.format( epoch, trn_loss, 1 - trn_err)) time_elapsed = time.time() - since print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) if epoch % args.eval_freq is 0: ### Test ### val_loss, val_err, val_iou = train_utils.test(model, loaders['val'], criterion) print('Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format( val_loss, 1 - val_err, val_iou)) time_elapsed = time.time() - since print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
def experiment_vae(args, train_loader, val_loader, test_loader, model, optimizer, dir, model_name='vae'):
    """Run one VAE experiment for the selected prior variant: train with
    loss-based early stopping (keeping the best model in memory), evaluate it
    on the test set, and persist the model, config, histories and metrics.

    Note: `dir` shadows the builtin but is kept for interface compatibility.

    Raises:
        Exception: if model_name is not one of the supported variants.
    """
    # Dispatch to the train/evaluate pair matching the requested variant.
    if model_name == 'vae':
        from utils.training import train_vae as train
        from utils.evaluation import evaluate_vae as evaluate
    elif model_name == 'vae_vampprior':
        from utils.training import train_vae_vampprior as train
        from utils.evaluation import evaluate_vae_vampprior as evaluate
    elif model_name == 'vae_vampprior_2level':
        from utils.training import train_vae_vampprior_2level as train
        from utils.evaluation import evaluate_vae_vampprior_2level as evaluate
    else:
        raise Exception('Wrong name of the model!')

    best_model = model
    best_loss = 1000.  # sentinel "infinity" for the best validation loss
    e = 0              # epochs since the last validation improvement
    train_loss_history = []
    train_re_history = []
    train_kl_history = []
    val_loss_history = []
    val_re_history = []
    val_kl_history = []
    time_history = []

    for epoch in range(1, args.epochs + 1):
        time_start = time.time()
        model, train_loss_epoch, train_re_epoch, train_kl_epoch = train(
            epoch, args, train_loader, model, optimizer)
        val_loss_epoch, val_re_epoch, val_kl_epoch = evaluate(
            args, model, train_loader, val_loader, epoch, dir, mode='validation')
        time_end = time.time()
        time_elapsed = time_end - time_start

        # appending history
        train_loss_history.append(train_loss_epoch), train_re_history.append(train_re_epoch), train_kl_history.append(
            train_kl_epoch)
        val_loss_history.append(val_loss_epoch), val_re_history.append(val_re_epoch), val_kl_history.append(
            val_kl_epoch)
        time_history.append(time_elapsed)

        # printing results
        print('Epoch: {}/{}, Time elapsed: {:.2f}s\n'
              '* Train loss: {:.2f} (RE: {:.2f}, KL: {:.2f})\n'
              'o Val. loss: {:.2f} (RE: {:.2f}, KL: {:.2f})\n'
              '--> Early stopping: {}/{} (BEST: {:.2f})\n'.format(
            epoch, args.epochs, time_elapsed,
            train_loss_epoch, train_re_epoch, train_kl_epoch,
            val_loss_epoch, val_re_epoch, val_kl_epoch,
            e, args.early_stopping_epochs, best_loss
        ))

        # early-stopping
        if val_loss_epoch < best_loss:
            e = 0
            best_loss = val_loss_epoch
            best_model = model
        else:
            e += 1
            # BUG FIX: the warmup reset must run BEFORE the early-stop check.
            # Previously `break` was evaluated first, so training could stop
            # during warmup; this now matches the other experiment_vae
            # variants in this file.
            if epoch < args.warmup:
                e = 0
            if e > args.early_stopping_epochs:
                break

    # FINAL EVALUATION: score the best in-memory model on the test set
    # (epoch=9999 marks the final-evaluation pass).
    test_loss, test_re, test_kl, test_log_likelihood, train_log_likelihood, test_elbo, train_elbo = evaluate(
        args, best_model, train_loader, test_loader, 9999, dir, mode='test')

    print('FINAL EVALUATION ON TEST SET\n'
          'LogL (TEST): {:.2f}\n'
          'LogL (TRAIN): {:.2f}\n'
          'ELBO (TEST): {:.2f}\n'
          'ELBO (TRAIN): {:.2f}\n'
          'Loss: {:.2f}\n'
          'RE: {:.2f}\n'
          'KL: {:.2f}'.format(
        test_log_likelihood,
        train_log_likelihood,
        test_elbo,
        train_elbo,
        test_loss,
        test_re,
        test_kl
    ))

    with open('vae_experiment_log.txt', 'a') as f:
        print('FINAL EVALUATION ON TEST SET\n'
              'LogL (TEST): {:.2f}\n'
              'LogL (TRAIN): {:.2f}\n'
              'ELBO (TEST): {:.2f}\n'
              'ELBO (TRAIN): {:.2f}\n'
              'Loss: {:.2f}\n'
              'RE: {:.2f}\n'
              'KL: {:.2f}'.format(
            test_log_likelihood,
            train_log_likelihood,
            test_elbo,
            train_elbo,
            test_loss,
            test_re,
            test_kl
        ), file=f)

    # SAVING: persist the best model, config, histories and test metrics.
    torch.save(best_model, dir + args.model_name + '.model')
    torch.save(args, dir + args.model_name + '.config')
    torch.save(train_loss_history, dir + args.model_name + '.train_loss')
    torch.save(train_re_history, dir + args.model_name + '.train_re')
    torch.save(train_kl_history, dir + args.model_name + '.train_kl')
    torch.save(val_loss_history, dir + args.model_name + '.val_loss')
    torch.save(val_re_history, dir + args.model_name + '.val_re')
    torch.save(val_kl_history, dir + args.model_name + '.val_kl')
    torch.save(test_log_likelihood, dir + args.model_name + '.test_log_likelihood')
    torch.save(test_loss, dir + args.model_name + '.test_loss')
    torch.save(test_re, dir + args.model_name + '.test_re')
    torch.save(test_kl, dir + args.model_name + '.test_kl')
def experiment_vae(args, train_loader, val_loader, test_loader, model, optimizer, dir, log_dir, model_name='vae'):
    """Run a recommender-VAE experiment: train with NDCG-based early stopping
    (higher is better), reload the best checkpoint, report ranking metrics
    (NDCG@k, Recall@k) on the test set, and persist all histories.

    Note: `dir` shadows the builtin but is kept for interface compatibility;
    `model_name` is unused here — file names come from args.model_name.
    """
    from utils.training import train_vae as train
    from utils.evaluation import evaluate_vae as evaluate

    # SAVING: persist the experiment configuration up front.
    torch.save(args, dir + args.model_name + '.config')

    # best_model = model
    best_ndcg = 0.   # NDCG is maximized, unlike the loss-based variants
    e = 0            # epochs since the last NDCG improvement
    last_epoch = 0   # epoch at which training actually stopped (for reporting)
    train_loss_history = []
    train_re_history = []
    train_kl_history = []
    val_loss_history = []
    val_re_history = []
    val_kl_history = []
    val_ndcg_history = []
    time_history = []

    for epoch in range(1, args.epochs + 1):
        time_start = time.time()
        model, train_loss_epoch, train_re_epoch, train_kl_epoch = train(epoch, args, train_loader, model,
                                                                        optimizer)
        val_loss_epoch, val_re_epoch, val_kl_epoch, val_ndcg_epoch = evaluate(args, model, train_loader,
                                                                              val_loader, epoch, dir,
                                                                              mode='validation')
        time_end = time.time()
        time_elapsed = time_end - time_start

        # appending history
        train_loss_history.append(train_loss_epoch), train_re_history.append(train_re_epoch), train_kl_history.append(
            train_kl_epoch)
        val_loss_history.append(val_loss_epoch), val_re_history.append(val_re_epoch), val_kl_history.append(
            val_kl_epoch), val_ndcg_history.append(val_ndcg_epoch)
        time_history.append(time_elapsed)

        # printing results
        print('Epoch: {}/{}, Time elapsed: {:.2f}s\n'
              '* Train loss: {:.2f} (RE: {:.2f}, KL: {:.2f})\n'
              'o Val. loss: {:.2f} (RE: {:.2f}, KL: {:.2f}, NDCG: {:.5f})\n'
              '--> Early stopping: {}/{} (BEST: {:.5f})\n'.format(
            epoch, args.epochs, time_elapsed,
            train_loss_epoch, train_re_epoch, train_kl_epoch,
            val_loss_epoch, val_re_epoch, val_kl_epoch, val_ndcg_epoch,
            e, args.early_stopping_epochs, best_ndcg
        ))

        # early-stopping: checkpoint whenever validation NDCG improves.
        last_epoch = epoch
        if val_ndcg_epoch > best_ndcg:
            e = 0
            best_ndcg = val_ndcg_epoch
            # best_model = model
            print('->model saved<-')
            torch.save(model, dir + args.model_name + '.model')
        else:
            e += 1
            # During warmup the patience counter is reset, so early stopping
            # cannot trigger before args.warmup epochs.
            if epoch < args.warmup:
                e = 0
            if e > args.early_stopping_epochs:
                break

        # NaN: abort training if the validation loss diverged.
        if math.isnan(val_loss_epoch):
            break

    # FINAL EVALUATION: reload the best checkpoint and compute the full set
    # of ranking metrics (epoch=9999 marks the final-evaluation pass).
    best_model = torch.load(dir + args.model_name + '.model')
    test_loss, test_re, test_kl, test_ndcg, \
    eval_ndcg20, eval_ndcg10, eval_recall50, eval_recall20, \
    eval_recall10, eval_recall5, eval_recall1 = evaluate(args, best_model, train_loader, test_loader, 9999,
                                                         dir, mode='test')

    print("NOTE: " + args.note)
    print('FINAL EVALUATION ON TEST SET\n'
          '- BEST VALIDATION NDCG: {:.5f} ({:} epochs) -\n'
          'NDCG@100: {:} | Loss: {:.2f}\n'
          'NDCG@20: {:} | RE: {:.2f}\n'
          'NDCG@10: {:} | KL: {:.2f}\n'
          'Recall@50: {:} | Recall@5: {:}\n'
          'Recall@20: {:} | Recall@1: {:}\n'
          'Recall@10: {:}'.format(
        best_ndcg, last_epoch,
        test_ndcg, test_loss,
        eval_ndcg20, test_re,
        eval_ndcg10, test_kl,
        eval_recall50, eval_recall5,
        eval_recall20, eval_recall1,
        eval_recall10
    ))
    print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-')

    # Optionally append the same report (plus args and note) to the log file.
    if not args.no_log:
        with open(log_dir, 'a') as f:
            print(args, file=f)
            print("NOTE: " + args.note, file=f)
            print('FINAL EVALUATION ON TEST SET\n'
                  '- BEST VALIDATION NDCG: {:.5f} ({:} epochs) -\n'
                  'NDCG@100: {:} | Loss: {:.2f}\n'
                  'NDCG@20: {:} | RE: {:.2f}\n'
                  'NDCG@10: {:} | KL: {:.2f}\n'
                  'Recall@50: {:} | Recall@5: {:}\n'
                  'Recall@20: {:} | Recall@1: {:}\n'
                  'Recall@10: {:}'.format(
                best_ndcg, last_epoch,
                test_ndcg, test_loss,
                eval_ndcg20, test_re,
                eval_ndcg10, test_kl,
                eval_recall50, eval_recall5,
                eval_recall20, eval_recall1,
                eval_recall10
            ), file=f)
            print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n', file=f)

    # SAVING: persist all per-epoch histories and final test metrics.
    torch.save(train_loss_history, dir + args.model_name + '.train_loss')
    torch.save(train_re_history, dir + args.model_name + '.train_re')
    torch.save(train_kl_history, dir + args.model_name + '.train_kl')
    torch.save(val_loss_history, dir + args.model_name + '.val_loss')
    torch.save(val_re_history, dir + args.model_name + '.val_re')
    torch.save(val_kl_history, dir + args.model_name + '.val_kl')
    torch.save(val_ndcg_history, dir +args.model_name + '.val_ndcg')
    torch.save(test_loss, dir + args.model_name + '.test_loss')
    torch.save(test_re, dir + args.model_name + '.test_re')
    torch.save(test_kl, dir + args.model_name + '.test_kl')
    torch.save(test_ndcg, dir +args.model_name + '.test_ndcg')
def _report_pose_errors(model, dataloader, poses_mean, poses_std, device, stereo):
    """Run the model over a dataloader, print pose-error stats, return (pred, gt).

    Prints translation error (Euclidean norm, same units as the normalized
    poses) and rotation error (quaternion angular error) as median/mean.
    Returns the predicted and ground-truth pose arrays for later visualization.
    """
    pred_poses, gt_poses = model_results_pred_gt(model, dataloader,
                                                 poses_mean, poses_std,
                                                 device=device, stereo=stereo)

    print('gt_poses = {}'.format(gt_poses.shape))
    print('pred_poses = {}'.format(pred_poses.shape))

    # Columns 0:3 are translation, 3: are quaternion rotation
    # (pose_format='quat' above).
    t_loss = np.asarray([np.linalg.norm(p - t)
                         for p, t in zip(pred_poses[:, :3], gt_poses[:, :3])])
    q_loss = np.asarray([quaternion_angular_error(p, t)
                         for p, t in zip(pred_poses[:, 3:], gt_poses[:, 3:])])

    print('poses_std = {:.3f}'.format(np.linalg.norm(poses_std)))
    print('T: median = {:.3f}, mean = {:.3f}'.format(np.median(t_loss),
                                                     np.mean(t_loss)))
    print('R: median = {:.3f}, mean = {:.3f}'.format(np.median(q_loss),
                                                     np.mean(q_loss)))
    return pred_poses, gt_poses


def main():
    """Train PoseNet on the Apolloscape dataset, then evaluate on both splits.

    Reads all configuration from CLI args (see get_args), optionally resumes
    from a checkpoint, trains for args.epochs epochs, and finally reports
    translation/rotation errors on the train and validation sets.
    """
    args = get_args()

    print('----- Params for debug: ----------------')
    print(args)
    print('data = {}'.format(args.data))
    print('road = {}'.format(args.road))

    print('Train model ...')

    # Imagenet normalization in case of pre-trained network
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Resize data before using
    transform = transforms.Compose([
        transforms.Resize(260),
        transforms.CenterCrop(250),
        transforms.ToTensor(),
        normalize
    ])

    train_record = None  # 'Record001'
    train_dataset = Apolloscape(root=args.data, road=args.road,
                                transform=transform, record=train_record,
                                normalize_poses=True, pose_format='quat',
                                train=True,
                                cache_transform=not args.no_cache_transform,
                                stereo=args.stereo)

    val_record = None  # 'Record011'
    val_dataset = Apolloscape(root=args.data, road=args.road,
                              transform=transform, record=val_record,
                              normalize_poses=True, pose_format='quat',
                              train=False,
                              cache_transform=not args.no_cache_transform,
                              stereo=args.stereo)

    # Show datasets
    print(train_dataset)
    print(val_dataset)

    shuffle_data = True
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  shuffle=shuffle_data)  # batch_size = 75
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size,
                                shuffle=shuffle_data)  # batch_size = 75

    # Get mean and std from dataset (used to de-normalize predicted poses)
    poses_mean = val_dataset.poses_mean
    poses_std = val_dataset.poses_std

    # Select active device
    if torch.cuda.is_available() and args.device == 'cuda':
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('device = {}'.format(device))

    # Used as prefix for filenames
    time_str = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create pretrained feature extractor.
    # BUGFIX: the original if/elif chain had no else branch, so an
    # unsupported --feature_net left `feature_extractor` unbound and
    # crashed later with NameError; fail fast with a clear message instead.
    feature_net_builders = {
        'resnet18': models.resnet18,
        'resnet34': models.resnet34,
        'resnet50': models.resnet50,
    }
    if args.feature_net not in feature_net_builders:
        raise ValueError('Unsupported feature_net: {!r} (expected one of {})'
                         .format(args.feature_net,
                                 sorted(feature_net_builders)))
    feature_extractor = feature_net_builders[args.feature_net](
        pretrained=args.pretrained)

    # Num features for the last layer before pose regressor
    num_features = args.feature_net_features  # 2048

    experiment_name = get_experiment_name(args)

    # Create model
    model = PoseNet(feature_extractor, num_features=num_features)
    model = model.to(device)

    # Criterion (may carry a learnable beta weighting between T and R losses)
    criterion = PoseNetCriterion(stereo=args.stereo, beta=args.beta,
                                 learn_beta=args.learn_beta)
    criterion.to(device)

    # Add all params for optimization
    param_list = [{'params': model.parameters()}]
    if criterion.learn_beta:
        # Criterion has its own learnable params (beta) — optimize them too.
        param_list.append({'params': criterion.parameters()})

    # Create optimizer
    optimizer = optim.Adam(params=param_list, lr=args.lr, weight_decay=0.0005)

    start_epoch = 0

    # Restore from checkpoint if present
    if args.checkpoint is not None:
        checkpoint_file = args.checkpoint
        if os.path.isfile(checkpoint_file):
            print('\nLoading from checkpoint: {}'.format(checkpoint_file))
            checkpoint = torch.load(checkpoint_file)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optim_state_dict'])
            start_epoch = checkpoint['epoch']
            if 'criterion_state_dict' in checkpoint:
                criterion.load_state_dict(checkpoint['criterion_state_dict'])
                print('Loaded criterion params too.')

    n_epochs = start_epoch + args.epochs

    print('\nTraining ...')
    val_freq = args.val_freq
    for e in range(start_epoch, n_epochs):

        # Train for one epoch
        train(train_dataloader, model, criterion, optimizer, e, n_epochs,
              log_freq=args.log_freq,
              poses_mean=train_dataset.poses_mean,
              poses_std=train_dataset.poses_std,
              device=device, stereo=args.stereo)

        # Run validation loop
        if e > 0 and e % val_freq == 0:
            end = time.time()
            validate(val_dataloader, model, criterion, e,
                     log_freq=args.log_freq, device=device,
                     stereo=args.stereo)

        # Make figure
        if e > 0 and args.fig_save > 0 and e % args.fig_save == 0:
            exp_name = '{}_{}'.format(time_str, experiment_name)
            make_figure(model, train_dataloader, poses_mean=poses_mean,
                        poses_std=poses_std, epoch=e,
                        experiment_name=exp_name, device=device,
                        stereo=args.stereo)

        # Make checkpoint
        if e > 0 and e % args.checkpoint_save == 0:
            make_checkpoint(model, optimizer, criterion, epoch=e,
                            time_str=time_str, args=args)

    print('\nn_epochs = {}'.format(n_epochs))

    # Final evaluation on both splits; the per-split logic was duplicated
    # verbatim in the original and is now shared via _report_pose_errors.
    print('\n=== Test Training Dataset ======')
    pred_poses_train, gt_poses_train = _report_pose_errors(
        model, train_dataloader, poses_mean, poses_std, device, args.stereo)

    print('\n=== Test Validation Dataset ======')
    pred_poses_val, gt_poses_val = _report_pose_errors(
        model, val_dataloader, poses_mean, poses_std, device, args.stereo)

    # Save checkpoint
    print('\nSaving model params ....')
    make_checkpoint(model, optimizer, criterion, epoch=n_epochs,
                    time_str=time_str, args=args)
num_workers=4) print('loading Validation Dataset:') valid_loader = torch.utils.data.DataLoader(CustomDataset(batch_size, path, phase='val'), batch_size, shuffle=False, num_workers=4) print('number of iteration per epoch', len(train_loader)) for epoch in range(1, N_EPOCHS + 1): since = time.time() train_utils.validation(model, valid_loader) ### Train ### print('TRAINING ---->') trn_loss = train_utils.train(model, train_loader, optimizer, epoch) print('Epoch {:d}\nTrain - Loss: {:.4f}'.format(epoch, trn_loss)) time_elapsed = time.time() - since print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)) ### Checkpoint ### if epoch % 1 == 0: print('VALIDATION ---->') train_utils.validation(model, valid_loader) print('SAVING WEIGHTS ---->') train_utils.save_weights(model, epoch, trn_loss, trn_loss) ### Adjust Lr ### train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch, DECAY_EVERY_N_EPOCHS)
def main():
    """Train a DenseNet on CIFAR-10, optionally resuming from saved weights.

    Parses all hyperparameters from the command line, builds train/test
    loaders with standard CIFAR-10 normalization and augmentation, then runs
    the epoch loop, appending per-epoch metrics to train.csv / test.csv under
    args.save.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--nClasses', type=int, default=10)  # CIFAR
    parser.add_argument('--reduction', type=float, default=1.0)  # no reduction
    # BUGFIX: argparse `type=bool` converts ANY non-empty string (including
    # "False") to True. Parse accepted truthy spellings explicitly so
    # `--bottleneck False` actually disables the bottleneck.
    parser.add_argument('--bottleneck', default=False,
                        type=lambda s: str(s).lower() in ('1', 'true', 'yes'))
    parser.add_argument('--growthRate', type=int, default=12)
    parser.add_argument('--modelDepth', type=int, default=40)
    parser.add_argument('--batchSize', type=int, default=64)
    parser.add_argument('--nEpochs', type=int, default=2)
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--save', type=str, default=RESULTS_PATH)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--existingWeights', type=str, default=None)
    parser.add_argument('--sessionName', type=str,
                        default=train_utils.get_rand_str(5))
    parser.add_argument('--opt', type=str, default='sgd',
                        choices=('sgd', 'adam', 'rmsprop'))
    args = parser.parse_args()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    setproctitle.setproctitle(args.save)  # The process name

    torch.manual_seed(args.seed)
    if args.cuda:
        print("Using CUDA")
        torch.cuda.manual_seed(args.seed)

    # if os.path.exists(args.save):
    #     shutil.rmtree(args.save)
    # os.makedirs(args.save, exist_ok=True)

    # CIFAR-10 channel statistics for normalization
    normMean = [0.49139968, 0.48215827, 0.44653124]
    normStd = [0.24703233, 0.24348505, 0.26158768]
    normTransform = transforms.Normalize(normMean, normStd)

    trainTransform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normTransform
    ])
    testTransform = transforms.Compose([transforms.ToTensor(), normTransform])

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    print("Kwargs: " + str(kwargs))
    trainLoader = DataLoader(dset.CIFAR10(root=CIFAR10_PATH, train=True,
                                          download=True,
                                          transform=trainTransform),
                             batch_size=args.batchSize, shuffle=True, **kwargs)
    testLoader = DataLoader(dset.CIFAR10(root=CIFAR10_PATH, train=False,
                                         download=True,
                                         transform=testTransform),
                            batch_size=args.batchSize, shuffle=False, **kwargs)

    net = DenseNet(growthRate=args.growthRate, depth=args.modelDepth,
                   reduction=args.reduction, bottleneck=args.bottleneck,
                   nClasses=args.nClasses)

    if args.existingWeights:
        print("Loading existing weights: %s" % args.existingWeights)
        startEpoch = train_utils.load_weights(net, args.existingWeights)
        endEpoch = startEpoch + args.nEpochs
        print('Resume training at epoch: {}'.format(startEpoch))

        # BUGFIX: the original tested `args.save + 'train.csv'` (no path
        # separator) but opened os.path.join(args.save, 'train.csv'), so the
        # existence check never matched and resuming truncated the CSV logs.
        if os.path.exists(os.path.join(args.save, 'train.csv')):
            # assume test.csv exists
            print("Found existing train.csv")
            append_write = 'a'  # append if already exists
        else:
            print("Creating new train.csv")
            append_write = 'w'  # make a new file if not
        trainF = open(os.path.join(args.save, 'train.csv'), append_write)
        testF = open(os.path.join(args.save, 'test.csv'), append_write)
    else:
        print("Training new model from scratch")
        startEpoch = 1
        endEpoch = args.nEpochs
        trainF = open(os.path.join(args.save, 'train.csv'), 'w')
        testF = open(os.path.join(args.save, 'test.csv'), 'w')

    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in net.parameters()])))
    if args.cuda:
        net = net.cuda()

    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=1e-1, momentum=0.9,
                              weight_decay=1e-4)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=1e-4)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)

    print("Training....")
    for epoch in range(startEpoch, endEpoch + 1):
        since = time.time()
        train_utils.adjust_opt(args.opt, optimizer, epoch)
        train_utils.train(epoch, net, trainLoader, optimizer, trainF,
                          sessionName=args.sessionName)
        train_utils.test(epoch, net, testLoader, optimizer, testF)
        time_elapsed = time.time() - since
        print('Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
                                              time_elapsed % 60))
        if epoch != 1:
            # Fire-and-forget plot refresh of the CSV logs
            os.system('./plot.py {} &'.format(args.save))

    trainF.close()
    testF.close()
def main():
    """Assemble training state (model, data, checkpointing, hooks) and run
    the training loop, firing serialization/validation hooks at both ends."""
    # torch.autograd.set_detect_anomaly(True)
    args = parse_args()

    device = torch.device(args.device)
    torch.cuda.set_device(device)

    timers = SynchronizedWallClockTimer() if args.timers else FakeTimer()

    model = init_model(args, device)
    train_loader = get_dataloader(get_trainset_params(args))

    ckpt_manager = Serializer(args.model, args.num_checkpoints,
                              args.permanent_interval)
    known_steps = ckpt_manager.list_known_steps()
    # Force a fresh start when there is no checkpoint to resume from.
    args.do_not_continue = args.do_not_continue or len(known_steps) == 0
    resume_step = 0 if args.do_not_continue else known_steps[-1]

    optimizer, scheduler = construct_train_tools(args, model,
                                                 passed_steps=resume_step)
    evaluator = init_losses(get_resolution(args), args.bs, model, device,
                            timers=timers)
    tb_writer = SummaryWriter(str(args.log_path))
    per_step_hooks, event_hooks = create_hooks(args, model, optimizer,
                                               evaluator, tb_writer,
                                               ckpt_manager)

    if args.do_not_continue:
        start_step, start_samples = 0, 0
    else:
        start_step, state = ckpt_manager.load_checkpoint(
            model, resume_step, optimizer=optimizer, device=device)
        # Older checkpoints may lack a sample count; fall back to step * batch.
        start_samples = state.pop('samples_passed', start_step * args.bs)

    event_hooks['serialization'](start_step, start_samples)
    event_hooks['validation'](start_step, start_samples)

    with Profiler(args.profiling, args.model / 'profiling'):
        train(model, device, train_loader, optimizer, args.training_steps,
              scheduler=scheduler, evaluator=evaluator, logger=tb_writer,
              weights=args.loss_weights, is_raw=args.is_raw,
              accumulation_steps=args.accum_step, timers=timers,
              hooks=per_step_hooks, init_step=start_step,
              init_samples_passed=start_samples)

    final_samples = start_samples + (args.training_steps - start_step) * args.bs
    event_hooks['serialization'](args.training_steps, final_samples)
    event_hooks['validation'](args.training_steps, final_samples)