def _load_wikisql_data(jsonl_path, db_path, toy):
    # NOTE: because the tokenizer was StanfordCoreNLP and not spacy,
    # it's a bit finicky to use the spacy entity recognition in
    # order to describe the part of speech of all the query questions.
    # For this reason a slow python loop needs to be used to parse the ent
    # field of the dataset.
    queries = []
    db = DBEngine(db_path)
    parsers, fields, validators = _wikisql_data_readers(db)
    # weird api for Example.fromdict
    ex_fields = {k: [(k, v)] for k, v in fields.items()}
    excs = []
    for query_json in _annotate_queries(jsonl_path, toy):
        try:
            parsed_fields = {
                k: parse(query_json) for k, parse in parsers.items()
            }
            ex = torchtext.data.Example.fromdict(parsed_fields, ex_fields)
            for v in validators.values():
                v(query_json, ex)
            queries.append(ex)
        except _QueryParseException as e:
            excs.append(e.args[0])
    track.debug('dropped {} of {} queries{}{}', len(excs),
                len(excs) + len(queries),
                ':\n ' if excs else '',
                '\n '.join(excs))
    return TableDataset(queries, fields, db)

def _debug_stats_str(stats, warn=False):
    """output: | mymetric1: 0 | mymetric2: 1 |"""
    all_floats = all(map(lambda f: isinstance(f, float), stats.values()))
    if not (all_floats or warn):
        track.debug("WARNING: I'm printing your metric arguments as floats")
    s = [' | %s: %.2f ' % (k, float(v)) for k, v in stats.items()]
    return ''.join(s)

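# A small usage sketch of `_debug_stats_str` (hypothetical metric names, not
# part of the original module); every value is coerced to a float for printing:
#
#   _debug_stats_str({'episode_reward': 10.0, 'eval_step': 3})
#   # -> ' | episode_reward: 10.00  | eval_step: 3.00 '
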
def wikisql(toy):
    """
    Loads the WikiSQL dataset. Per the original SQLNet implementation,
    a subsampled version is returned if the toy argument is true.

    Returns the train, val, and test TableDatasets for WikiSQL.
    """
    # compressed WikiSQL file after the annotation recommended in
    # https://github.com/salesforce/WikiSQL has been run.
    wikisql_dir = check_or_fetch('wikisql', 'wikisql.tgz', _URL)
    wikisql_dir = os.path.join(wikisql_dir, 'wikisql')

    track.debug('loading spacy tagger')
    _nlp()
    track.debug('loading vocabulary')
    pretrained_vocab(toy)

    train = _load_wikisql_data(
        os.path.join(wikisql_dir, 'annotated', 'train.jsonl'),
        os.path.join(wikisql_dir, 'dbs', 'train.db'), toy)
    val = _load_wikisql_data(
        os.path.join(wikisql_dir, 'annotated', 'dev.jsonl'),
        os.path.join(wikisql_dir, 'dbs', 'dev.db'), toy)
    test = _load_wikisql_data(
        os.path.join(wikisql_dir, 'annotated', 'test.jsonl'),
        os.path.join(wikisql_dir, 'dbs', 'test.db'), toy)
    return train, val, test

def eval_model(model, env, y_placeholder, obs_placeholder, attack_method,
               attack_ord=2, num_rollouts=5, eps=0.1, trial_num=0,
               render=False, alg_name='ERROR', env_name='ERROR'):
    # cleverhans needs to get the logits tensor, but expects you to run
    # through and recompute it for the given observation
    # even though the graph is already created
    cleverhans_model = CallableModelWrapper(lambda o: y_placeholder, "logits")
    attack = ATTACKS[attack_method](cleverhans_model)
    fgsm_params = {'eps': eps, 'ord': attack_ord}

    # we'll keep tracking metrics here
    prev_done_step = 0
    stats = {}
    rewards = []
    stats['eval_step'] = 0
    stats['episode'] = 0
    stats['episode_reward'] = 0.

    obs = env.reset()
    num_episodes = 0
    while num_episodes < num_rollouts:
        # the attack_op tensor will generate the perturbed state!
        attack_op = attack.generate(obs_placeholder, **fgsm_params)
        adv_obs = attack_op.eval({obs_placeholder: obs[None, :]})
        action = model(adv_obs)[0]

        # it's time for my child to act out in this adversarial world
        obs, rew, done, _ = env.step(action)
        reward = rew[0] if isinstance(env, VecEnv) else rew
        if render:
            env.render()
        done = done.any() if isinstance(done, np.ndarray) else done

        # let's get our metrics
        stats['eval_step'] += 1
        stats['episode_reward'] += reward
        stats['episode_len'] = stats['eval_step'] + prev_done_step
        if done:
            rewards.append(stats['episode_reward'])
            obs = env.reset()
            prev_done_step = stats['eval_step']
            track.debug("Finished episode %d!" % (stats['episode']))
            stats['episode'] += 1
            stats['episode_reward'] = 0
            stats['eval_step'] = 0
            num_episodes += 1

        # track metrics to access later through pandas
        track.metric(iteration=stats['eval_step'] + prev_done_step,
                     trial_num=trial_num, **stats)

    env.close()
    np.save('./data/{0}_{1}_{2}_{3}_{4}.npy'.format(alg_name, env_name,
                                                    attack_method,
                                                    attack_ord, eps),
            rewards)
    print('REWARDS', rewards)
    return stats  # gimme the final stats for the episode

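# In `eval_model` above, `ATTACKS` is presumably a dict mapping attack-method
# names to cleverhans attack classes (e.g. something like
# {'fgsm': FastGradientMethod}), and `fgsm_params` follows the cleverhans
# `generate(x, eps=..., ord=...)` keyword convention; the exact mapping is an
# assumption, since it is not shown in this excerpt.
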
def seed_all(seed):
    """Seed all devices deterministically off of seed and somewhat
    independently."""
    track.debug('seeding with seed {}', seed)
    np.random.seed(seed)
    rand_seed, torch_cpu_seed, torch_gpu_seed = _next_seeds(3)
    random.seed(rand_seed)
    torch.manual_seed(torch_cpu_seed)
    torch.cuda.manual_seed_all(torch_gpu_seed)

def experiment(args):
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.remote:
        track_remote_dir = os.path.join(args.remote, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting trial")
        do_training(args)

def _copy_best_checkpoint(checkpoint_file):
    bestfile = os.path.join(os.path.dirname(checkpoint_file), 'best.pth')
    if not os.path.isfile(bestfile):
        raise ValueError(
            'was expecting checkpoint file {} to have a sibling '
            'file best.pth for the running best model'.format(checkpoint_file))
    best_dst = _checkpoint_file('best.pth')
    track.debug('copying best running model file from {} to {}', bestfile,
                best_dst)
    os.makedirs(os.path.dirname(best_dst), exist_ok=True)
    shutil.copyfile(bestfile, best_dst)

def _experiment(experiment_fn, args):
    """
    Launches the track experiment (+/- S3 backup) by calling
    `experiment_fn(args)`, where args contains the parsed arguments.
    """
    track_local_dir = os.path.join(args.logroot, args.experimentname)
    if args.s3:
        track_remote_dir = os.path.join(args.s3, args.projectname,
                                        args.experimentname)
    else:
        track_remote_dir = None
    with track.trial(track_local_dir, track_remote_dir, param_map=vars(args)):
        track.debug("Starting experiment!")
        experiment_fn(args)

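# A hedged sketch of how `_experiment` might be wired to a CLI. The flag names
# below are exactly the attributes the helper reads (logroot, experimentname,
# projectname, s3), but the entry point itself is an assumption:
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--logroot', default='./logs')
#   parser.add_argument('--experimentname', default='debug')
#   parser.add_argument('--projectname', default='default')
#   parser.add_argument('--s3', default=None)
#   _experiment(do_training, parser.parse_args())
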
def tsne_embeddings(vecs, train_iters, batch_size, perplexity=30, cuda=False):
    track.debug("[track]\tComputing image densities PMF")
    densities = _compute_densities(vecs, perplexity=perplexity)
    i, j = np.indices(densities.shape)
    i = i.ravel()
    j = j.ravel()

    track.debug("[track]\tTraining the TSNE embedding")
    tsne = TSNE(len(densities), 2, 2)  # visualize in 2d
    tsne_train_wrapper = Wrapper(tsne, batchsize=batch_size, cuda=cuda)
    for k in range(train_iters):
        # plot progress
        progress_bar(k, train_iters)
        tsne_train_wrapper.fit(densities, i, j)
    return tsne.logits.weight.detach().cpu().numpy()

def run(ensemble, proj_df, results_dir='./logs', dataroot='./data',
        batch_size=128, eval_batch_size=100, cuda=False, num_workers=2,
        **unused):
    """
    This evaluates both the ensemble and the baseline model on the full test
    set. We also evaluate each model and compute their individual losses, so
    that we can plot the variance around the ensemble's dashed horizontal line
    (see top of file).
    """
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=2)
    ensemble_criterion = SoftmaxNLL()
    track.debug("[baseline] testing the ensemble on full dataset")
    ensemble_loss, ensemble_acc = test(testloader, ensemble,
                                       ensemble_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # get the no-noise baseline evaluation
    proj = track.Project(results_dir)
    best_model, best_df = load_trial(proj, noise_scale=0.0)
    track.debug("[baseline] testing no-noise baseline model on full dataset")
    baseline_criterion = torch.nn.CrossEntropyLoss()
    baseline_loss, baseline_acc = test(testloader, best_model,
                                       baseline_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # now, test each of the ensemble's models
    model_losses = []
    model_accs = []
    track.debug("[baseline] testing individual models on full dataset")
    for i, model in enumerate(ensemble.models):
        track.debug("[baseline] testing model %d of %d" %
                    (i, len(ensemble.models)))
        model_loss, model_acc = test(testloader, model, baseline_criterion,
                                     epoch=-1, cuda=cuda, metric=False)
        model_losses.append(model_loss)
        model_accs.append(model_acc)

    # we just need to track the scalar results of this evaluation;
    # we can access the baseline test *curve* from the jupyter notebook (later)
    track.metric(iteration=0,
                 ensemble_loss=ensemble_loss,
                 ensemble_acc=ensemble_acc,
                 best_baseline_loss=baseline_loss,
                 best_baseline_acc=baseline_acc,
                 model_losses=model_losses,
                 model_accs=model_accs)

def _annotate_queries(jsonl_path, toy):
    # read all json in and annotate with a spacy tagger in a single pass;
    # we also use this opportunity to re-tokenize with a spacy tokenizer.
    track.debug('reading json data from {}', jsonl_path)
    track.debug(' reading json data into memory')
    with open(jsonl_path, 'r') as f:
        max_lines = 1000 if toy else _count_lines(jsonl_path)
        query_jsons = []
        for line in tqdm(itertools.islice(f, max_lines), total=max_lines):
            query_jsons.append(json.loads(line))
    nlp = _nlp()
    track.debug(' parsing json into dataset')
    words = (detokenize(q['question']['gloss'], q['question']['after'])
             for q in query_jsons)
    pipeline = nlp.pipe(words, batch_size=512, n_threads=_n_procs())
    for i, doc in enumerate(tqdm(pipeline, total=len(query_jsons))):
        ent = [tok.tag_ for tok in doc]
        tok = [_process_token(tok.lemma_, toy) for tok in doc]
        original = [tok.text_with_ws for tok in doc]
        query_jsons[i]['question'] = {}
        query_jsons[i]['question']['ent'] = ent
        query_jsons[i]['question']['tok'] = tok
        query_jsons[i]['question']['original'] = original
        yield query_jsons[i]

def main(args):
    args = _fix_baseline_args(args)
    # try to load with pattern matching (this makes grid search easier)
    if args.load_path == '':
        default_path = os.path.join(args.model_dir,
                                    '%s_%s.pkl' % (args.alg, args.env))
        track.debug("Didn't find a load_path, so we will try to load from: %s"
                    % default_path)
        load_path = default_path
    else:
        load_path = args.load_path
    model, env, y_placeholder, obs_placeholder = _load(
        args.alg, args.env, args.network, load_path)
    final_stats = eval_model(model, env, y_placeholder, obs_placeholder,
                             num_rollouts=args.num_rollouts,
                             attack_method=args.attack,
                             attack_ord=args.attack_ord,
                             eps=args.eps,
                             render=args.render,
                             alg_name=args.alg,
                             env_name=args.env)
    track.debug("FINAL STATS:%s" % _debug_stats_str(final_stats))

def run(ensemble, proj_df, dataroot='./data', batch_size=128,
        eval_batch_size=100, cuda=False, num_workers=2, **unused):
    """ let's compute that entropy baby """
    num_classes = 10  # build_dataset('cifar10') <- not worth computing rn
    entropy_criterion = Entropy()
    ensemble.models = ensemble.models[::4]
    # iterate for all possible classes in dataset
    for class_ind in range(num_classes):
        # build dataset per class
        track.debug("Evaluating entropy for class id: %d" % (class_ind))
        class_trainloader, class_testloader = build_single_class_dataset(
            'cifar10',
            class_ind=class_ind,
            dataroot=dataroot,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=2)

        # compute the entropy of the model post-hoc as well
        entropy = test(class_testloader, ensemble, entropy_criterion,
                       epoch=-1, cuda=cuda, metric=False,
                       criterion_has_labels=False, compute_acc=False)
        track.debug("\n\n\tEntropy: %.2f" % entropy)
        track.metric(cifar_class_id=class_ind, entropy=entropy)

def do_training(args):
    trainloader, testloader = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    if args.optimizer == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
    else:
        optimizer = EKFAC(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay,
                          eps=args.eps,
                          update_freq=args.update_freq)

    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        args.lr = adjust_learning_rate(epoch, optimizer, args.lr,
                                       args.schedule, args.gamma)
        train_loss, train_acc = train(trainloader, model, criterion,
                                      optimizer, epoch, args.cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(),
                                   "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)

def do_training(args):
    trainloader, testloader = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    num_chunks = max(1, args.batch_size // args.max_samples_per_gpu)
    optimizer = LARS(params=model.parameters(),
                     lr=args.lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay,
                     eta=args.eta,
                     max_epoch=args.epochs)
    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        train_loss, train_acc = train(trainloader, model, criterion,
                                      optimizer, epoch, args.cuda,
                                      num_chunks=num_chunks)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(),
                                   "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)

lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
param_map = {'batch_size': args.batch_size}
with track.trial(args.logroot, None, param_map=param_map):
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            track.debug(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} '
                '| valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                    epoch, (time.time() - epoch_start_time), train_loss,
                    val_loss, math.exp(val_loss)))
            print('-' * 89)
            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         test_loss=val_loss)
            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)
            # Save the model if the validation loss is the best we've seen
            # so far.
            if not best_val_loss or val_loss < best_val_loss:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)

def do_training(args):
    hyperparameters = {
        'lr': args.lr,
        'epochs': args.epochs,
        'resume_from': 0,
        'coco_version': args.coco_version,  # can be either '2014' or '2017'
        'batch_size': args.batch_size,
        'weight_decay': args.weight_decay,
        'momentum': args.momentum,
        'optimizer': args.optimizer,
        'alpha': args.alpha,
        'gamma': args.gamma,
        'lcoord': args.lcoord,
        'lno_obj': args.lno_obj,
        'iou_type': tuple(int(a) for a in tuple(args.iou_type)),
        'iou_ignore_thresh': args.iou_ignore_thresh,
        'tfidf': args.tfidf,
        'idf_weights': True,
        'tfidf_col_names': ['img_freq', 'none', 'none', 'none', 'no_softmax'],
        'wasserstein': args.wasserstein,
        'inf_confidence': args.inf_confidence,
        'inf_iou_threshold': args.inf_iou_threshold,
        'augment': args.augment,
        'workers': 1,
        'pretrained': args.is_pretrained,
        'path': args.trial_id,
        'reduction': args.reduction
    }

    mode = {
        'bayes_opt': False,
        'multi_scale': args.multi_scale,
        'show_hp': args.show_hp,
        'show_output': args.show_output,
        'multi_gpu': False,
        'train_subset': args.train_subset,
        'test_subset': args.test_subset,
        'show_temp_summary': args.show_temp_summary,
        'save_summary': False
    }

    this_proj = track.Project("./logs/" + args.experimentname)
    if args.resume == 'last':
        most_recent = this_proj.ids["start_time"].nlargest(2).idxmin()
        most_recent_id = this_proj.ids["trial_id"].iloc[[most_recent]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            most_recent_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')
        args.resume = most_recent_id.item()
    elif args.resume == 'best':
        ids = this_proj.ids["trial_id"]
        res = this_proj.results(ids)
        best_map = res["coco_stats:map_all"].idxmax()
        best_map_id = res["trial_id"].iloc[[best_map]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            best_map_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'best.tar')
        args.resume = best_map_id.item()
    else:
        PATH = os.path.join("./logs/" + args.experimentname, args.resume)
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')

    coco_version = hyperparameters['coco_version']
    mAP_best = 0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model, optimizer, hyperparameters, PATH = init_model.init_model(
        hyperparameters, mode)
    model.hp = hyperparameters
    model.mode = mode
    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
    else:
        inp_dim = model.inp_dim

    if hyperparameters['augment'] > 0:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose([
                                 Augment(hyperparameters['augment']),
                                 ResizeToTensor(inp_dim)
                             ]))
    else:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose(
                                 [ResizeToTensor(inp_dim)]))

    batch_size = hyperparameters['batch_size']
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=helper.collate_fn,
                                  num_workers=hyperparameters['workers'],
                                  pin_memory=True)

    test_dataset = Coco(partition='val',
                        coco_version=coco_version,
                        subset=mode['test_subset'],
                        transform=transforms.Compose([ResizeToTensor(inp_dim)]))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False,
                                 collate_fn=helper.collate_fn,
                                 num_workers=1,
                                 pin_memory=True)

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        # args.lr = adjust_learning_rate(epoch, optimizer, args.lr,
        #                                args.schedule, args.gamma)
        outcome = train(train_dataloader, model, optimizer, epoch)
        mAP = test(test_dataloader, model, epoch, device)
        track.debug(
            'Finished epoch %d... | train loss %.3f | avg_iou %.3f '
            '| avg_conf %.3f | avg_no_conf %.3f | avg_pos %.3f '
            '| avg_neg %.5f | mAP %.5f' %
            (epoch, outcome['avg_loss'], outcome['avg_iou'],
             outcome['avg_conf'], outcome['avg_no_conf'],
             outcome['avg_pos'], outcome['avg_neg'], mAP))

        model_fname = os.path.join(track.trial_dir(), "last.tar")
        torch.save(
            {
                'model_state_dict': model.module.state_dict()
                if type(model) is nn.DataParallel else model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'avg_loss': outcome['avg_loss'],
                'avg_iou': outcome['avg_iou'],
                'avg_pos': outcome['avg_pos'],
                'avg_neg': outcome['avg_neg'],
                'avg_conf': outcome['avg_conf'],
                'avg_no_conf': outcome['avg_no_conf'],
                'mAP': mAP,
                'hyperparameters': hyperparameters
            }, model_fname)

        if mAP > mAP_best:
            mAP_best = mAP
            best_fname = os.path.join(track.trial_dir(), "best.tar")
            track.debug("New best score! Saving model")
            torch.save(
                {
                    'model_state_dict': model.module.state_dict()
                    if type(model) is nn.DataParallel else model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'avg_loss': outcome['avg_loss'],
                    'avg_iou': outcome['avg_iou'],
                    'avg_pos': outcome['avg_pos'],
                    'avg_neg': outcome['avg_neg'],
                    'avg_conf': outcome['avg_conf'],
                    'avg_no_conf': outcome['avg_no_conf'],
                    'mAP': mAP,
                    'hyperparameters': hyperparameters
                }, best_fname)

def run(ensemble, proj_df, dataroot='./data', batch_size=128, cuda=False,
        class_ind=0, num_batches=4, tsne_train_iters=4000, **kwargs):
    """ let's do some dimensionality reduction """
    track.debug("[tsne] starting experiment with class %d" % class_ind)
    trainloader, testloader = build_single_class_dataset(
        'cifar10',
        class_ind=class_ind,
        dataroot=dataroot,
        batch_size=batch_size,
        eval_batch_size=batch_size,
        num_workers=2)

    # stores for any loader; we have to copy these to the last two dicts
    train_activations = {}
    labels = []

    track.debug("[tsne] starting forward passes")
    ensemble.models = ensemble.models[0::4]  # plot every 4 epochs for now
    for model_ind, model in enumerate(ensemble.models):
        # plot progress
        progress_bar(model_ind, len(ensemble.models))
        model_activations = []
        # this hook will aggregate a list of model outputs in `activations`
        model.linear.register_forward_pre_hook(
            _create_preactivation_hook(model_activations))

        with torch.no_grad():
            for inputs, _ in islice(trainloader, 0, num_batches):
                model(inputs)
        train_activations[model_ind] = torch.cat(model_activations)
        labels.extend([model_ind] * len(train_activations[model_ind]))

    # now, we have all activations for all models! we can do tsne
    track.debug("[tsne] forward pass done! starting stacking + embedding")
    all_train_activations = torch.cat(
        [vec for vec in train_activations.values()])
    embedding = tsne_embeddings(all_train_activations,
                                tsne_train_iters,
                                batch_size=len(all_train_activations),
                                cuda=cuda)

    f = plt.figure()
    # create labels for the models by iteration
    y = np.array(labels)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=y * 1.0 / y.max())

    # plot the model means too
    model_means = []
    num_model_vecs = len(list(train_activations.values())[0])
    endpoints = []
    start = 0
    for stop in range(0, len(embedding), num_model_vecs):
        if stop - start > 0:
            endpoints.append((start, stop))
        start = stop
    for start, stop in endpoints:
        model_means.append(embedding[start:stop, :].mean(axis=0))
    model_means = np.array(model_means)

    ys = np.array(list(range(len(model_means)))) / float(len(model_means))
    plt.scatter(model_means[:, 0], model_means[:, 1], c=ys, s=100,
                linewidth=2, edgecolors='black', marker='D')
    plt.axis('off')
    plt.savefig('/Users/noah/Dev/SGLD/embeddings.png', bbox_inches='tight')
    plt.close(f)
    track.debug("[tsne] done! saved to embeddings.png")

def _run(experiment):
    track.debug('Starting to run experiment: %s' % experiment)
    experiment_module = 'sgld.experiments.' + experiment
    runner = getattr(importlib.import_module(experiment_module), 'run')
    runner(model, trial_df, **vars(args))

def _main(_):
    with track.trial(os.getenv('TRACK_DIRECTORY'),
                     param_map=track.absl_flags(),
                     trial_prefix=flags.FLAGS.trial_prefix):
        seed_all(flags.FLAGS.seed)
        track.debug('found gpus {}', gpus())

        dataset_file = os.path.join(
            flags.FLAGS.dataroot, 'wikisql',
            'processed-toy{}.pth'.format(1 if flags.FLAGS.toy else 0))
        track.debug('loading data from {}', dataset_file)
        train, val, _ = torch.load(dataset_file)

        track.debug('building model')
        model = wikisql_specific.WikiSQLSpecificModel(train.fields)
        track.debug('built model:\n{}', model)
        num_parameters = int(
            sum(p.numel() for p in model.parameters() if p.requires_grad))
        track.debug('number of parameters in model {}', num_parameters)

        device = get_device()
        torch.save(model.to(torch.device('cpu')),
                   os.path.join(track.trial_dir(), 'untrained_model.pth'))
        model = model.to(device)
        training_state = _TrainingState()
        if flags.FLAGS.restore_checkpoint:
            _copy_best_checkpoint(flags.FLAGS.restore_checkpoint)
            _load_checkpoint(flags.FLAGS.restore_checkpoint, model,
                             training_state)

        params_to_optimize = [
            p for p in model.parameters() if p.requires_grad
        ]
        if flags.FLAGS.optimizer == 'sgd':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize,
                                  lr=1,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'momentum':
            # lr required here but will be set in _do_training
            optimizer = optim.SGD(params_to_optimize,
                                  lr=1,
                                  momentum=0.9,
                                  weight_decay=flags.FLAGS.weight_decay)
        elif flags.FLAGS.optimizer == 'adam':
            optimizer = optim.Adam(params_to_optimize,
                                   weight_decay=flags.FLAGS.weight_decay)
        else:
            raise ValueError('unrecognized optimizer {}'.format(
                flags.FLAGS.optimizer))

        num_workers = flags.FLAGS.workers
        track.debug('initializing {} workers', num_workers)
        with closing(SharedGPU(optimizer, model, num_workers)) as shared:
            _do_training(train, val, shared, training_state)

def _load_checkpoint(checkpoint_file, model, training_state):
    track.debug('restoring model from {}', checkpoint_file)
    state_dict = torch.load(checkpoint_file)
    model.load_state_dict(state_dict['model'])
    training_state.load_state_dict(state_dict['training_state'])

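# For reference, the checkpoint layout implied by `_load_checkpoint` is a dict
# with 'model' and 'training_state' entries; the matching `_save_checkpoint`
# (not shown in this excerpt) presumably does something like:
#
#   torch.save({'model': model.state_dict(),
#               'training_state': training_state.state_dict()}, path)
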
    # Save checkpoint.
    acc = 100.0 * correct / total
    if acc > best_acc:
        print("Saving..")
        state = {"net": net.state_dict(), "acc": acc, "epoch": epoch}
        if not os.path.isdir("checkpoint"):
            os.mkdir("checkpoint")
        ckpt_path = os.path.join(track.trial_dir(), "ckpt.pth")
        torch.save(state, ckpt_path)
        best_acc = acc

    test_loss = test_loss / len(testloader)
    return test_loss, acc, best_acc


with track.trial(args.logroot, None, param_map=vars(args)):
    for epoch in range(start_epoch, start_epoch + 200):
        train_loss, train_acc = train(epoch)
        test_loss, test_acc, best_acc = test(epoch)
        track.metric(
            iteration=epoch,
            train_loss=train_loss,
            train_acc=train_acc,
            test_loss=test_loss,
            test_acc=test_acc,
            best_acc=best_acc,
        )
        track.debug(
            f"epoch {epoch} finished with stats: best_acc = {best_acc} "
            f"| train_acc = {train_acc} | test_acc = {test_acc} "
            f"| train_loss = {train_loss} | test_loss = {test_loss}"
        )

def _do_training(train, val, shared, training_state):
    batch_size = flags.FLAGS.batch_size
    loss_window = RollingAverageWindow(len(train) // 10 // batch_size)
    acc_window = RollingAverageWindow(len(train) // 10 // batch_size)
    grad_window = RollingAverageWindow(len(train) // 10 // batch_size)

    def _tqdm_postfix():
        return {
            'loss': '{:06.3f}'.format(loss_window.value()),
            'acc': '{:05.1%}'.format(acc_window.value()),
            'gradnorm': '{:08.2e}'.format(grad_window.value())
        }

    shared.set_mode(evaluation=False)
    shared.lr(training_state.lr)
    perm = np.arange(len(train))
    for epoch in range(1 + training_state.epoch, 1 + flags.FLAGS.max_epochs):
        epochfmt = intfmt(flags.FLAGS.max_epochs)
        training_state.epoch = epoch
        track.debug('begin epoch ' + epochfmt, epoch)
        # one sample at a time greatly simplifies pytorch seq2seq!
        np.random.shuffle(perm)
        samples = (train[i] for i in perm)
        with tqdm(total=len(train), postfix=_tqdm_postfix()) as progbar:
            for exs in chunkify(samples, batch_size):
                shared.zero_grad()
                loss, acc, gradnorm = shared.train(exs)
                loss_window.update(loss)
                acc_window.update(acc)
                grad_window.update(gradnorm)
                shared.step()
                progbar.update(len(exs))
                progbar.set_postfix(**_tqdm_postfix())

        shared.set_mode(evaluation=True)
        val_diagnostics = _diagnose(val, shared)
        train_diagnostics = _diagnose(train, shared, min(len(val), len(train)))
        track.metric(iteration=epoch, lr=training_state.lr)
        track.metric(iteration=epoch,
                     **{'val ' + k: v for k, v in val_diagnostics.items()})
        track.metric(iteration=epoch,
                     **{'train ' + k: v for k, v in train_diagnostics.items()})
        shared.set_mode(evaluation=False)
        val_diagnostics_str = _str_diagnostics('val', val_diagnostics)
        train_diagnostics_str = _str_diagnostics('(sampled) train',
                                                 train_diagnostics)
        track.debug('epoch ' + epochfmt + ' of ' + epochfmt + '\n{}\n{}',
                    epoch, flags.FLAGS.max_epochs, val_diagnostics_str,
                    train_diagnostics_str)

        cur_val_loss = val_diagnostics['loss (*total)']
        if cur_val_loss < training_state.best_val_loss:
            training_state.patience = training_state.initial_patience
            training_state.best_val_loss = cur_val_loss
            best_file = _checkpoint_file('best.pth')
            track.debug('updating best model into file {}', best_file)
            _save_checkpoint(best_file, shared.model, training_state)
        else:
            training_state.patience -= 1
            track.debug('val loss not improving; dropping patience')

        shared.lr(training_state.lr)
        if training_state.patience == 0:
            track.debug('out of patience, dropping lr')
            training_state.lr *= flags.FLAGS.lr_decay_rate
            training_state.patience = training_state.initial_patience
        track.debug('lr {} patience {} best val loss so far {}',
                    training_state.lr, training_state.patience,
                    training_state.best_val_loss)

        early_stop = training_state.lr < flags.FLAGS.min_lr
        if early_stop:
            track.debug(
                'lr dropped to {} < min tolerable lr {}, early stopping',
                training_state.lr, flags.FLAGS.min_lr)

        if _check_period(epoch, flags.FLAGS.persist_every) or early_stop:
            epochfmt = intfmt(flags.FLAGS.max_epochs, fill='0')
            checkpoint_file = _checkpoint_file(epochfmt.format(epoch) + '.pth')
            track.debug('persisting model to {}', checkpoint_file)
            _save_checkpoint(checkpoint_file, shared.model, training_state)

        if early_stop:
            break

def run(ensemble, trial_df, results_dir='./logs', dataroot='./data',
        class_ind=0, batch_size=128, eval_batch_size=100, cuda=False,
        num_workers=2, start_epoch=160, end_epoch=200, **unused):
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=2)

    # this will only iterate over examples of one class
    class_trainloader, class_testloader = build_single_class_dataset(
        'cifar10',
        class_ind=class_ind,
        dataroot=dataroot,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_workers=2)

    full_ensemble = ensemble
    track.debug("[ensemble_size] starting to test all ensembles (class = %d)"
                % class_ind)
    for i in range(len(ensemble.models)):
        ensemble_size = i + 1
        model_ind = len(ensemble.models) - 1 - i
        track.debug("[ensemble_size] starting size %d / %d ensemble"
                    % (i + 1, len(ensemble.models)))
        ensemble_loss = SoftmaxNLL()
        one_loss = CrossEntropyLoss()
        entropy_criterion = Entropy()
        ensemble = Ensemble(full_ensemble.models[model_ind:])
        single_model = full_ensemble.models[model_ind]

        # we want to do metrics for (a) the ensemble with varying sizes and
        # (b) the individual models corresponding to that epoch
        def _test_dataset(model, testloader, criterion):
            loss, acc = test(testloader, model, criterion, epoch=-1,
                             cuda=cuda, metric=False)
            # compute the entropy of the model post-hoc as well
            entropy = test(testloader, model, entropy_criterion, epoch=-1,
                           cuda=cuda, metric=False,
                           criterion_has_labels=False, compute_acc=False)
            return loss, acc, entropy

        # metrics for both models over both datasets:
        #   (a) on the whole dataset
        #       (i)  for the ensemble
        #       (ii) for the single model from this epoch
        #   (b) on a single class
        #       (i)  for the ensemble
        #       (ii) for the single model from this epoch
        stats = {}
        models = (ensemble, single_model)
        loaders = (testloader, class_testloader)
        losses = (ensemble_loss, one_loss)
        model_names = ['ensemble', 'single_model']
        loader_names = ['full', 'single_class']
        for i, j in itertools.product(range(len(models)),
                                      range(len(loaders))):
            track.debug("[ensemble size: %d] Evaluating loss/acc/entropy for "
                        "%s on %s dataset"
                        % (ensemble_size, model_names[i], loader_names[j]))
            metric = model_names[i] + '_' + loader_names[j]
            loss, acc, entropy = _test_dataset(models[i], loaders[j],
                                               losses[i])
            stats[metric + '_loss'] = loss
            stats[metric + '_acc'] = acc
            stats[metric + '_entropy'] = entropy
        track.metric(ensemble_size=ensemble_size, **stats)