def train_and_report(model_name, kernel, warp, ard):
    dataset_dir = os.path.join(MODEL_DIR, DATASET)
    try:
        os.makedirs(dataset_dir)
    except OSError:
        print "skipping output folder"
    for fold in xrange(10):
        fold_dir = os.path.join(SPLIT_DIR, DATASET, str(fold))
        train_data = np.loadtxt(os.path.join(fold_dir, 'train'))
        test_data = np.loadtxt(os.path.join(fold_dir, 'test'))
        params_file = None
        output_dir = os.path.join(dataset_dir, str(fold))
        try:
            os.makedirs(output_dir)
        except OSError:
            print "skipping output folder"
        if ard:
            iso_dir = output_dir.replace('True', 'False')
            params_file = os.path.join(iso_dir, 'params')
        gp = util.train_gp_model(train_data, kernel, warp, ard, params_file)
        util.save_parameters(gp, os.path.join(output_dir, 'params'))
        util.save_gradients(gp, os.path.join(output_dir, 'grads'))
        metrics = util.get_metrics(gp, test_data)
        util.save_metrics(metrics, os.path.join(output_dir, 'metrics'))
        util.save_cautious_curves(gp, test_data, os.path.join(output_dir, 'curves'))
        util.save_predictions(gp, test_data, os.path.join(output_dir, 'preds'))
        asym_metrics = util.get_asym_metrics(gp, test_data)
        util.save_asym_metrics(asym_metrics, os.path.join(output_dir, 'asym_metrics'))
        gc.collect(2)  # buggy GPy has allocation cycles...
def train_and_report(model_name, kernel, warp, ard, likelihood='gaussian'):
    dataset_dir = os.path.join(MODEL_DIR, DATASET)
    try:
        os.makedirs(dataset_dir)
    except OSError:
        print "skipping output folder"
    for fold in xrange(10):
        fold_dir = os.path.join(SPLIT_DIR, DATASET, str(fold))
        train_data = np.loadtxt(os.path.join(fold_dir, 'train'))
        test_data = np.loadtxt(os.path.join(fold_dir, 'test'))
        output_dir = os.path.join(dataset_dir, str(fold))
        params_file = None
        if ard:
            iso_dir = output_dir.replace('True', 'False')
            params_file = os.path.join(iso_dir, 'params')
        gp = util.train_gp_model(train_data, kernel, warp, ard, params_file,
                                 likelihood=likelihood)
        metrics = util.get_metrics(gp, test_data)
        try:
            os.makedirs(output_dir)
        except OSError:
            print "skipping output folder"
        util.save_parameters(gp, os.path.join(output_dir, 'params'))
        util.save_metrics(metrics, os.path.join(output_dir, 'metrics'))
        #util.save_gradients(gp, os.path.join(output_dir, 'grads'))
        util.save_cautious_curves(gp, test_data, os.path.join(output_dir, 'curves'))
        util.save_predictions(gp, test_data, os.path.join(output_dir, 'preds'))
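# Hedged sketch, not part of the original util module: one plausible shape for the
# util.get_metrics helper called by both train_and_report variants above. It assumes
# `gp` is a GPy-style regression model whose predict() returns a (mean, variance) pair
# and that test_data stacks inputs with the target in the last column. The helper name
# and the metric set (RMSE, MAE, Gaussian NLPD) are illustrative assumptions only.
import numpy as np

def get_metrics_sketch(gp, test_data):
    X_test, y_test = test_data[:, :-1], test_data[:, -1:]
    mean, var = gp.predict(X_test)  # predictive mean and variance (column vectors)
    rmse = float(np.sqrt(np.mean((mean - y_test) ** 2)))
    mae = float(np.mean(np.abs(mean - y_test)))
    # negative log predictive density under a Gaussian predictive distribution
    nlpd = float(np.mean(0.5 * np.log(2 * np.pi * var) + (y_test - mean) ** 2 / (2 * var)))
    return {'rmse': rmse, 'mae': mae, 'nlpd': nlpd}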
def main(args):
    os.makedirs(args.exp_dir, exist_ok=True)

    # ==== LOAD DATA ====
    if args.debug:
        max_data = 1000
    else:
        max_data = None
    train = SNLI("data/snli_1.0/", "train", max_data=max_data)
    val = SNLI("data/snli_1.0/", "dev", max_data=max_data, vocab=(train.stoi, train.itos))
    dataloaders = {
        "train": DataLoader(
            train,
            batch_size=100,
            shuffle=True,
            pin_memory=False,
            num_workers=0,
            collate_fn=pad_collate,
        ),
        "val": DataLoader(
            val,
            batch_size=100,
            shuffle=False,
            pin_memory=True,
            num_workers=0,
            collate_fn=pad_collate,
        ),
    }

    # ==== BUILD MODEL ====
    model = build_model(
        len(train.stoi),
        args.model_type,
        embedding_dim=args.embedding_dim,
        hidden_dim=args.hidden_dim,
    )
    if args.cuda:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    metrics = defaultdict(list)
    metrics["best_val_epoch"] = 0
    metrics["best_val_acc"] = 0
    metrics["best_val_loss"] = np.inf

    # Save model with 0 training
    util.save_checkpoint(serialize(model, train), False, args.exp_dir, filename="0.pth")

    # ==== TRAIN ====
    for epoch in range(args.epochs):
        train_metrics = run("train", epoch, model, optimizer, criterion, dataloaders, args)
        val_metrics = run("val", epoch, model, optimizer, criterion, dataloaders, args)

        for name, val in train_metrics.items():
            metrics[f"train_{name}"].append(val)
        for name, val in val_metrics.items():
            metrics[f"val_{name}"].append(val)

        is_best = val_metrics["acc"] > metrics["best_val_acc"]
        if is_best:
            metrics["best_val_epoch"] = epoch
            metrics["best_val_acc"] = val_metrics["acc"]
            metrics["best_val_loss"] = val_metrics["loss"]

        util.save_metrics(metrics, args.exp_dir)
        util.save_checkpoint(serialize(model, train), is_best, args.exp_dir)
        if epoch % args.save_every == 0:
            util.save_checkpoint(serialize(model, train), False, args.exp_dir, filename=f"{epoch}.pth")
print("Starting training for %s steps max" % args.num_steps) classifier.fit( x=train_bottlenecks.astype(np.float32), y=train_ground_truth, batch_size=10, max_steps=args.num_steps) # We've completed our training, so run a test evaluation on some new images we haven't used before. test_bottlenecks, test_ground_truth, image_paths = util.get_all_cached_bottlenecks( sess, image_lists, 'testing', args.bottleneck_dir, args.image_dir, jpeg_data_tensor, bottleneck_tensor) test_bottlenecks = np.array(test_bottlenecks) test_ground_truth = np.array(test_ground_truth) print("evaluating....") classifier.evaluate(test_bottlenecks.astype(np.float32), test_ground_truth) # write the output labels file if it doesn't already exist if gfile.Exists(output_labels_file): print("Labels list file already exists; not writing.") else: output_labels = json.dumps(list(image_lists.keys())) with gfile.FastGFile(output_labels_file, 'w') as f: f.write(output_labels) print("\nSaving metrics...") util.save_metrics(args, classifier, test_bottlenecks.astype(np.float32), all_label_names, test_ground_truth, \ image_paths, image_lists, exemplars) util_plot.plot_metrics(args.model_dir) print("Done !")
def main(args):
    os.makedirs(args.exp_dir, exist_ok=True)

    # ==== LOAD DATA ====
    train = IMDB("data/imdb/", "train")
    # FIXME: sample a real dev set later (this isn't too bad because we are not
    # trying to optimize for test set perf at all)
    val = IMDB("data/imdb/", "test")
    dataloaders = {
        "train": DataLoader(
            train,
            batch_size=100,
            shuffle=True,
            pin_memory=True,
            num_workers=0,
            collate_fn=pad_collate,
        ),
        "val": DataLoader(
            val,
            batch_size=100,
            shuffle=False,
            pin_memory=True,
            num_workers=0,
            collate_fn=pad_collate,
        ),
    }

    # ==== BUILD MODEL ====
    model = build_model(
        len(train.stoi),
        args.model_type,
        embedding_dim=args.embedding_dim,
        hidden_dim=args.hidden_dim,
    )
    if args.cuda:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    metrics = defaultdict(list)
    metrics["best_val_epoch"] = 0
    metrics["best_val_acc"] = 0
    metrics["best_val_loss"] = np.inf

    # Save model with 0 training
    util.save_checkpoint(serialize(model, train), False, args.exp_dir, filename="0.pth")

    # ==== TRAIN ====
    for epoch in range(args.epochs):
        train_metrics = run(
            "train", epoch, model, optimizer, criterion, dataloaders, args
        )
        val_metrics = run("val", epoch, model, optimizer, criterion, dataloaders, args)

        for name, val in train_metrics.items():
            metrics[f"train_{name}"].append(val)
        for name, val in val_metrics.items():
            metrics[f"val_{name}"].append(val)

        is_best = val_metrics["acc"] > metrics["best_val_acc"]
        if is_best:
            metrics["best_val_epoch"] = epoch
            metrics["best_val_acc"] = val_metrics["acc"]
            metrics["best_val_loss"] = val_metrics["loss"]

        util.save_metrics(metrics, args.exp_dir)
        util.save_checkpoint(serialize(model, train), is_best, args.exp_dir)
        if epoch % args.save_every == 0:
            util.save_checkpoint(
                serialize(model, train), False, args.exp_dir, filename=f"{epoch}.pth"
            )
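# Hedged sketch, assumed rather than taken from the project's util module: a common
# implementation of the util.save_checkpoint / util.save_metrics helpers that the two
# training loops above rely on -- torch.save to a rolling checkpoint, copy it to a
# "best" file when is_best is True, and dump the metrics dict to JSON.
import json
import os
import shutil
import torch

def save_checkpoint_sketch(state, is_best, exp_dir, filename="checkpoint.pth"):
    path = os.path.join(exp_dir, filename)
    torch.save(state, path)  # always keep the latest state
    if is_best:
        shutil.copyfile(path, os.path.join(exp_dir, "model_best.pth"))

def save_metrics_sketch(metrics, exp_dir):
    with open(os.path.join(exp_dir, "metrics.json"), "w") as f:
        json.dump(dict(metrics), f, indent=2)  # defaultdict -> plain dict for JSON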
        random_state=random)

    for metric, value in train_metrics.items():
        try:
            metrics['train_{}'.format(metric)].append(value)
        except KeyError:
            pass  # Could be missing due to resuming from older code
    for metric, value in val_metrics.items():
        try:
            metrics['val_{}'.format(metric)].append(value)
        except KeyError:
            pass

    metrics['current_epoch'] = epoch
    is_best = val_metrics['f1'] > metrics['best_f1']
    if is_best:
        metrics['best_f1'] = val_metrics['f1']
        metrics['best_loss'] = val_metrics['loss']
        metrics['best_epoch'] = epoch

    # Save model
    util.save_checkpoint(
        {
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch
        }, is_best, args.exp_dir)

    # Save metrics
    util.save_metrics(metrics, args.exp_dir)
def main(file_path, batch_size, base_model, num_epochs):
    """Train movie sentiment model"""
    # %%
    # base_model = "roberta-base"
    # batch_size = 8
    # num_epochs = 5
    print("Initializing models")
    tokenizer = RobertaTokenizerFast.from_pretrained(base_model)
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model = RoBERTaSentimentClassifier(device=device, base_model=base_model)
    print(f"Using device {model.device}")

    # %%
    train_cache = Path(".data/cache/train_data")
    val_cache = Path(".data/cache/validate_data")
    if train_cache.exists() and val_cache.exists():
        print("Load cached datasets")
        train = load_cached_dataset(train_cache)
        val = load_cached_dataset(val_cache)
    else:
        print("Generating datasets")
        PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

        # set up fields
        TEXT = data.Field(use_vocab=False,
                          include_lengths=False,
                          batch_first=True,
                          lower=False,
                          fix_length=512,
                          tokenize=tokenizer.encode,
                          pad_token=PAD_INDEX,
                          unk_token=UNK_INDEX)
        LABEL = data.LabelField()

        # make splits for data
        train, test = datasets.IMDB.splits(TEXT, LABEL)
        LABEL.build_vocab(train)
        test, val = test.split(split_ratio=0.9)

        print("Cache train and validate sets")
        save_cached_dataset(train, train_cache)
        save_cached_dataset(val, val_cache)

    print("Prepare dataset iterators")
    # make iterator for splits
    train_iter, val_iter = data.BucketIterator.splits((train, val),
                                                      batch_size=batch_size,
                                                      device=device)

    # %%
    # sanity check: every validation batch should pair each text with a label
    for batch in val_iter:
        if batch.text.shape[0] != batch.label.shape[0]:
            print(batch)
        # print(batch.text.shape, batch.label.shape)
        # break

    # %%
    # dir(val_iter)

    # %%
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    best_valid_loss = float("Inf")

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    # peek at one training batch
    for item in train_iter:
        print(item)
        break

    print("Start training")
    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}")
        train_iter.init_epoch()
        val_iter.init_epoch()

        for i, (text, labels) in enumerate(tqdm(train_iter, desc="train")):
            labels = labels.type(torch.LongTensor)
            labels = labels.to(device)
            output = model(text, labels)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

        model.eval()
        with torch.no_grad():
            answers = []
            # validation loop
            for i, (text, labels) in enumerate(tqdm(val_iter, desc="validate")):
                labels = labels.type(torch.LongTensor)
                labels = labels.to(device)
                output = model(text, labels)
                loss, preds = output
                correct = torch.argmax(preds, dim=1) == labels
                answers.extend(correct.cpu().tolist())
                valid_running_loss += loss.item()

        average_accuracy = sum(1 for a in answers if a) / len(answers)

        # evaluation: average the running losses over the batches seen this epoch
        average_train_loss = running_loss / len(train_iter)
        average_valid_loss = valid_running_loss / len(val_iter)
        train_loss_list.append(average_train_loss)
        valid_loss_list.append(average_valid_loss)
        global_steps_list.append(global_step)

        # resetting running values
        running_loss = 0.0
        valid_running_loss = 0.0
        model.train()

        # print progress
        print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, Valid Acc: {:.4f}'
              .format(epoch, num_epochs, global_step,
                      num_epochs * len(train_iter),
                      average_train_loss, average_valid_loss, average_accuracy))

        # checkpoint
        if best_valid_loss > average_valid_loss:
            best_valid_loss = average_valid_loss
            save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
            save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')
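# Hedged sketch of the save_checkpoint / save_metrics helpers assumed by the loop above;
# they are not shown in the snippet, so this is one plausible implementation: a single
# torch.save of the model state plus the best validation loss, and a second file holding
# the loss curves and step counts.
import torch

def save_checkpoint_sketch(path, model, valid_loss):
    torch.save({'model_state_dict': model.state_dict(),
                'valid_loss': valid_loss}, path)

def save_metrics_sketch(path, train_loss_list, valid_loss_list, global_steps_list):
    torch.save({'train_loss_list': train_loss_list,
                'valid_loss_list': valid_loss_list,
                'global_steps_list': global_steps_list}, path)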
def train_net(model, args):

    data_path = args.data_path
    num_epochs = args.epochs
    gpu = args.gpu
    n_classes = args.classes
    data_width = args.width
    data_height = args.height

    # set device configuration
    device_ids = []

    if gpu == 'gpu':
        if not torch.cuda.is_available():
            print("No cuda available")
            raise SystemExit

        device = torch.device(args.device1)
        device_ids.append(args.device1)

        if args.device2 != -1:
            device_ids.append(args.device2)
        if args.device3 != -1:
            device_ids.append(args.device3)
        if args.device4 != -1:
            device_ids.append(args.device4)
    else:
        device = torch.device("cpu")

    if len(device_ids) > 1:
        model = nn.DataParallel(model, device_ids=device_ids)

    model = model.to(device)

    # split images into training and validation sets
    train_dataset = SampleDataset(data_path)

    print('total image : {}'.format(len(train_dataset)))

    train_indices, val_indices = train_test_split(np.arange(len(train_dataset)),
                                                  test_size=0.2,
                                                  random_state=42)

    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(train_dataset, batch_size=20, num_workers=4, sampler=train_sampler)
    val_loader = DataLoader(train_dataset, batch_size=10, num_workers=4, sampler=valid_sampler)

    model_folder = os.path.abspath('./checkpoints')
    if not os.path.exists(model_folder):
        os.mkdir(model_folder)

    if args.model == 'UNet':
        model_path = os.path.join(model_folder, 'UNet.pth')
    elif args.model == 'SegNet':
        model_path = os.path.join(model_folder, 'SegNet.pth')
    elif args.model == 'DenseNet':
        model_path = os.path.join(model_folder, 'DenseNet.pth')

    # set optimizer
    optimizer = torch.optim.Adam(model.parameters())

    # main train
    display_steps = 30
    best_loss = 1e10
    loss_history = []

    # for early stopping
    early_stop = False
    patience = 7
    counter = 0

    for epoch in range(num_epochs):

        print('Starting epoch {}/{}'.format(epoch + 1, num_epochs))

        # train
        model.train()

        metrics = defaultdict(float)
        epoch_size = 0

        # train model
        for batch_idx, (images, masks) in enumerate(train_loader):

            images = images.to(device).float()
            masks = masks.to(device).long()

            optimizer.zero_grad()
            outputs = model(images)

            loss, cross, dice = combined_loss(outputs, masks.squeeze(1), device, n_classes)
            save_metrics(metrics, images.size(0), loss, cross, dice)

            loss.backward()
            optimizer.step()

            # statistics
            epoch_size += images.size(0)

            if batch_idx % display_steps == 0:
                print(' ', end='')
                print('batch {:>3}/{:>3} cross: {:.4f} , dice {:.4f} , combined_loss {:.4f}\r'
                      .format(batch_idx + 1, len(train_loader), cross.item(), dice.item(), loss.item()))

            del images, masks, outputs, loss, cross, dice

        print_metrics(metrics, epoch_size, 'train')

        # evaluate
        print('Finished epoch {}, starting evaluation'.format(epoch + 1))

        model.eval()

        # validate model
        for images, masks in val_loader:
            images = images.to(device).float()
            masks = masks.to(device).long()

            outputs = model(images)
            loss, cross, dice = combined_loss(outputs, masks.squeeze(1), device, n_classes)
            save_metrics(metrics, images.size(0), loss, cross, dice)

            # statistics
            epoch_size += images.size(0)

            del images, masks, outputs, loss, cross, dice

        print_metrics(metrics, epoch_size, 'val')

        epoch_loss = metrics['loss'] / epoch_size

        # save model if best validation loss
        if epoch_loss < best_loss:
            print("saving best model")
            best_loss = epoch_loss

            model_copy = copy.deepcopy(model)
            model_copy = model_copy.cpu()

            model_state_dict = model_copy.module.state_dict() if len(device_ids) > 1 else model_copy.state_dict()
            torch.save(model_state_dict, model_path)

            del model_copy

            counter = 0
        else:
            counter += 1
            print('EarlyStopping counter : {:>3} / {:>3}'.format(counter, patience))

            if counter >= patience:
                early_stop = True

        loss_history.append(best_loss)

        print('Best val loss: {:4f}'.format(best_loss))

        if early_stop:
            print('Early Stopping')
            break

    return loss_history
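# Hedged sketch of the save_metrics / print_metrics helpers assumed by train_net above;
# they are not part of the snippet, so this is one implementation consistent with how
# they are called: accumulate batch-size-weighted sums in the metrics defaultdict so
# that metrics['loss'] / epoch_size gives the running average used for checkpointing.
def save_metrics_sketch(metrics, batch_size, loss, cross, dice):
    metrics['loss'] += loss.item() * batch_size
    metrics['cross'] += cross.item() * batch_size
    metrics['dice'] += dice.item() * batch_size

def print_metrics_sketch(metrics, epoch_size, phase):
    outputs = ['{}: {:.4f}'.format(k, v / epoch_size) for k, v in metrics.items()]
    print('{}: {}'.format(phase, ', '.join(outputs)))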