def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=5,
                batch_size=32):
    # Create the generators
    logging.info("Training model for {} epochs and {} batch size".format(
        epochs, batch_size))
    logging.info("Flowing the train and validation sets")
    traingen = trainset.flow(
        batch_size=batch_size, shuffle=True, seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=batch_size, shuffle=False)

    # Create the callbacks
    logging.info("Creating the callbacks")
    callbacks = [
        ModelCheckpoint(
            utils.get_model_path(RUN_ID),
            "val_loss",
            verbose=1,
            save_best_only=True),
        Plotter(
            "loss",
            scale='log',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID),
            block_on_end=False),
        Plotter(
            "accuracy",
            scale='linear',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID + "_acc"),
            block_on_end=False)
    ]

    # Create the optimizer
    logging.info("Creating the optimizer")
    params = [param for param in model.parameters() if param.requires_grad]
    # optimizer = optim.SGD(
    #     params,
    #     lr=0.01,
    #     momentum=0.9,
    #     nesterov=True)
    optimizer = optim.Adam(params)
    logging.info("Optimizer: %r" % optimizer)

    # Train the model
    logs = model.fit_generator(
        traingen,
        traingen.steps_per_epoch,
        epochs=epochs,
        optimizer=optimizer,
        validation_generator=valgen,
        validation_steps=valgen.steps_per_epoch,
        metrics=["accuracy"],
        callbacks=callbacks,
        verbose=1)

    return logs
def fit(self, X, y, *args, parallel=None, **kwargs):
    """
    :param X: n_samples X n_models
    :param y: n_samples
    :param args: placeholder
    :param kwargs: placeholder
    """
    data = NpDataset(X, y=y)
    self.__val_preds = np.zeros(y.shape)
    if self.__val_preds.ndim != 1:
        logging.error(
            "Shape of validation predictions is incorrect: {}".format(
                self.__val_preds.shape))
    model_and_preds = parallel(
        delayed(fit_fold_model)(self.models[i], train_data.x, train_data.y,
                                val_data.x, val_data.y, *args, **kwargs)
        for i, (train_data, val_data) in enumerate(
            data.kfold(self.k, shuffle=False)))
    cur_sample_ind = 0
    for i, (model, val_preds) in enumerate(model_and_preds):
        self.models[i] = model
        self.__val_preds[cur_sample_ind:cur_sample_ind +
                         val_preds.shape[0]] = val_preds
        cur_sample_ind += val_preds.shape[0]
    assert cur_sample_ind == X.shape[0]
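# Usage sketch (assumption, not part of the original code): the `parallel`
# argument above is treated as a joblib Parallel instance, since the folds
# are dispatched through `delayed(fit_fold_model)`. The `ensemble` name and
# n_jobs value here are hypothetical placeholders.
from joblib import Parallel

with Parallel(n_jobs=4) as parallel:
    # Fits one fold model per k-fold split and collects out-of-fold predictions
    ensemble.fit(X, y, parallel=parallel)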
def test_model(model: SLModel, test_data: NpDataset, batch_size=BATCH_SIZE):
    logging.info(
        "Testing model with batch size of {batch_size}".format(**locals()))
    logging.info("Flowing the test set")
    test_data.output_labels = False
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)
    test_preds = model.predict_generator(
        testgen, testgen.steps_per_epoch, verbose=1)
    return test_preds.squeeze(-1)
def roc_auc(self, X, y, *args, **kwargs):
    data = NpDataset(X, y=y)
    score = 0.
    for i, (train_data, val_data) in enumerate(
            data.kfold(self.k, shuffle=False)):
        score = score + self.models[i].roc_auc(val_data.x, val_data.y,
                                               *args, **kwargs)
    return score / self.k
def validate_model(model: SLModel, val_data: NpDataset, batch_size=32):
    logging.info("Validating model with batch size of {}".format(batch_size))
    val_data.output_labels = False
    logging.info("Flowing the validation set")
    valgen = val_data.flow(batch_size=batch_size, shuffle=False)
    logging.info("Getting validation predictions")
    val_preds = model.predict_generator(valgen, valgen.steps_per_epoch)
    score = roc_auc_score(val_data.y[:, 0], val_preds[:, 0])
    logging.info("Validation ROC AUC score: {}".format(score))
    return score
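# Usage sketch (assumption, not part of the original code): wiring the
# train_model / validate_model / test_model helpers above together for a
# single run. `build_model`, `trainset`, `valset`, and `testset` are
# hypothetical stand-ins for whatever the surrounding script provides.
model = build_model()
train_model(model, trainset, valset, epochs=5, batch_size=32)
val_score = validate_model(model, valset, batch_size=32)
test_preds = test_model(model, testset, batch_size=32)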
def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                plot=True):
    # Create the generators
    logging.info("Training model for {epochs} epochs and {batch_size} batch "
                 "size".format(**locals()))
    logging.info("Flowing the train and validation sets")
    traingen = trainset.flow(
        batch_size=batch_size, shuffle=True, seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=batch_size, shuffle=False)

    # Create the callbacks
    logging.info("Creating the callbacks")
    callbacks = [
        ModelCheckpoint(
            utils.get_model_path(RUN_ID),
            "val_loss",
            verbose=1,
            save_best_only=True),
        Plotter(
            "bce",
            scale='log',
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_bce"),
            block_on_end=False),
        Plotter(
            "dice",
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_dice"),
            block_on_end=False),
        Plotter(
            "iou",
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_iou"),
            block_on_end=False),
    ]

    # Train the model
    logs = model.fit_generator(
        traingen,
        traingen.steps_per_epoch,
        epochs=epochs,
        validation_data=valgen,
        validation_steps=valgen.steps_per_epoch,
        callbacks=callbacks,
        metrics=["iou", mean_iou],
        verbose=1)

    return logs
def load_train(self):
    # Just load the data into a numpy dataset, it ain't that big
    logging.info("Loading train images from {self.path_to_train_images} "
                 "and masks from {self.path_to_train_masks}"
                 "".format(**locals()))
    img_paths = sorted(glob(self.glob_train_images))
    mask_paths = set(glob(self.glob_train_masks))  # Use set to look up
    # Initialize the numpy data containers: one (possibly all-zero) mask per image
    x = np.zeros((len(img_paths), ) + self.img_size + (4, ))
    y = np.zeros((len(img_paths), ) + self.img_size + (1, ))
    ids = []
    for i, img_path in enumerate(tqdm(img_paths)):
        img_basename = os.path.basename(img_path)
        ids.append(os.path.splitext(img_basename)[0])
        x[i, ..., :3] = ImageDataset.load_img(
            img_path, img_size=None, mode=self.mode)[0]
        x[i, ..., 3] = self.depths.loc[ids[-1]]
        # Load the mask
        mask_path = os.path.join(self.path_to_train_masks, img_basename)
        # Use the 0 mask if it's not there
        if mask_path not in mask_paths:
            logging.info("Could not find {img_basename} in masks"
                         "".format(**locals()))
            continue
        y[i] = ImageDataset.load_img(mask_path, img_size=None, mode="gray")[0]
    print("X shape:", x.shape)
    print("Y Shape:", y.shape)
    return NpDataset(x.astype('float32'), y.astype('float32'),
                     ids=np.array(ids))
def test_model(model: SLModel, test_data: NpDataset, batch_size=32):
    logging.info("Testing model with batch size of {}".format(batch_size))
    logging.info("Flowing the test set")
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)
    test_preds = model.predict_generator(
        testgen, testgen.steps_per_epoch, verbose=1)
    return test_preds[:, 0]
def kfold(self, k=True, shuffle=False, seed=None):
    for train_split, val_split in self.original_dataset.get_kfold_indices(
            k, shuffle, seed):
        train_data = MultiNpDatasetAugmenter(*(NpDataset(
            dataset.x[train_split],
            y=None if not dataset.has_labels else dataset.y[train_split],
            ids=None if not dataset.has_ids else dataset.ids[train_split])
            for dataset in self.datasets))
        val_data = NpDataset(
            self.original_dataset.x[val_split],
            y=None if not self.original_dataset.has_labels else
            self.original_dataset.y[val_split],
            ids=None if not self.original_dataset.has_ids else
            self.original_dataset.ids[val_split])
        yield train_data, val_data
def load_supervised(data):
    ids = data["ids"]
    text = data["texts"]
    if "labels" in data:
        labels = data["labels"]
    else:
        labels = None
    return ids, NpDataset(text, labels, ids=ids)
def fit(self, *args, **kwargs):
    scores = np.zeros(len(LABEL_NAMES))
    rocs = np.zeros(len(LABEL_NAMES))
    accs = np.zeros(len(LABEL_NAMES))
    for label_num in range(len(LABEL_NAMES)):
        logging.info("Training for label {label}".format(
            label=LABEL_NAMES[label_num]))
        subdataset = NpDataset(self.base_dataset.x[..., label_num],
                               y=self.base_dataset.y[..., label_num])
        # Best stuff
        best_score = float('inf')
        best_param_num = 0
        for param_num in trange(len(self.model[label_num])):
            # This will also save the val_preds
            self.model[label_num][param_num].fit(subdataset.x, subdataset.y,
                                                 *args, **kwargs)
            # Save the model if it's our best so far
            score = self.model[label_num][param_num].score(
                subdataset.x, subdataset.y)
            print(self.grid[param_num])
            if score < best_score:
                logging.info(
                    "Score improved from {best_score} to {score}".format(
                        best_score=best_score, score=score))
                self.best_model.models[label_num] = copy.deepcopy(
                    self.model[label_num][param_num])
                best_score = score
                best_param_num = param_num
                # Calculate the stats
                scores[label_num] = best_score
                rocs[label_num] = self.best_model.models[label_num].roc_auc(
                    subdataset.x, subdataset.y)
                accs[label_num] = self.best_model.models[label_num].accuracy(
                    subdataset.x, subdataset.y)
            # Remove the current model from memory
            self.model[label_num][param_num] = None
        logging.info(
            "Best score achieved is {best_score} with params {best_params}"
            .format(best_score=best_score,
                    best_params=self.grid[best_param_num]))
        logging.info(
            "Metrics are: Accuracy - {acc} --- ROC AUC - {roc}".format(
                acc=accs[label_num], roc=rocs[label_num]))
    logging.info("C")
    logging.info(
        "Average Scores: LogLoss - {loss} --- Accuracy - {acc} --- ROC AUC - {roc}"
        .format(loss=np.average(scores), acc=np.average(accs),
                roc=np.average(rocs)))
def test_augmenter_basic():
    # Try different combinations of with labels and without
    data = NpDataset(x=np.ones((32, )), y=np.ones((32, )))

    augmenter = ZeroAugmenter(labels=False, augment_labels=False)
    assert not augmenter.labels
    assert not augmenter.augment_labels
    data.output_labels = False
    x = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)

    augmenter = ZeroAugmenter(labels=False, augment_labels=True)
    assert not augmenter.labels
    assert augmenter.augment_labels
    data.output_labels = False
    x = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)

    augmenter = ZeroAugmenter(labels=True, augment_labels=False)
    assert augmenter.labels
    assert not augmenter.augment_labels
    data.output_labels = True
    x, y = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)
    assert np.all(y == 1.)

    augmenter = ZeroAugmenter(labels=True, augment_labels=True)
    assert augmenter.labels
    assert augmenter.augment_labels
    data.output_labels = True
    x, y = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)
    assert np.all(y == 0.)

    # Try a generic python generator
    def datagen():
        yield np.ones((32, ))

    augmenter = ZeroAugmenter(labels=False, augment_labels=False)
    x = next(augmenter(datagen()))
    assert np.all(x == 0.)
def validation_split(self, split=0.2, shuffle=False, seed=None,
                     stratified=False):
    """
    NOTE: Only use stratified if the labels are the same between the
    augmented sets.

    This assumes the first dataset provided is the original and the others
    are augmented versions. Thus the validation set will only be pulled
    from the original dataset.
    """
    # Get the split indices
    train_split, val_split = self.original_dataset.get_split_indicies(
        split, shuffle, seed, stratified)
    # Create each subdataset
    train_data = MultiNpDatasetAugmenter(*(NpDataset(
        dataset.x[train_split],
        None if not self.output_labels else dataset.y[train_split])
        for dataset in self.datasets))
    # We use the original dataset for the validation set
    val_data = NpDataset(
        self.original_dataset.x[val_split],
        None if not self.output_labels else
        self.original_dataset.y[val_split])
    return train_data, val_data
def load_test(self):
    # Just load the data into a numpy dataset, it ain't that big
    logging.info("Loading test images from {self.path_to_test_images}"
                 " and glob {self.glob_test_images}".format(**locals()))
    img_paths = sorted(glob(self.glob_test_images))
    # Initialize the numpy data containers
    x = np.zeros((len(img_paths), ) + self.img_size + (4, ))
    ids = []
    for i, img_path in enumerate(tqdm(img_paths)):
        # Record the image id before using it for the depth lookup
        img_basename = os.path.basename(img_path)
        ids.append(os.path.splitext(img_basename)[0])
        x[i, ..., :3] = ImageDataset.load_img(
            img_path, img_size=None, mode=self.mode)[0]
        # Fill only the fourth channel with the normalized depth
        x[i, ..., 3] = self.depths.loc[ids[-1]] / MAX_DEPTH
    print("Xte Shape:", x.shape)
    return NpDataset(x.astype('float32'), ids=np.array(ids))
def create_predictions(model_names,
                       k,
                       seed=7,
                       savedir="../superlearner_preds/",
                       data_paths=tuple(),
                       batch_size=32):
    num_base_learners = len(model_names)
    logging.info("Using %s base learners" % num_base_learners)
    # Build the new train data to train the meta learner on
    predictions, pred_labels = None, None
    for j, model_name in enumerate(model_names):
        # Try to load it, otherwise create the predictions
        try:
            single_predictions, pred_labels = load_predictions(
                model_name, savedir, pred_labels=pred_labels)
        except:
            # If the file is not there, create it
            logging.info("Couldn't load predictions for " + model_name +
                         ", creating instead")
            train_data = load_dataset(data_paths[j])
            single_predictions, pred_labels = predict_val(
                model_name,
                train_data,
                k,
                seed=seed,
                Y=pred_labels,
                batch_size=batch_size)
            save_predictions(single_predictions, predictions, model_names[j],
                             savedir)
        assert single_predictions.ndim == 2
        # Construct the X array if this is our first iteration
        if j == 0:
            predictions = np.zeros(
                (single_predictions.shape[0], num_base_learners,
                 single_predictions.shape[1]),
                dtype=np.float32)
        assert predictions.shape[0] == single_predictions.shape[0]
        assert predictions.shape[2] == single_predictions.shape[1]
        predictions[:, j] = single_predictions
    return NpDataset(predictions, y=pred_labels)
def train_superlearner(pred_X, pred_Y):
    # Now train 6 dense layers
    num_base_learners = pred_X.shape[1]
    weights = np.zeros((num_base_learners, len(LABEL_NAMES)))
    for i, label in enumerate(LABEL_NAMES):
        logging.info("Training logistic regression for label %s" % label)
        pred_dataset = NpDataset(x=pred_X[:, :, i], y=pred_Y[:, i:i + 1])
        datagen = DatasetGenerator(
            pred_dataset, batch_size=len(pred_dataset), shuffle=False)
        logistic_reg = build_model(num_base_learners, 1)
        optimizer = torch.optim.SGD(logistic_reg.parameters(), lr=0.01)
        train_logs, val_logs = logistic_reg.fit_generator(
            datagen,
            steps_per_epoch=datagen.steps_per_epoch,
            epochs=1000,
            optimizer=optimizer,
            loss_fn=F.binary_cross_entropy_with_logits,
            metrics=[accuracy_with_logits],
            verbose=0)
        logging.info("Final Loss: %s" % train_logs["loss"][-1])
        logging.info("Final Accuracy: %s" %
                     train_logs["accuracy_with_logits"][-1])
        weight = logistic_reg.torch_module.linear.weight.data
        weights[:, i] = weight.cpu().numpy().flatten()
        logging.info("Trained weights: {}".format(weights[:, i]))
    return weights
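# Usage sketch (assumption, not part of the original code): the superlearner
# pipeline first builds out-of-fold predictions for each base learner, then
# fits one logistic regression per label on top of them. The model names and
# data paths below are hypothetical placeholders.
pred_dataset = create_predictions(
    model_names=["model_a", "model_b"],
    k=5,
    data_paths=("../data/model_a_data.json", "../data/model_b_data.json"))
# Per-label blending weights for the base learners
weights = train_superlearner(pred_dataset.x, pred_dataset.y)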
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


model = Net()
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

# Turn the numpy dataset into a BatchGenerator
train_datagen = DatasetGenerator(
    NpDataset(xtr, y=ytr), batch_size=32, shuffle=True, seed=1234)
# Turn the val data into a BatchGenerator
val_datagen = DatasetGenerator(
    NpDataset(xval, y=yval), batch_size=1000, shuffle=True, seed=1234)


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_datagen):
        data, target = torch.Tensor(data), torch.LongTensor(target)
        if args.cuda:
def train_model(model,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=70,
                batch_size=32,
                val_batch_size=32,
                plot=True,
                run_id='default_model_name',
                augmenter=None,
                verbose=1,
                debug=False):
    # Create the generators
    logger.info(
        f'Training model for {epochs} epochs and {batch_size} batch size')
    logger.info('Flowing the train and validation sets')
    traingen = trainset.flow(batch_size=batch_size,
                             shuffle=True,
                             seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=val_batch_size, shuffle=False)

    if augmenter is not None:
        logger.info(f'Training with augmenter {augmenter.image_augmenter}')
        augmenter.labels = True
        traingen = augmenter(traingen)

    # Create the callbacks
    logger.info('Creating the callbacks')
    callbacks = [
        ModelCheckpoint(utils.get_model_path(run_id),
                        'val_loss',
                        verbose=verbose,
                        save_best_only=True,
                        save_weights_only=True),
        ModelCheckpoint(utils.get_model_path(run_id + '_dice_coef'),
                        'val_dice_coef',
                        verbose=verbose,
                        save_best_only=True,
                        save_weights_only=True,
                        mode='max'),
        Plotter('loss',
                scale='log',
                plot_during_train=plot,
                save_to_file=utils.get_plot_path(run_id + '_loss'),
                block_on_end=False),
        Plotter('dice_coef',
                plot_during_train=plot,
                save_to_file=utils.get_plot_path(run_id + '_dice_coef'),
                block_on_end=False),
    ]

    train_steps = 3 if debug else traingen.steps_per_epoch
    val_steps = 3 if debug else valgen.steps_per_epoch
    epochs = 2 if debug else epochs

    # Train the model
    logs = model.fit_generator(traingen,
                               train_steps,
                               epochs=epochs,
                               validation_data=valgen,
                               validation_steps=val_steps,
                               callbacks=callbacks,
                               verbose=verbose,
                               max_queue_size=3)

    return logs
def test_model(model,
               test_data: NpDataset,
               batch_size=32,
               num_augmentations=0,
               view_preds=False,
               debug=False):
    logger.info(f'Testing model with batch size of {batch_size}')
    logger.info('Flowing the test set')
    test_data.output_labels = False
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)

    if num_augmentations:
        print('Testing with a flip augmenter')
        augmenter = FlipAugmenter(flipud=True, fliplr=True)
        aug_params = [
            dict(flipud=True, fliplr=True),
            dict(flipud=True, fliplr=False),
            dict(flipud=False, fliplr=True),
            dict(flipud=False, fliplr=False)
        ]
        augmenter.labels = False
        testgen = augmenter(testgen)
    else:
        num_augmentations = 1
        augmenter = None

    test_steps = 3 if debug else testgen.steps_per_epoch

    test_preds = 0.
    for i in range(num_augmentations):
        if augmenter is not None:
            print(f'Testing for augmentation {i+1}/{num_augmentations} with '
                  f'flipud={aug_params[i]["flipud"]} and '
                  f'fliplr={aug_params[i]["fliplr"]}')
            augmenter.flipud = aug_params[i]['flipud']
            augmenter.fliplr = aug_params[i]['fliplr']
        aug_test_preds = model.predict_generator(
            testgen, test_steps, verbose=1, max_queue_size=0, workers=0
        )  # Must set to workers=0 to maintain test prediction order
        # Reverse the augmentations
        # TODO: only works with flips, implement general solution for non-flips
        if augmenter is not None:
            print('Running reverse augmentation on predictions...')
            aug_test_preds = augmenter.reverse_augment(aug_test_preds)
        if view_preds:
            if augmenter:
                testgen.generator.restart()
                display_predictions(testgen.generator, aug_test_preds)
            else:
                display_predictions(testgen, aug_test_preds)
        test_preds = test_preds + aug_test_preds
    test_preds /= num_augmentations

    if debug:
        filler = np.zeros(
            (len(test_data) - len(test_preds), *test_preds.shape[1:]))
        test_preds = np.concatenate([test_preds, filler])
    if view_preds:
        display_predictions(testgen, test_preds)
    return test_preds.squeeze(-1)
    submission_file)


if __name__ == "__main__":
    args = parser.parse_args()
    # Load the train_config
    train_config = load_train_setup(args.train_id)
    trained_model = None
    PLOT = args.plot

    if args.train:
        # Load the train data
        train_ids, x_train, y_train = dsb.load_train_data(
            path_to_train="../input/train/",
            img_size=train_config["img_size"],
            num_channels=3)
        train_dataset = NpDataset(x=x_train, y=y_train, ids=train_ids)
        # train the models
        if not train_config["kfold"]:
            raise NotImplementedError("Non-kfold training is not implemented")
        trained_model = kfold(train_dataset, train_config, args.train_id,
                              num_completed=args.num_completed)

    if args.test:
        # Load the test data
        test_ids, x_test, sizes_test = dsb.load_test_data(
            path_to_test="../input/test/",
            img_size=train_config["img_size"],
            num_channels=3)
        test_dataset = NpDataset(x=x_test, ids=test_ids)
model = MNISTModel()
model.add_loss(nn.CrossEntropyLoss())

# This will save the best scoring model weights to the current directory
best_model = ModelCheckpoint(
    "mnist_pyjet" + ".state",
    monitor="val_accuracy",
    mode="max",
    verbose=1,
    save_best_only=True,
)
# This will plot the model's accuracy during training
plotter = Plotter(scale="linear", monitor="accuracy")

# Turn the numpy dataset into a BatchGenerator
train_datagen = NpDataset(xtr, y=ytr).flow(
    batch_size=64, shuffle=True, seed=1234)
# Turn the val data into a BatchGenerator
val_datagen = NpDataset(xval, y=yval).flow(
    batch_size=1000, shuffle=True, seed=1234)

# Set up the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
model.add_optimizer(optimizer)

# Add the LR scheduler
one_cycle = OneCycleScheduler(
    optimizer, (1e-4, 1e-2), (0.95, 0.85), train_datagen.steps_per_epoch * 5
)


class LR(Metric):
    def __init__(self, onecycle):
def load_test(self):
    ids, data = self.load_application_data(type='test')
    return NpDataset(data.values, ids=ids.values)
def load_train(self):
    # TODO: For now this just loads the training data
    ids, data, targets = self.load_application_data(type='train')
    y = targets.values.astype(np.float32)[:, None]
    return NpDataset(data.values, y=y, ids=ids.values)
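# Usage sketch (assumption, not part of the original code): loading the
# application data with the loader above and carving out a holdout set.
# The `loader` object, split fraction, and seed are hypothetical, and the
# validation_split call mirrors the split API used elsewhere in this code.
train_dataset = loader.load_train()
test_dataset = loader.load_test()
trainset, valset = train_dataset.validation_split(split=0.2, shuffle=True, seed=7)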