from typing import Dict

import numpy as np
from fastai.basic_data import DatasetType
from fastai.basic_train import Learner
from torch.nn import Module


def compute_features_learner(
    data, dataset_type: DatasetType, learn: Learner, embedding_layer: Module
) -> Dict[str, np.ndarray]:
    """Compute features for multiple images using mini-batching.

    Use this function to featurize the training or test set of a learner.

    Args:
        data: DataBunch containing the images to featurize.
        dataset_type: Specify train, valid or test set.
        learn: Trained model to use as featurizer.
        embedding_layer: Layer whose output is used as the feature.

    Returns:
        Dictionary mapping each image path to its DNN feature.
    """
    # Note: In fastai, for DatasetType.Train only the output of complete
    # mini-batches is computed, i.e. with 101 images and a mini-batch size of
    # 16, len(feats) is 96 and not 101. For DatasetType.Valid this is not the
    # case, and len(feats) is 101 as expected. A way around this is to use
    # DatasetType.Fix instead when referring to the training set. See e.g.:
    # https://forums.fast.ai/t/get-preds-returning-less-results-than-length-of-original-dataset/34148
    if dataset_type == DatasetType.Train or dataset_type == DatasetType.Fix:
        # Training set without shuffling and without dropping the last
        # (incomplete) batch. See note above.
        dataset_type = DatasetType.Fix
        label_list = list(data.train_ds.items)
    elif dataset_type == DatasetType.Valid:
        label_list = list(data.valid_ds.items)
    elif dataset_type == DatasetType.Test:
        label_list = list(data.test_ds.items)
    else:
        raise Exception(
            "Dataset_type needs to be of type DatasetType.Train, "
            "DatasetType.Valid, DatasetType.Test or DatasetType.Fix."
        )

    # Update what data the learner object is using
    tmp_data = learn.data
    learn.data = data

    # Compute features: the hook records the embedding layer's outputs while
    # get_preds runs the forward passes.
    featurizer = SaveFeatures(embedding_layer)
    learn.get_preds(dataset_type)
    feats = featurizer.features[:]

    # Set data back to what it was before
    learn.data = tmp_data

    # Get corresponding image paths
    assert len(feats) == len(label_list)
    im_paths = [str(x) for x in label_list]
    return dict(zip(im_paths, feats))
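# ``SaveFeatures`` is referenced above but not defined in this snippet. Below
# is a minimal sketch of such a forward-hook wrapper; it is an assumption
# about the interface (register a hook on the embedding layer and accumulate
# its outputs), not necessarily the exact upstream implementation.
class SaveFeatures:
    """Hook that stores the outputs of a layer during the forward pass."""

    def __init__(self, module: Module):
        self.features = []
        self.hook = module.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        # Detach and move to CPU so the stored features don't pin GPU memory.
        self.features.extend(output.detach().cpu().numpy().tolist())

    def remove(self):
        self.hook.remove()


# Hypothetical usage (the layer index below is an assumption; it depends on
# the model head): featurize the validation set with the penultimate layer.
#
#   embedding_layer = learn.model[1][-2]
#   feats = compute_features_learner(data, DatasetType.Valid, learn, embedding_layer)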
]

learn = Learner(db, model, metrics=[rmse, mae], callback_fns=callback_fns,
                wd=args.wd, loss_func=contribs_rmse_loss)
if args.start_epoch > 0:
    learn.load(model_se_str + f'_{args.start_epoch - 1}')
else:
    learn.load(model_str)
torch.cuda.empty_cache()
if distributed_train:
    learn = learn.to_distributed(args.local_rank)

learn.fit(args.epochs)

# Make predictions with each per-epoch snapshot: one column per snapshot
n_val = len(train_df[train_df['molecule_id'].isin(val_mol_ids)])
val_preds = np.zeros((n_val, args.epochs))
test_preds = np.zeros((len(test_df), args.epochs))
for m in range(args.epochs):
    print(f'Predicting for model {m}')
    learn.load(model_se_str + f'_{m}')
    val_contrib_preds = learn.get_preds(DatasetType.Valid)
    test_contrib_preds = learn.get_preds(DatasetType.Test)
    val_preds[:, m] = val_contrib_preds[0][:, -1].detach().numpy()
    test_preds[:, m] = test_contrib_preds[0][:, -1].detach().numpy()

# Undo the target scaling applied during training
val_preds = val_preds * C.SC_STD + C.SC_MEAN
test_preds = test_preds * C.SC_STD + C.SC_MEAN

# Store results
store_submit(pd.DataFrame(test_preds), snapshots_str, print_head=True)
store_oof(pd.DataFrame(val_preds), snapshots_str, print_head=True)
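# In fastai v1, ``learn.get_preds`` returns a (predictions, targets) tuple,
# which is why the loop above indexes with [0] before slicing the last
# column. The per-epoch columns collected above form a snapshot ensemble; a
# typical follow-up (an assumption, not part of the original script, which
# stores the raw per-snapshot columns) is to average them into one prediction:
#
#   final_val_preds = val_preds.mean(axis=1)
#   final_test_preds = test_preds.mean(axis=1)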