def load_images(images, data):
    """
    Load all of the specified images from the data folder.

    :param images: A list of strings that contains the image name and
        folder name, but not the entire path.
    :param data: A base directory to load images from.
    :return: A list of numpy array images.
    """
    return [load_image(os.path.join(data, image)).reshape(224, 224, 3)
            for image in images]
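# A minimal usage sketch for load_images (not part of the original script).
# The file names and base directory below are hypothetical, and load_image
# is assumed to return a numpy array that reshapes cleanly to (224, 224, 3).
def _example_load_images_usage():
    batch = load_images(images=['cats/cat_001.jpg', 'dogs/dog_042.jpg'],
                        data='data/train')
    print(len(batch), batch[0].shape)  # expected: 2 (224, 224, 3)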
def main(args):
    print('[INFO] Starting clustering...')

    # Set up Weights & Biases logging.
    setup_wandb(args)

    # Set up the pre-trained backbone for our model. This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the train and dev dataframes. Right now I am not
    # supporting lazy loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Extract features for the train and dev images, which
    # we are going to cluster together.
    features = np.zeros(shape=(len(train) + len(dev),
                               encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'train/' + imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'dev/' + imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Standardize the features before PCA, which
    # expects standardized inputs.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Set up clustering.
    agglom = AgglomerativeClustering(n_clusters=args.clusters,
                                     affinity=args.affinity)

    print('[INFO] Running clustering...')
    agglom.fit(features)

    # Save the dataframe of validation predictions and labels.
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': agglom.labels_[len(train):]
    })
    df.to_csv('dev_agglom_pca_ms{}_{}.csv'.format(args.min_samples,
                                                  wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], agglom.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi': normalized_mutual_info_score(dev['label'],
                                            agglom.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log(
            {'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # If the number of clusters matches the number of true labels,
    # we can compute Hungarian-matched accuracy.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        wandb.log({'balanced_accuracy': hba})

        hba = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                 df['pred'])
        wandb.log({'accuracy': hba})

    print('[INFO] Finished!')
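# hungarian_accuracy and hungarian_balanced_accuracy are project helpers that
# are not shown in this file. A minimal sketch of the usual idea, assuming the
# standard formulation (use the Hungarian algorithm to pick the cluster-to-label
# assignment that maximizes agreement, then score accuracy), is given below.
# The function name and exact behavior are illustrative, not the project's
# actual definition.
import numpy as np
from scipy.optimize import linear_sum_assignment


def _hungarian_accuracy_sketch(y_true, y_pred):
    """Match predicted clusters to integer-encoded true labels, then score."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = max(y_true.max(), y_pred.max()) + 1
    # Contingency matrix: counts[p, t] = number of samples with cluster p
    # and true label t.
    counts = np.zeros((n, n), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        counts[p, t] += 1
    # Maximize total agreement by minimizing the negated counts.
    rows, cols = linear_sum_assignment(-counts)
    return counts[rows, cols].sum() / y_true.size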
def main(args):
    print('[INFO] Starting clustering...')

    # Set up Weights & Biases logging.
    setup_wandb(args)

    # Set up the pre-trained backbone for our model. This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the train and dev dataframes. Right now I am not
    # supporting lazy loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Extract features for the train and dev images, which
    # we are going to cluster together.
    features = np.zeros(shape=(len(train) + len(dev),
                               encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'train/' + imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'dev/' + imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Standardize the features before PCA, which
    # expects standardized inputs.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Set up DBSCAN.
    dbscan = DBSCAN(eps=args.eps,
                    min_samples=args.min_dbscan_samples,
                    n_jobs=-1)

    print('[INFO] Running DBSCAN...')
    dbscan.fit(features)

    # Save the dataframe of validation predictions and labels.
    pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': dbscan.labels_[len(train):]
    }).to_csv('dev_dbscan_pca_ms{}_{}.csv'.format(args.min_samples,
                                                  wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], dbscan.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi': normalized_mutual_info_score(dev['label'],
                                            dbscan.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log(
            {'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # Log the number of distinct cluster labels found by DBSCAN
    # (the noise label, -1, counts as one).
    nunique = len(np.unique(dbscan.labels_))
    wandb.log({'dev_clusters': nunique})

    print('[INFO] Finished!')
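# Neither script shows how args is built. A minimal argparse sketch for the
# DBSCAN variant is given below, assuming flag names that simply mirror the
# attributes read from args above. The flag names and default values are
# assumptions for illustration, not the project's actual CLI.
import argparse


def _parse_args_sketch():
    parser = argparse.ArgumentParser(
        description='Cluster pre-trained image features with DBSCAN.')
    parser.add_argument('--backbone', type=str, default='resnet50')  # placeholder default
    parser.add_argument('--pooling', type=str, default='avg')        # placeholder default
    parser.add_argument('--base_dir', type=str, required=True)
    parser.add_argument('--min_samples', type=int, default=0)
    parser.add_argument('--pca_components', type=int, default=0)
    parser.add_argument('--eps', type=float, default=0.5)            # sklearn's DBSCAN default
    parser.add_argument('--min_dbscan_samples', type=int, default=5) # sklearn's DBSCAN default
    return parser.parse_args()


if __name__ == '__main__':
    main(_parse_args_sketch())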