def main(args):
    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained backbone for our model. This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the images into memory. Right now
    # I am not supporting loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Save the dataframe of validation predictions and labels
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': np.random.choice(np.arange(dev['label'].nunique()), len(dev))
    })

    # If the number of clusters is the same as the number of
    # true labels, we can do Hungarian matching.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        print("Balanced Accuracy: {}".format(hba))
        wandb.log({'balanced_accuracy': hba})

        ha = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                df['pred'])
        print("Accuracy: {}".format(ha))
        wandb.log({'accuracy': ha})

    print('[INFO] Finished!')
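# The Hungarian-matched metrics used above are helpers imported from elsewhere
# in the project and are not defined in this file. The sketch below is a
# minimal, assumed implementation: cluster ids are remapped to the true label
# ids with scipy's linear_sum_assignment on the confusion matrix, then scored
# with scikit-learn's accuracy metrics. The project's actual helpers may differ.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             confusion_matrix)


def _hungarian_remap(y_true, y_pred):
    """Relabel cluster assignments so they best match the true labels."""
    cm = confusion_matrix(y_true, y_pred)
    # Maximize the number of correctly matched samples.
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    return np.array([mapping[c] for c in y_pred])


def hungarian_accuracy(y_true, y_pred):
    return accuracy_score(y_true, _hungarian_remap(y_true, y_pred))


def hungarian_balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, _hungarian_remap(y_true, y_pred))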
def main(args):
    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained backbone for our model. This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the images into memory. Right now
    # I am not supporting loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Extract features for the train and dev images, which
    # we are going to cluster together.
    features = np.zeros(shape=(len(train) + len(dev), encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir, 'train/' + imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir, 'dev/' + imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Scale before doing PCA, which
    # expects to have standardized
    # features.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Setup clustering
    agglom = AgglomerativeClustering(n_clusters=args.clusters,
                                     affinity=args.affinity)

    print('[INFO] Running clustering...')
    agglom.fit(features)

    # Save the dataframe of validation predictions and labels
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': agglom.labels_[len(train):]
    })
    df.to_csv('dev_agglom_pca_ms{}_{}.csv'.format(args.min_samples, wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], agglom.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi': normalized_mutual_info_score(dev['label'],
                                            agglom.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log({'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # If the number of clusters is the same as the number of
    # true labels, we can do Hungarian matching.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        wandb.log({'balanced_accuracy': hba})

        ha = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                df['pred'])
        wandb.log({'accuracy': ha})

    print('[INFO] Finished!')
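# load_image is another project helper that is not shown in this file. A
# plausible sketch is given below, assuming it reads a single image with the
# Keras image utilities, applies the backbone's preprocessing function, and
# adds a batch dimension so encoder.predict() can run on one image at a time.
# The target_size default here is an assumption; the real helper may differ.
import numpy as np
from tensorflow.keras.preprocessing import image as keras_image


def load_image(image_path, preprocess_input, target_size=(224, 224)):
    # Load, convert to an array, preprocess, and add a batch axis.
    img = keras_image.load_img(image_path, target_size=target_size)
    arr = keras_image.img_to_array(img)
    arr = preprocess_input(arr)
    return np.expand_dims(arr, axis=0)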
def main(args):
    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained encoder from autoencoder.py.
    encoder = load_model(args.model)
    model = PretrainedDeepClusteringModel(backbone=encoder,
                                          n_clusters=args.n_clusters)
    optimizer = Adam(learning_rate=args.learning_rate,
                     beta_1=args.beta1,
                     beta_2=args.beta2)
    model.compile(optimizer=optimizer, loss='kld')

    # Sanity check that the clustering model actually shares
    # weights with the loaded encoder before training starts.
    encoder_weights, encoder_biases = encoder.layers[1].get_weights()
    model_weights, model_biases = model.backbone.layers[1].get_weights()
    print("[INFO] Checking weights and biases equality before running...")
    print("[INFO] Weights ", np.sum(encoder_weights - model_weights))
    print("[INFO] Biases ", np.sum(encoder_biases - model_biases))

    # Load the images into memory. Right now
    # I am not supporting loading from disk.
    train, dev, test = load_dataframes(args.base_dir, args.min_samples)

    # Use an image data generator to save memory.
    augs = dict(preprocessing_function=normalize, )
    gen = ImageDataGenerator(**augs)
    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        class_mode=None)

    # Setup generators for dev and test
    dev_flow = gen.flow_from_dataframe(
        dataframe=dev,
        directory=os.path.join(args.base_dir, 'dev'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    test_flow = gen.flow_from_dataframe(
        dataframe=test,
        directory=os.path.join(args.base_dir, 'test'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    print('[INFO] Starting initialization of clusters')
    model.initialize_clusters_generator(
        train_flow,
        epochs=1,
        steps_per_epoch=int(np.ceil(len(train) / args.batch_size)))

    print('[INFO] Fitting deep clustering model...')
    for layer in encoder.layers:
        layer.trainable = True

    # -----------------
    # Train here
    # -----------------
    loss = np.inf
    for ite in range(int(args.total_batches)):

        # Draw a full-sized batch; the last batch of an epoch can be
        # smaller, so skip it to keep the sub-batch slicing aligned.
        batch = next(train_flow)
        while len(batch) != args.batch_size:
            batch = next(train_flow)

        # Soft cluster assignments q and auxiliary target distribution p.
        q = model.predict(batch, verbose=0)
        p = clustering_target_distribution(q)

        for _ in range(args.repeat_batch):
            encoder_weights, encoder_biases = encoder.layers[1].get_weights()
            model_weights, model_biases = model.backbone.layers[1].get_weights()
            print("[INFO] Checking weights and biases equality...")
            print("[INFO] Weights ", np.sum(encoder_weights - model_weights))
            print("[INFO] Biases ", np.sum(encoder_biases - model_biases))

            # Train on sub-batches of 32 images against the target p.
            sub_batches = int(np.ceil(args.batch_size / 32))
            for i in range(sub_batches):
                loss = model.train_on_batch(x=batch[i * 32:(i + 1) * 32],
                                            y=p[i * 32:(i + 1) * 32])
                wandb.log({'kld_loss': loss})

    # Number of batches per epoch for train and dev.
    batches = int(np.ceil(len(train) / args.batch_size))
    dev_batches = int(np.ceil(len(dev) / args.batch_size))

    # This scaler is used to normalize before
    # doing clustering. The online run is done
    # on the training data to collect statistics.
    print('[INFO] Fitting the scaler.')
    scaler = StandardScaler()
    for batch in range(batches):
        x_batch = next(train_flow)
        scaler.partial_fit(encoder.predict(x_batch))

    label_encoder = LabelEncoder()
    train['encoded_label'] = label_encoder.fit_transform(train['label'])
    dev['encoded_label'] = label_encoder.transform(dev['label'])
    test['encoded_label'] = label_encoder.transform(test['label'])

    # Cluster the scaled encodings with mini-batch k-means, using one
    # cluster per true label.
    kmeans = MiniBatchKMeans(n_clusters=train['label'].nunique())
    batches = int(np.ceil(len(train) / args.batch_size))
    for i in range(batches):
        kmeans.partial_fit(scaler.transform(encoder.predict(next(train_flow))))

    dev_clusters = []
    test_clusters = []

    batches = int(np.ceil(len(dev) / args.batch_size))
    for i in range(batches):
        dev_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(dev_flow)))))

    batches = int(np.ceil(len(test) / args.batch_size))
    for i in range(batches):
        test_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(test_flow)))))

    dev_clusters = np.array(dev_clusters)
    test_clusters = np.array(test_clusters)

    # Hungarian-matched metrics on dev and test.
    accuracy = hungarian_accuracy(dev['encoded_label'], dev_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(dev['encoded_label'],
                                                    dev_clusters)
    wandb.log({
        "dev_accuracy": accuracy,
        "dev_balanced_accuracy": balanced_accuracy
    })

    accuracy = hungarian_accuracy(test['encoded_label'], test_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(test['encoded_label'],
                                                    test_clusters)
    wandb.log({
        "test_accuracy": accuracy,
        "test_balanced_accuracy": balanced_accuracy
    })

    encoder.save("encoder.dec.{}.hdf5".format(wandb.run.id))
    print('[INFO] Finished!')
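# clustering_target_distribution is used in the DEC training loop above but
# not defined in this file. A minimal sketch, assuming it computes the
# standard DEC auxiliary target distribution from the soft assignments q
# (Xie et al., 2016): p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'),
# where f_j = sum_i q_ij is the soft cluster frequency.
import numpy as np


def clustering_target_distribution(q):
    # Square the assignments and normalize by the per-cluster frequency,
    # then renormalize each row so it sums to one.
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T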
def main(args):
    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Build the autoencoder and its encoder. The old pre-trained
    # backbone setup is left commented out below.
    # encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)
    model, encoder = build_model((args.pixels, args.pixels, 3), args.latent_dim)
    print(model.summary())
    print(encoder.summary())

    optimizer = Adam(learning_rate=args.learning_rate,
                     beta_1=args.beta1,
                     beta_2=args.beta2)
    model.compile(optimizer=optimizer, loss='mse')

    # Load the images into memory. Right now
    # I am not supporting loading from disk.
    train, dev, test = load_dataframes(args.base_dir, args.min_samples)

    # Use an image data generator to save memory.
    augs = dict(
        horizontal_flip=True,
        zoom_range=args.zoom,
        width_shift_range=args.width_shift,
        height_shift_range=args.height_shift,
        preprocessing_function=normalize,
        rotation_range=args.rotation,
    )
    gen = ImageDataGenerator(**augs)
    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        class_mode=None)

    # Setup generators for dev and test without augmentations.
    no_augs_gen = ImageDataGenerator(preprocessing_function=normalize)
    dev_flow = no_augs_gen.flow_from_dataframe(
        dataframe=dev,
        directory=os.path.join(args.base_dir, 'dev'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    test_flow = no_augs_gen.flow_from_dataframe(
        dataframe=test,
        directory=os.path.join(args.base_dir, 'test'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    print('[INFO] Fitting autoencoder...')
    for layer in model.layers:
        layer.trainable = True

    batches = int(np.ceil(len(train) / args.batch_size))
    dev_batches = int(np.ceil(len(dev) / args.batch_size))
    for epoch in range(args.epochs):

        # Train on reconstructions.
        for batch in range(batches):
            x_batch = next(train_flow)
            loss = model.train_on_batch(x_batch, x_batch)
            wandb.log({'loss': loss})

        # Track the reconstruction loss on dev.
        for batch in range(dev_batches):
            x_batch = next(dev_flow)
            dev_loss = model.evaluate(x_batch, x_batch)
            wandb.log({'dev_loss': dev_loss})

    # This scaler is used to normalize before
    # doing clustering. The online run is done
    # on the training data to collect statistics.
    print('[INFO] Fitting the scaler.')
    scaler = StandardScaler()
    for batch in range(batches):
        x_batch = next(train_flow)
        scaler.partial_fit(encoder.predict(x_batch))

    print('[INFO] Running metric evaluation...')
    for layer in encoder.layers:
        layer.trainable = False

    label_encoder = LabelEncoder()
    train['encoded_label'] = label_encoder.fit_transform(train['label'])
    dev['encoded_label'] = label_encoder.transform(dev['label'])
    test['encoded_label'] = label_encoder.transform(test['label'])

    # Rebuild the training generator with labels; it now yields
    # (images, labels) tuples, so the image batch has to be unpacked.
    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        y_col='label',
        class_mode='categorical')

    # Cluster the scaled encodings with mini-batch k-means, using one
    # cluster per true label.
    kmeans = MiniBatchKMeans(n_clusters=train['label'].nunique())
    batches = int(np.ceil(len(train) / args.batch_size))
    for i in range(batches):
        x_batch, _ = next(train_flow)
        kmeans.partial_fit(scaler.transform(encoder.predict(x_batch)))

    dev_clusters = []
    test_clusters = []

    batches = int(np.ceil(len(dev) / args.batch_size))
    for i in range(batches):
        dev_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(dev_flow)))))

    batches = int(np.ceil(len(test) / args.batch_size))
    for i in range(batches):
        test_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(test_flow)))))

    dev_clusters = np.array(dev_clusters)
    test_clusters = np.array(test_clusters)

    # Hungarian-matched metrics on dev and test.
    accuracy = hungarian_accuracy(dev['encoded_label'], dev_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(dev['encoded_label'],
                                                    dev_clusters)
    wandb.log({
        "dev_accuracy": accuracy,
        "dev_balanced_accuracy": balanced_accuracy
    })

    accuracy = hungarian_accuracy(test['encoded_label'], test_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(test['encoded_label'],
                                                    test_clusters)
    wandb.log({
        "test_accuracy": accuracy,
        "test_balanced_accuracy": balanced_accuracy
    })

    # Plot a few dev reconstructions and save the trained encoder.
    x_batch = next(dev_flow)
    plot_examples(x_batch, model.predict(x_batch),
                  "/home/ubuntu/autoencoder_samples.pdf")

    encoder.save("encoder.{}.hdf5".format(wandb.run.id))
    print('[INFO] Finished!')
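# normalize and setup_wandb are small shared helpers that are not shown in
# this file. The sketch below is an assumption: normalize likely rescales raw
# pixel values into [0, 1] for the MSE reconstruction loss, and setup_wandb
# presumably initializes a Weights & Biases run with the parsed arguments as
# the run config. The project name here is a placeholder, and the real
# helpers may differ.
import wandb


def normalize(x):
    # Rescale 8-bit pixel values to the [0, 1] range.
    return x / 255.0


def setup_wandb(args):
    # Placeholder project name; the actual project/entity are not known here.
    wandb.init(project='image-clustering', config=vars(args))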