Example #1
def main(args):

    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained backbone for our model.  This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the images into memory.  Right now
    # I am not supporting loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Build a dataframe of validation labels and random baseline predictions.
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': np.random.choice(np.arange(dev['label'].nunique()), len(dev))
    })

    # If the number of clusters matches the number of true labels,
    # we can compute Hungarian-matched accuracies.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        print("Balanced Accuracy: {}".format(hba))
        wandb.log({'balanced_accuracy': hba})

        ha = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                df['pred'])
        print("Accuracy: {}".format(ha))
        wandb.log({'accuracy': ha})

    print('[INFO] Finished!')
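The hungarian_accuracy and hungarian_balanced_accuracy helpers are not shown in these excerpts. A minimal sketch of how they might be implemented, assuming the usual approach of matching cluster indices to label indices with the Hungarian algorithm via scipy.optimize.linear_sum_assignment (the _hungarian_remap helper is hypothetical, not the author's code):

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, balanced_accuracy_score


def _hungarian_remap(y_true, y_pred):
    """Relabel clusters so each cluster maps to the class it overlaps most."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = int(max(y_true.max(), y_pred.max())) + 1
    overlap = np.zeros((n, n), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        overlap[p, t] += 1
    # linear_sum_assignment minimizes cost, so negate the overlap to maximize it.
    rows, cols = linear_sum_assignment(-overlap)
    mapping = dict(zip(rows, cols))
    return np.array([mapping[p] for p in y_pred])


def hungarian_accuracy(y_true, y_pred):
    return accuracy_score(np.asarray(y_true), _hungarian_remap(y_true, y_pred))


def hungarian_balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(np.asarray(y_true), _hungarian_remap(y_true, y_pred))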
Example #2
def main(args):

    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained backbone for our model.  This is
    # done first to get the preprocessing function for the net.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the images into memory.  Right now
    # I am not supporting loading from disk.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Compute backbone features for the train and dev images;
    # both sets are clustered together below.
    features = np.zeros(shape=(len(train) + len(dev), encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir, 'train', imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir, 'dev', imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Standardize the features before PCA, which assumes
    # centered (and ideally unit-variance) inputs.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Setup clustering
    agglom = AgglomerativeClustering(n_clusters=args.clusters,
                                     affinity=args.affinity)

    print('[INFO] Running clustering...')
    agglom.fit(features)

    # Save the dataframe of validation predictions and labels
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': agglom.labels_[len(train):]
    })
    df.to_csv('dev_agglom_pca_ms{}_{}.csv'.format(args.min_samples,
                                                  wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], agglom.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi': normalized_mutual_info_score(dev['label'],
                                            agglom.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log(
            {'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # If the number of clusters matches the number of true labels,
    # we can compute Hungarian-matched accuracies.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        wandb.log({'balanced_accuracy': hba})
        ha = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                df['pred'])
        wandb.log({'accuracy': ha})

    print('[INFO] Finished!')
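Neither model_factory nor load_image appears in these excerpts. A minimal sketch of what they could look like, assuming the backbone comes from tf.keras.applications and that load_image returns a single preprocessed image with a batch dimension (the backbone name, target size, and keyword defaults here are assumptions, not the author's code):

import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.preprocessing.image import img_to_array, load_img


def model_factory(backbone, pooling='avg'):
    """Return a headless ImageNet backbone and its preprocessing function."""
    if backbone == 'resnet50':
        encoder = ResNet50(include_top=False, weights='imagenet', pooling=pooling)
        return encoder, resnet_preprocess
    raise ValueError('Unknown backbone: {}'.format(backbone))


def load_image(image_path, preprocess_input, target_size=(224, 224)):
    """Load one image, resize it, preprocess it, and add a batch dimension."""
    img = img_to_array(load_img(image_path, target_size=target_size))
    return preprocess_input(np.expand_dims(img, axis=0))

With pooling='avg' the backbone output is a flat (batch, channels) feature matrix, which is consistent with the encoder.output.shape[1] usage in the example above.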
Example #3
def main(args):

    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained encoder from autoencoder.py.
    encoder = load_model(args.model)
    model = PretrainedDeepClusteringModel(backbone=encoder,
                                          n_clusters=args.n_clusters)
    optimizer = Adam(learning_rate=args.learning_rate,
                     beta_1=args.beta1,
                     beta_2=args.beta2)
    model.compile(optimizer=optimizer, loss='kld')

    encoder_weights, encoder_biases = encoder.layers[1].get_weights()
    model_weights, model_biases = model.backbone.layers[1].get_weights()
    print("[INFO] Checking weights and biases equality before running...")
    print("[INFO] Weights ", np.sum(encoder_weights - model_weights))
    print("[INFO] Biases ", np.sum(encoder_biases - model_biases))

    # Load the images into memory.  Right now
    # I am not supporting loading from disk.
    train, dev, test = load_dataframes(args.base_dir, args.min_samples)

    # Use an image data generator to save memory.
    augs = dict(preprocessing_function=normalize)

    gen = ImageDataGenerator(**augs)
    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        class_mode=None)

    # Setup a generator for dev
    dev_flow = gen.flow_from_dataframe(dataframe=dev,
                                       directory=os.path.join(
                                           args.base_dir, 'dev'),
                                       batch_size=args.batch_size,
                                       target_size=(args.pixels, args.pixels),
                                       shuffle=False,
                                       x_col='file',
                                       class_mode=None)

    test_flow = gen.flow_from_dataframe(dataframe=test,
                                        directory=os.path.join(
                                            args.base_dir, 'test'),
                                        batch_size=args.batch_size,
                                        target_size=(args.pixels, args.pixels),
                                        shuffle=False,
                                        x_col='file',
                                        class_mode=None)

    print('[INFO] Starting initialization of clusters')
    model.initialize_clusters_generator(
        train_flow,
        epochs=1,
        steps_per_epoch=int(np.ceil(len(train) / args.batch_size)))

    print('[INFO] Fine-tuning the clustering model...')
    for layer in encoder.layers:
        layer.trainable = True

    # -----------------
    #    Train here
    # -----------------
    loss = np.inf
    for ite in range(int(args.total_batches)):

        batch = next(train_flow)
        while len(batch) != args.batch_size:
            batch = next(train_flow)

        q = model.predict(batch, verbose=0)
        p = clustering_target_distribution(q)

        for _ in range(args.repeat_batch):

            encoder_weights, encoder_biases = encoder.layers[1].get_weights()
            model_weights, model_biases = model.backbone.layers[1].get_weights()
            print("[INFO] Checking weights and biases equality...")
            print("[INFO] Weights ", np.sum(encoder_weights - model_weights))
            print("[INFO] Biases ", np.sum(encoder_biases - model_biases))

            sub_batches = int(np.ceil(args.batch_size / 32))
            for i in range(sub_batches):
                loss = model.train_on_batch(x=batch[i * 32:(i + 1) * 32],
                                            y=p[i * 32:(i + 1) * 32])
                wandb.log({'kld_loss': loss})

    # Batch counts for the post-training passes below.
    batches = int(np.ceil(len(train) / args.batch_size))
    dev_batches = int(np.ceil(len(dev) / args.batch_size))

    # This scaler standardizes encoder features before clustering.
    # It is fit incrementally on training batches to collect statistics.
    print('[INFO] Fitting the scaler.')
    scaler = StandardScaler()
    for batch in range(batches):
        x_batch = next(train_flow)
        scaler.partial_fit(encoder.predict(x_batch))

    label_encoder = LabelEncoder()
    train['encoded_label'] = label_encoder.fit_transform(train['label'])
    dev['encoded_label'] = label_encoder.transform(dev['label'])
    test['encoded_label'] = label_encoder.transform(test['label'])

    kmeans = MiniBatchKMeans(n_clusters=train['label'].nunique())
    batches = int(np.ceil(len(train) / args.batch_size))
    for i in range(batches):
        kmeans.partial_fit(encoder.predict(next(train_flow)))

    dev_clusters = []
    test_clusters = []
    batches = int(np.ceil(len(dev) / args.batch_size))
    for i in range(batches):
        dev_clusters.extend(kmeans.predict(encoder.predict(next(dev_flow))))

    batches = int(np.ceil(len(test) / args.batch_size))
    for i in range(batches):
        test_clusters.extend(kmeans.predict(encoder.predict(next(test_flow))))

    dev_clusters = np.array(dev_clusters)
    test_clusters = np.array(test_clusters)

    accuracy = hungarian_accuracy(dev['encoded_label'], dev_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(dev['encoded_label'],
                                                    dev_clusters)
    wandb.log({
        "dev_accuracy": accuracy,
        "dev_balanced_accuracy": balanced_accuracy
    })

    accuracy = hungarian_accuracy(test['encoded_label'], test_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(test['encoded_label'],
                                                    test_clusters)
    wandb.log({
        "test_accuracy": accuracy,
        "test_balanced_accuracy": balanced_accuracy
    })

    x_batch = next(dev_flow)

    encoder.save("encoder.dec.{}.hdf5".format(wandb.run.id))

    print('[INFO] Finished!')
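The clustering_target_distribution helper used in the training loop is not defined in these excerpts. Assuming it follows the standard target distribution from the DEC paper (Xie et al., 2016), a minimal sketch:

import numpy as np


def clustering_target_distribution(q):
    """Sharpen soft assignments q of shape (n_samples, n_clusters) into the
    DEC target distribution: square each assignment, normalize by cluster
    frequency, then renormalize each row to sum to one."""
    weight = q ** 2 / q.sum(axis=0)
    return (weight.T / weight.sum(axis=1)).T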
Example #4
def main(args):

    print('[INFO] Starting clustering...')

    # Setup weights and biases
    setup_wandb(args)

    # Setup the pre-trained backbone for our model.  This is
    # done first to get the preprocessing function for the net.
    #encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)
    model, encoder = build_model((args.pixels, args.pixels, 3),
                                 args.latent_dim)
    print(model.summary())
    print(encoder.summary())

    optimizer = Adam(learning_rate=args.learning_rate,
                     beta_1=args.beta1,
                     beta_2=args.beta2)
    model.compile(optimizer=optimizer, loss='mse')

    # Load the images into memory.  Right now
    # I am not supporting loading from disk.
    train, dev, test = load_dataframes(args.base_dir, args.min_samples)

    # Use an image data generator to save memory.
    augs = dict(
        horizontal_flip=True,
        zoom_range=args.zoom,
        width_shift_range=args.width_shift,
        height_shift_range=args.height_shift,
        preprocessing_function=normalize,
        rotation_range=args.rotation,
    )

    gen = ImageDataGenerator(**augs)
    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        class_mode=None)

    # Setup a generator for dev
    no_augs_gen = ImageDataGenerator(preprocessing_function=normalize)
    dev_flow = no_augs_gen.flow_from_dataframe(
        dataframe=dev,
        directory=os.path.join(args.base_dir, 'dev'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    test_flow = no_augs_gen.flow_from_dataframe(
        dataframe=test,
        directory=os.path.join(args.base_dir, 'test'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=False,
        x_col='file',
        class_mode=None)

    print('[INFO] Fitting autoencoder...')
    for layer in model.layers:
        layer.trainable = True

    batches = int(np.ceil(len(train) / args.batch_size))
    dev_batches = int(np.ceil(len(dev) / args.batch_size))
    for epoch in range(args.epochs):

        # Train
        for batch in range(batches):
            x_batch = next(train_flow)
            loss = model.train_on_batch(x_batch, x_batch)
            wandb.log({'loss': loss})

        for batch in range(dev_batches):
            x_batch = next(dev_flow)
            dev_loss = model.evaluate(x_batch, x_batch)
            wandb.log({'dev_loss': dev_loss})

    # This scaler standardizes encoder features before clustering.
    # It is fit incrementally on training batches to collect statistics.
    print('[INFO] Fitting the scaler.')
    scaler = StandardScaler()
    for batch in range(batches):
        x_batch = next(train_flow)
        scaler.partial_fit(encoder.predict(x_batch))

    print('[INFO] Running metric evaluation...')
    for layer in encoder.layers:
        layer.trainable = False

    label_encoder = LabelEncoder()
    train['encoded_label'] = label_encoder.fit_transform(train['label'])
    dev['encoded_label'] = label_encoder.transform(dev['label'])
    test['encoded_label'] = label_encoder.transform(test['label'])

    train_flow = gen.flow_from_dataframe(
        dataframe=train,
        directory=os.path.join(args.base_dir, 'train'),
        batch_size=args.batch_size,
        target_size=(args.pixels, args.pixels),
        shuffle=True,
        x_col='file',
        y_col='label',
        class_mode='categorical')

    kmeans = MiniBatchKMeans(n_clusters=train['label'].nunique())
    batches = int(np.ceil(len(train) / args.batch_size))
    for i in range(batches):
        # train_flow now yields (images, labels); only the images are needed here.
        x_batch = next(train_flow)[0]
        kmeans.partial_fit(scaler.transform(encoder.predict(x_batch)))

    dev_clusters = []
    test_clusters = []
    batches = int(np.ceil(len(dev) / args.batch_size))
    for i in range(batches):
        dev_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(dev_flow)))))

    batches = int(np.ceil(len(test) / args.batch_size))
    for i in range(batches):
        test_clusters.extend(
            kmeans.predict(scaler.transform(encoder.predict(next(test_flow)))))

    dev_clusters = np.array(dev_clusters)
    test_clusters = np.array(test_clusters)

    accuracy = hungarian_accuracy(dev['encoded_label'], dev_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(dev['encoded_label'],
                                                    dev_clusters)
    wandb.log({
        "dev_accuracy": accuracy,
        "dev_balanced_accuracy": balanced_accuracy
    })

    accuracy = hungarian_accuracy(test['encoded_label'], test_clusters)
    balanced_accuracy = hungarian_balanced_accuracy(test['encoded_label'],
                                                    test_clusters)
    wandb.log({
        "test_accuracy": accuracy,
        "test_balanced_accuracy": balanced_accuracy
    })

    x_batch = next(dev_flow)
    plot_examples(x_batch, model.predict(x_batch),
                  "/home/ubuntu/autoencoder_samples.pdf")

    encoder.save("encoder.{}.hdf5".format(wandb.run.id))

    print('[INFO] Finished!')
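build_model is defined elsewhere. A minimal sketch of a convolutional autoencoder with the same contract, returning (model, encoder) for a given input shape and latent dimension; the layer sizes are assumptions, and the sigmoid output assumes normalize scales pixels to [0, 1]:

from tensorflow.keras import Model, layers


def build_model(input_shape, latent_dim):
    """Build a small convolutional autoencoder; return (autoencoder, encoder)."""
    inputs = layers.Input(shape=input_shape)
    x = layers.Conv2D(32, 3, strides=2, padding='same', activation='relu')(inputs)
    x = layers.Conv2D(64, 3, strides=2, padding='same', activation='relu')(x)
    x = layers.Flatten()(x)
    latent = layers.Dense(latent_dim, name='latent')(x)

    # Decoder mirrors the encoder; assumes the input size is divisible by 4.
    h, w = input_shape[0] // 4, input_shape[1] // 4
    x = layers.Dense(h * w * 64, activation='relu')(latent)
    x = layers.Reshape((h, w, 64))(x)
    x = layers.Conv2DTranspose(64, 3, strides=2, padding='same', activation='relu')(x)
    x = layers.Conv2DTranspose(32, 3, strides=2, padding='same', activation='relu')(x)
    outputs = layers.Conv2D(input_shape[-1], 3, padding='same', activation='sigmoid')(x)

    autoencoder = Model(inputs, outputs, name='autoencoder')
    encoder = Model(inputs, latent, name='encoder')
    return autoencoder, encoder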