Example #1
import os

# load_image is a project-local helper that reads an image
# file into a numpy array; it is assumed to be defined elsewhere.
def load_images(images, data):
    """

    Load all of the specified images from the data folder.

    :param images: A list of strings that contains the
    image name and folder name but not the entire path.
    :param data: A base directory to load images from.
    :return: A list of numpy array images.

    """
    return [load_image(os.path.join(data,image)).reshape(224,224,3) \
            for image in images]
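A quick usage sketch: the load_image stand-in below is hypothetical (built on Keras utilities, resizing to the 224x224 shape the reshape above expects), and the file names are purely illustrative.

from tensorflow.keras.preprocessing.image import img_to_array, load_img

def load_image(path):
    # Hypothetical stand-in for the project-local helper:
    # read the file and resize it to the expected 224x224.
    return img_to_array(load_img(path, target_size=(224, 224)))

imgs = load_images(['cats/cat001.jpg', 'dogs/dog042.jpg'], 'data')
print(len(imgs), imgs[0].shape)  # 2 (224, 224, 3)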
Example #2
import os

import numpy as np
import pandas as pd
import wandb
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# setup_wandb, model_factory, load_dataframes, load_image,
# hungarian_accuracy and hungarian_balanced_accuracy are
# project-local helpers assumed to be importable here.
def main(args):

    print('[INFO] Starting clustering...')

    # Set up Weights & Biases logging.
    setup_wandb(args)

    # Set up the pre-trained backbone for our model.  This is
    # done first to get the matching preprocessing function.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the train/dev metadata dataframes.  The images
    # themselves are read into memory below; lazy loading
    # from disk is not supported yet.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Extract backbone features for both the train and
    # dev images; the combined set is what gets clustered.
    features = np.zeros(shape=(len(train) + len(dev), encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'train/' + imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'dev/' + imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Standardize the features before PCA, which
    # works best on zero-mean, unit-variance inputs.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Set up agglomerative clustering.  (Note: scikit-learn >= 1.2
    # renamed the affinity argument to metric.)
    agglom = AgglomerativeClustering(n_clusters=args.clusters,
                                     affinity=args.affinity)

    print('[INFO] Running clustering...')
    agglom.fit(features)

    # Save the dataframe of validation predictions and labels
    df = pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': agglom.labels_[len(train):]
    })
    df.to_csv('dev_agglom_pca_ms{}_{}.csv'.format(args.min_samples,
                                                  wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], agglom.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi':
        normalized_mutual_info_score(dev['label'], agglom.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log(
            {'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # If the number of clusters matches the number of true
    # labels, we can score accuracy via Hungarian matching.
    if args.clusters == dev['label'].nunique():
        hba = hungarian_balanced_accuracy(
            LabelEncoder().fit_transform(df['label']), df['pred'])
        wandb.log({'balanced_accuracy': hba})
        ha = hungarian_accuracy(LabelEncoder().fit_transform(df['label']),
                                df['pred'])
        wandb.log({'accuracy': ha})

    print('[INFO] Finished!')
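hungarian_accuracy and hungarian_balanced_accuracy are project-local helpers not shown in the source. A minimal sketch of the usual construction, assuming integer-encoded labels on both sides: match cluster ids to true labels with scipy's linear_sum_assignment, then score the remapped predictions.

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def _hungarian_remap(y_true, y_pred):
    # Contingency matrix between cluster ids (rows) and true labels
    # (columns); linear_sum_assignment finds the one-to-one matching
    # that maximizes the total overlap.
    n = max(y_true.max(), y_pred.max()) + 1
    overlap = np.zeros((n, n), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        overlap[p, t] += 1
    rows, cols = linear_sum_assignment(overlap, maximize=True)
    mapping = dict(zip(rows, cols))
    return np.array([mapping[p] for p in y_pred])

def hungarian_accuracy(y_true, y_pred):
    return accuracy_score(y_true, _hungarian_remap(y_true, y_pred))

def hungarian_balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, _hungarian_remap(y_true, y_pred))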
Example #3
import os

import numpy as np
import pandas as pd
import wandb
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import StandardScaler

# setup_wandb, model_factory, load_dataframes and load_image
# are project-local helpers assumed to be importable here.
def main(args):

    print('[INFO] Starting clustering...')

    # Set up Weights & Biases logging.
    setup_wandb(args)

    # Set up the pre-trained backbone for our model.  This is
    # done first to get the matching preprocessing function.
    encoder, preprocess = model_factory(args.backbone, pooling=args.pooling)

    # Load the train/dev metadata dataframes.  The images
    # themselves are read into memory below; lazy loading
    # from disk is not supported yet.
    train, dev = load_dataframes(args.base_dir, args.min_samples)

    # Extract backbone features for both the train and
    # dev images; the combined set is what gets clustered.
    features = np.zeros(shape=(len(train) + len(dev), encoder.output.shape[1]))
    for i, imagefile in enumerate(train['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'train/' + imagefile),
                         preprocess_input=preprocess)
        features[i, :] = encoder.predict(img)

    for i, imagefile in enumerate(dev['file']):
        img = load_image(image_path=os.path.join(args.base_dir,
                                                 'dev/' + imagefile),
                         preprocess_input=preprocess)
        features[i + len(train), :] = encoder.predict(img)

    # Standardize the features before PCA, which
    # works best on zero-mean, unit-variance inputs.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    if args.pca_components > 0:
        pca = PCA(n_components=args.pca_components)
        features = pca.fit_transform(features)

    # Set up DBSCAN.
    dbscan = DBSCAN(eps=args.eps,
                    min_samples=args.min_dbscan_samples,
                    n_jobs=-1)

    print('[INFO] Running DBSCAN...')
    dbscan.fit(features)

    # Save the dataframe of validation predictions and labels
    pd.DataFrame({
        'file': dev['file'],
        'label': dev['label'],
        'pred': dbscan.labels_[len(train):]
    }).to_csv('dev_dbscan_pca_ms{}_{}.csv'.format(args.min_samples,
                                                  wandb.run.id),
              index=False)

    # A quick performance estimate.
    ar_score = adjusted_rand_score(dev['label'], dbscan.labels_[len(train):])
    wandb.log({'ari': ar_score})
    wandb.log({
        'nmi':
        normalized_mutual_info_score(dev['label'], dbscan.labels_[len(train):])
    })

    if args.pca_components > 0:
        wandb.log(
            {'explained_variance': np.sum(pca.explained_variance_ratio_)})
    else:
        wandb.log({'explained_variance': 0.00})

    # Log the number of distinct labels DBSCAN assigned over the
    # full train+dev feature set (the noise label, -1, counts as one).
    nunique = len(np.unique(dbscan.labels_))
    wandb.log({'dev_clusters': nunique})

    print('[INFO] Finished!')
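One caveat on the cluster count above: DBSCAN marks unassigned points with the label -1, so dev_clusters includes noise as one extra "cluster". A short sketch of separating the two, using dbscan.labels_ from the run above (the metric names here are illustrative, not from the source):

labels = dbscan.labels_
# Clusters found, excluding the noise label (-1), plus the
# fraction of points DBSCAN could not assign to any cluster.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
noise_fraction = float(np.mean(labels == -1))
wandb.log({'clusters_no_noise': n_clusters, 'noise_fraction': noise_fraction})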