def build_cookbook_and_featurevectors_for_model_tuning():
    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          feature_extractor=FeatureExtractorKeys.SIFT,
                          window_resolution=0.125,
                          window_overlap=2.0,
                          num_clusters=450,
                          image_size=256)

    trainer = SketchRecognitionTrainer(
        file_path=TRAIN_COOKBOOK_FILENAME,
        run_parallel_processors=True,
        params=params
    )

    # 1 - extract image feature vectors
    if os.path.isfile(TRAIN_FEATURES_FILENAME):
        train_image_features = np.load(TRAIN_FEATURES_FILENAME)
    else:
        train_image_features = trainer.extract_image_descriptors(train_images)
        np.save(TRAIN_FEATURES_FILENAME, train_image_features)

    # 2 - create codebook from feature vectors
    if not trainer.is_cookbook_available:
        trainer.create_codebook_from_image_descriptors(image_descriptors=train_image_features)

    # 3 - create codelabels (visual word histograms) for each image
    if os.path.isfile(TRAIN_CODE_LABELS_FILENAME):
        train_images_codelabels = np.load(TRAIN_CODE_LABELS_FILENAME)
    else:
        train_images_codelabels = trainer.code_labels_for_image_descriptors(train_image_features)
        np.save(TRAIN_CODE_LABELS_FILENAME, train_images_codelabels)  # features

    # 4 - save labels
    np.save(TRAIN_LABELS_FILENAME, train_labels)

    print "finished creating and saving feature data:\n- codebook: {}\n- train image code labels: {}" \
          "\n- train image labels: {}".format(
        "data/codebook_v1.dat",
        "data/train_image_codelabels_v1.npy",
        "data/train_image_labels_v1.npy"
    )
def window_overlap_test(window_overlap=2.):
    """  """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    # split to make experimentation quicker
    train_labels, train_images = get_subset_of_training_data(train_labels,
                                                             train_images,
                                                             split=0.5)

    training_size = len(train_labels)

    desc = "testing influence of window_overlap, set to {}. NB training size = {}".format(
        window_overlap, training_size)

    print desc

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          window_overlap=window_overlap,
                          fn_prefix="winoverlap-{}".format(window_overlap))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.le,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = desc

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)
    print evaluation_results
def window_overlap_test(window_overlap=2.):
    """  """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    # split to make experimentation quicker
    train_labels, train_images = get_subset_of_training_data(train_labels, train_images, split=0.5)

    training_size = len(train_labels)

    desc = "testing influence of window_overlap, set to {}. NB training size = {}".format(
        window_overlap,
        training_size
    )

    print desc

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          window_overlap=window_overlap,
                          fn_prefix="winoverlap-{}".format(window_overlap))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
        run_parallel_processors=True,
        params=params
    )

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images)
    )

    evaluator = Evaluator(
        clf=classifier.clf,
        label_encoder=classifier.le,
        params=params,
        output_filepath=SketchRecognitionTrainer.get_evaluation_filename_for_params(params=params)
    )

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = desc

    evaluation_results = evaluator.evaluate(X=test_images_codelabels, y=encoded_test_labels)
    print evaluation_results
def build_cookbook_and_featurevectors_for_model_tuning():
    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          feature_extractor=FeatureExtractorKeys.SIFT,
                          window_resolution=0.125,
                          window_overlap=2.0,
                          num_clusters=450,
                          image_size=256)

    trainer = SketchRecognitionTrainer(file_path=TRAIN_COOKBOOK_FILENAME,
                                       run_parallel_processors=True,
                                       params=params)

    # 1 - extract image feature vectors
    if os.path.isfile(TRAIN_FEATURES_FILENAME):
        train_image_features = np.load(TRAIN_FEATURES_FILENAME)
    else:
        train_image_features = trainer.extract_image_descriptors(train_images)
        np.save(TRAIN_FEATURES_FILENAME, train_image_features)

    # 2 - create codebook from feature vectors
    if not trainer.is_cookbook_available:
        trainer.create_codebook_from_image_descriptors(
            image_descriptors=train_image_features)

    # 3 - create codelabels (visual word histograms) for each image
    if os.path.isfile(TRAIN_CODE_LABELS_FILENAME):
        train_images_codelabels = np.load(TRAIN_CODE_LABELS_FILENAME)
    else:
        train_images_codelabels = trainer.code_labels_for_image_descriptors(
            train_image_features)
        np.save(TRAIN_CODE_LABELS_FILENAME,
                train_images_codelabels)  # features

    # 4 - save labels
    np.save(TRAIN_LABELS_FILENAME, train_labels)

    print "finished creating and saving feature data:\n- codebook: {}\n- train image code labels: {}" \
          "\n- train image labels: {}".format(
        "data/codebook_v1.dat",
        "data/train_image_codelabels_v1.npy",
        "data/train_image_labels_v1.npy"
    )
def training_size_test(split=0.5):
    """
    see what effect the training size has on the performance, initially take off 50%
    """
    print "test_2"

    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    train_labels, train_images = get_subset_of_training_data(train_labels,
                                                             train_images,
                                                             split=split)

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.le,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results[
        "desc"] = "After many iterations, this is a baseline for which tuning will benchmark from"

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)
    print evaluation_results
def sanity_check():
    """ baseline test, all all parameters from experimentation """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    train_labels, train_images = get_subset_of_training_data(train_labels,
                                                             train_images,
                                                             split=0.05)

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          fn_prefix="sanitycheck")

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.le,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results[
        "desc"] = "After many iterations, this is a baseline for which tuning will benchmark from"

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)
    print evaluation_results
def training_size_test(split=0.5):
    """
    see what effect the training size has on the performance, initially take off 50%
    """
    print "test_2"

    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    train_labels, train_images = get_subset_of_training_data(train_labels, train_images, split=split)

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
        run_parallel_processors=True,
        params=params
    )

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images)
    )

    evaluator = Evaluator(
        clf=classifier.clf,
        label_encoder=classifier.le,
        params=params,
        output_filepath=SketchRecognitionTrainer.get_evaluation_filename_for_params(params=params)
    )

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = "After many iterations, this is a baseline for which tuning will benchmark from"

    evaluation_results = evaluator.evaluate(X=test_images_codelabels, y=encoded_test_labels)
    print evaluation_results
def sanity_check():
    """ baseline test, all all parameters from experimentation """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    train_labels, train_images = get_subset_of_training_data(train_labels, train_images, split=0.05)

    selected_labels = list(set(train_labels))

    params = build_params(
        num_classes=len(selected_labels),
        training_size=len(train_images),
        test_size=len(test_images),
        fn_prefix="sanitycheck"
    )

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
        run_parallel_processors=True,
        params=params
    )

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images)
    )

    evaluator = Evaluator(
        clf=classifier.clf,
        label_encoder=classifier.le,
        params=params,
        output_filepath=SketchRecognitionTrainer.get_evaluation_filename_for_params(params=params)
    )

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = "After many iterations, this is a baseline for which tuning will benchmark from"

    evaluation_results = evaluator.evaluate(X=test_images_codelabels, y=encoded_test_labels)
    print evaluation_results
# Example #9 (scraped-page artifact, converted to a comment so the file parses)
# 0
def rebuild(train_labels,
            train_images,
            test_labels,
            test_images,
            params_prefix=None):
    """Rebuild the codebook and classifier with fixed parameters, then evaluate.

    Args:
        train_labels: labels for the training images.
        train_images: training images.
        test_labels: labels for the test images.
        test_images: test images.
        params_prefix: optional value stored into params["prefix"].
            NOTE(review): sibling functions pass fn_prefix to build_params
            instead of setting params["prefix"] afterwards -- confirm the
            key name matches what the trainer expects.
    """
    selected_labels = list(set(train_labels))

    # fixed hyper-parameters -- presumably the result of earlier tuning
    # runs (window_overlap_test, cluster_size_test, ...); TODO confirm
    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          window_resolution=0.125,
                          window_overlap=2.5,
                          num_clusters=400,
                          image_size=256)

    if params_prefix is not None:
        params["prefix"] = params_prefix

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    # NOTE(review): this uses classifier.label_encoder while the other
    # functions in this file use classifier.le -- verify which attribute
    # the classifier actually exposes.
    encoded_test_labels = classifier.label_encoder.transform(test_labels)

    # encode the test images as visual-word histograms for evaluation
    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.label_encoder,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)

    print evaluation_results
def cluster_size_test(num_clusters=200):
    """  """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          num_clusters=num_clusters)

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.le,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results[
        "desc"] = "testing influence of num_clusters, set to {}".format(
            num_clusters)

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)
    print evaluation_results
def cluster_size_test(num_clusters=200):
    """  """

    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          num_clusters=num_clusters)

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
        run_parallel_processors=True,
        params=params
    )

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images)
    )

    evaluator = Evaluator(
        clf=classifier.clf,
        label_encoder=classifier.le,
        params=params,
        output_filepath=SketchRecognitionTrainer.get_evaluation_filename_for_params(params=params)
    )

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = "testing influence of num_clusters, set to {}".format(num_clusters)

    evaluation_results = evaluator.evaluate(X=test_images_codelabels, y=encoded_test_labels)
    print evaluation_results
def clustering_algorithm_test(clustering='kmeans'):
    """Compare clustering algorithms used to build the visual-word codebook.

    Trains on half of the training data (for speed), swaps in the requested
    clustering implementation, then evaluates on the full test set and
    prints the results.

    clustering: "kmeans", "minibatchkmeans" or "meanshift".
        NOTE(review): any other value falls through and keeps the trainer's
        default clustering -- confirm that is intended.
    """

    from sklearn.cluster import KMeans
    from sklearn.cluster import MiniBatchKMeans
    import multiprocessing

    train_labels, train_images, test_labels, test_images = get_training_and_test_data(
    )

    # split to make experimentation quicker
    train_labels, train_images = get_subset_of_training_data(train_labels,
                                                             train_images,
                                                             split=0.5)

    training_size = len(train_labels)

    desc = "testing influence of different clustering algorithms, using {} for a training size of {}".format(
        clustering, training_size)

    print desc

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          fn_prefix="clustering-{}".format(clustering))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(
            params=params),
        run_parallel_processors=True,
        params=params)

    # swap in the requested clustering implementation
    if clustering == "kmeans":
        trainer.clustering = KMeans(init='k-means++',
                                    n_clusters=params[ParamKeys.NUM_CLUSTERS],
                                    n_init=10,
                                    max_iter=10,
                                    tol=1.0,
                                    n_jobs=multiprocessing.cpu_count()
                                    if trainer.run_parallel_processors else 1)
    elif clustering == "minibatchkmeans":
        trainer.clustering = MiniBatchKMeans(
            init='k-means++',
            n_clusters=params[ParamKeys.NUM_CLUSTERS],
            batch_size=100,
            n_init=10,
            max_no_improvement=10,
            verbose=0)
    elif clustering == "meanshift":
        # mean-shift uses a dedicated trainer subclass, so the trainer is
        # replaced entirely rather than just its clustering attribute
        trainer = MeanShiftSketchRecognitionTrainer(
            file_path=SketchRecognitionTrainer.
            get_cookbook_filename_for_params(params=params),
            run_parallel_processors=True,
            params=params)

    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    # encode test images as visual-word histograms for evaluation
    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images))

    evaluator = Evaluator(clf=classifier.clf,
                          label_encoder=classifier.le,
                          params=params,
                          output_filepath=SketchRecognitionTrainer.
                          get_evaluation_filename_for_params(params=params))

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = desc

    evaluation_results = evaluator.evaluate(X=test_images_codelabels,
                                            y=encoded_test_labels)
    print evaluation_results
def clustering_algorithm_test(clustering='kmeans'):
    """  """

    from sklearn.cluster import KMeans
    from sklearn.cluster import MiniBatchKMeans
    import multiprocessing

    train_labels, train_images, test_labels, test_images = get_training_and_test_data()

    # split to make experimentation quicker
    train_labels, train_images = get_subset_of_training_data(train_labels, train_images, split=0.5)

    training_size = len(train_labels)

    desc = "testing influence of different clustering algorithms, using {} for a training size of {}".format(
        clustering,
        training_size
    )

    print desc

    selected_labels = list(set(train_labels))

    params = build_params(num_classes=len(selected_labels),
                          training_size=len(train_images),
                          test_size=len(test_images),
                          fn_prefix="clustering-{}".format(clustering))

    trainer = SketchRecognitionTrainer(
        file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
        run_parallel_processors=True,
        params=params
    )

    if clustering == "kmeans":
        trainer.clustering = KMeans(
            init='k-means++',
            n_clusters=params[ParamKeys.NUM_CLUSTERS],
            n_init=10,
            max_iter=10,
            tol=1.0,
            n_jobs=multiprocessing.cpu_count() if trainer.run_parallel_processors else 1
        )
    elif clustering == "minibatchkmeans":
        trainer.clustering = MiniBatchKMeans(
            init='k-means++',
            n_clusters=params[ParamKeys.NUM_CLUSTERS],
            batch_size=100,
            n_init=10,
            max_no_improvement=10,
            verbose=0
        )
    elif clustering == "meanshift":
        trainer = MeanShiftSketchRecognitionTrainer(
            file_path=SketchRecognitionTrainer.get_cookbook_filename_for_params(params=params),
            run_parallel_processors=True,
            params=params
        )


    classifier = trainer.train_and_build_classifier(train_labels, train_images)
    encoded_test_labels = classifier.le.transform(test_labels)

    test_images_codelabels = trainer.code_labels_for_image_descriptors(
        trainer.extract_image_descriptors(test_images)
    )

    evaluator = Evaluator(
        clf=classifier.clf,
        label_encoder=classifier.le,
        params=params,
        output_filepath=SketchRecognitionTrainer.get_evaluation_filename_for_params(params=params)
    )

    # add timings to output
    evaluator.results["timings"] = {}
    for key, value in trainer.timings.iteritems():
        evaluator.results["timings"][key] = value

    # add comment
    evaluator.results["desc"] = desc

    evaluation_results = evaluator.evaluate(X=test_images_codelabels, y=encoded_test_labels)
    print evaluation_results