Example #1
def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348.  Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_maxpool.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
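A note on pool_method: in this Coates-style pipeline, each image's per-centroid activation map is collapsed into quadrant features, so the choice of max/mean/sum pooling is what these experiments vary. A minimal numpy sketch of quadrant pooling, with hypothetical names (the real KMeansFeatureGenerator may differ):

import numpy as np

def pool_quadrants(feature_map, method='sum'):
    # feature_map: (H, W, n_centroids) activation map for one image.
    # Reduce each of the four image quadrants to an n_centroids vector,
    # then concatenate: 4 * n_centroids features per image.
    reduce_fn = {'sum': np.sum, 'mean': np.mean, 'max': np.max}[method]
    h, w = feature_map.shape[0] // 2, feature_map.shape[1] // 2
    quadrants = [feature_map[:h, :w], feature_map[:h, w:],
                 feature_map[h:, :w], feature_map[h:, w:]]
    return np.concatenate([reduce_fn(q, axis=(0, 1)) for q in quadrants])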
Example #2
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size=0.2, test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500, 'verbose': 1}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)

    """
Example #3
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891.  Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='mean')

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_meanpool.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #4
def extra_trees_submission():
    # Somehow the submission on the leaderboard scores 0.22
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_008.csv')
Example #5
def kmeans_006_submission():
    # 2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    # 2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]

    # Final submission
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Example #6
def extratress():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801.  Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]

    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #7
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
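As an aside, the scale-then-RBM feature step above can be written as a single sklearn Pipeline; BernoulliRBM defaults to n_components=256, which matches the "256 components" comment. A sketch:

from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Scale pixels into [0, 1] (the RBM requires this), then extract RBM features
rbm_features = Pipeline([
    ('scale', MinMaxScaler()),
    ('rbm', BernoulliRBM(verbose=1)),
])
# train_x = rbm_features.fit_transform(images)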
Example #8
File: run.py Project: lixunlove/galaxy-zoo
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_007',
        n_iterations=20,
        n_jobs=-1,
    )
    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_007.npy',
        stride_size=stride,
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 250
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
    """
Example #9
def kmeans_006_submission():
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Example #10
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10  # rf size 10 is what this experiment tests (per the function name and the rf10 result paths)

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #11
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1,)
    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)

    """
Example #12
def kmeans_006_submission():
    # 2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    # 2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]

    # Final submission
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Example #13
def extra_trees_submission():
    # Somehow the submission on the leaderboard scores 0.22
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_008.csv')
Example #14
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #15
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891.  Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='mean')

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_meanpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #16
def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348.  Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_maxpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #17
def extratress():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801.  Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]

    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #18
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x,
                                                        train_y,
                                                        train_size=0.2,
                                                        test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500,
        'verbose': 1
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
    """
Example #19
def gradient_boosting_grid_search():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # This took maybe 30 minutes to run, 1200 components gives .9999999 variance explained.  Can maybe get
    # away with 200 - .9937
    pca = RandomizedPCA(n_components=200)
    pca.fit(train_x)
    pca_train_x = pca.transform(train_x)

    # We'll focus on the columns that have high errors based on the analysis in kmeans_006.py
    params = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [0.01, 0.1, 1, 5, 10],
        'n_estimators': [100, 250, 500, 1000],
        'max_depth': [2, 3, 5, 10],
        'subsample': [0.2, 0.5, 1]
    }
    # not sure why it just dies here, on CV too
    #   File "/usr/lib/python2.7/multiprocessing/pool.py", line 319, in _handle_tasks
    # put(task)
    # SystemError: NULL result without error in PyObject_Call
    # seems like the parallelization is broken

    # Without parallelization, will run, but is super slow, probably because of the high dimensionality
    # of the train_x, which is n_centroids * 4 dimensions (12000), because of the pooling
    # It says something like 300 minutes to train 100 iterations
    # After PCA, takes about 15 minutes on 15% of the dataset with 1200 features
    # But RMSE is .20
    wrapper = ModelWrapper(GradientBoostingRegressor, {'verbose': 2}, n_jobs=1)

    # SVR takes about 30 minutes on 15% of the sample, and score is .19 on 0th class, compared to .15 on RidgeRFE
    # Didn't Scale/Center, so maybe need to do that?
    # After scaling to -1, 1, still same RMSE
    svr_wrapper = ModelWrapper(SVR, {}, n_jobs=1)  # separate name so the grid search below still uses the GBM wrapper
    scale = MinMaxScaler((-1, 1))
    scaled_train_x = scale.fit_transform(train_x)

    wrapper.grid_search(train_x, train_y[:, 0], params, sample=0.3, refit=False)

    wrapper.cross_validation(train_x, train_y[:, 0], params, sample=0.3)
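The variance-explained figures quoted in the comments would come from summing the fitted PCA's per-component ratios, e.g.:

import numpy as np

# Fraction of variance retained by the n_components chosen above
print(np.sum(pca.explained_variance_ratio_))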
Example #20
def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # hstack (not vstack) keeps one row per sample, concatenating the two
    # models' predictions column-wise as inputs for the second-level Ridge
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
Example #21
File: run.py Project: lixunlove/galaxy-zoo
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_ensemble_001',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    X = kmeans_generator.transform(images,
                                   save_to_file='data/data_ensemble_001.npy',
                                   memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # Get the input for the RF so that we can split together
    sampler = SampleTransformer(training=True,
                                steps=2,
                                step_size=20,
                                n_jobs=-1)
    pX = sampler.transform()

    # manual split of train and test
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = train_test_split(
        X, pX, Y, test_size=0.5)

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    kmeans_preds = wrapper.predict(test_x)

    pWrapper = ModelWrapper(RandomForestRegressor, {
        'n_estimators': 500,
        'verbose': 3
    },
                            n_jobs=-1)
    pWrapper.fit(ptrain_x, train_y)
    pixel_preds = pWrapper.predict(ptest_x)

    logger.info('Kmeans')
    classes.colwise_rmse(kmeans_preds, test_y)
    classes.rmse(kmeans_preds, test_y)
    logger.info('Pixel RF')
    classes.colwise_rmse(pixel_preds, test_y)
    classes.rmse(pixel_preds, test_y)

    logger.info("Ensembling predictions")
    etrain_x = np.hstack(
        (wrapper.predict(train_x), pWrapper.predict(ptrain_x)))
    etest_x = np.hstack((kmeans_preds, pixel_preds))
    eWrapper = ModelWrapper(RandomForestRegressor, {
        'n_estimators': 500,
        'verbose': 3
    },
                            n_jobs=-1)
    eWrapper.fit(etrain_x, train_y)
    ensemble_preds = eWrapper.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Example #22
File: run.py Project: lixunlove/galaxy-zoo
def kmeans_006_submission():
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Example #23
File: run.py Project: lixunlove/galaxy-zoo
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info(
                "Training with n_patches {}, with test images {}".format(
                    n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(
                training=True,
                result_path='data/data_train_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(
                training=False,
                result_path='data/data_test_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)

            kmeans_generator = KMeansFeatureGenerator(
                n_centroids=n_centroids,
                rf_size=rf_size,
                result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(
                    n_patches, incl),
                n_iterations=20,
                n_jobs=-1,
            )

            patch_extractor = models.KMeansFeatures.PatchSampler(
                n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info(
                "Extracting patches from images ndarray shape: {}".format(
                    images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info(
                "Generating features on images ndarray shape: {}".format(
                    images.shape))
            train_x = kmeans_generator.transform(
                images,
                save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl),
                memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
                'alpha': 500,
                'n_estimators': 250
            },
                                   n_jobs=-1)
            wrapper.cross_validation(train_x,
                                     train_y,
                                     n_folds=2,
                                     parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
Example #24
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  75 scaling took forever to transform, so killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    crops = [200]  # Should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()
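For reference, feature generators like this are typically built on the Coates & Ng spherical k-means "triangle" encoding, where each patch is scored against every centroid and below-average distances become activations. A sketch, not necessarily the exact code in KMeansFeatureGenerator:

import numpy as np

def triangle_encode(patches, centroids):
    # patches: (n, d) patch vectors; centroids: (k, d)
    dists = np.sqrt(((patches[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2))
    # f_k = max(0, mean_j(z_j) - z_k): only closer-than-average centroids activate
    return np.maximum(0, dists.mean(axis=1, keepdims=True) - dists)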
Example #25
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(
            training=False,
            result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
Example #26
# Transform the training images into features. This snippet begins mid-call;
# the full call is reconstructed from the identical walkthrough in Example #28.
train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
train_y = classes.train_solutions.data

# Unload some objects for memory
del images
gc.collect()

# ModelWrapper is a convenience class that we created to automate some of the typical tasks
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator

# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
    'alpha': 500,
    'n_estimators': 500
},
                       n_jobs=-1)
wrapper.fit(train_x, train_y)

test_x_crop_scale = CropScaleImageTransformer(
    training=False,
    result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
    crop_size=crop,
    scaled_size=s,
    n_jobs=-1,
    memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()
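The comments above describe models.Ridge.RidgeRFEstimator as ridge-then-random-forest. A hypothetical reconstruction from that description (the real class in the repo may differ):

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

class RidgeRFEstimatorSketch(BaseEstimator):
    def __init__(self, alpha=500, n_estimators=500):
        self.alpha = alpha
        self.n_estimators = n_estimators

    def fit(self, X, y):
        # Stage 1: ridge regression on the k-means features
        self.ridge_ = Ridge(alpha=self.alpha).fit(X, y)
        # Stage 2: random forest trained on the ridge predictions
        self.rf_ = RandomForestRegressor(n_estimators=self.n_estimators)
        self.rf_.fit(self.ridge_.predict(X), y)
        return self

    def predict(self, X):
        return self.rf_.predict(self.ridge_.predict(X))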
Example #27
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_ensemble_001',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    X = kmeans_generator.transform(images, save_to_file='data/data_ensemble_001.npy', memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # Get the input for the RF so that we can split together
    sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pX = sampler.transform()

    # manual split of train and test
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = train_test_split(X, pX, Y, test_size=0.5)

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    kmeans_preds = wrapper.predict(test_x)

    pWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    pWrapper.fit(ptrain_x, train_y)
    pixel_preds = pWrapper.predict(ptest_x)

    logger.info('Kmeans')
    classes.colwise_rmse(kmeans_preds, test_y)
    classes.rmse(kmeans_preds, test_y)
    logger.info('Pixel RF')
    classes.colwise_rmse(pixel_preds, test_y)
    classes.rmse(pixel_preds, test_y)

    logger.info("Ensembling predictions")
    etrain_x = np.hstack((wrapper.predict(train_x), pWrapper.predict(ptrain_x)))
    etest_x = np.hstack((kmeans_preds, pixel_preds))
    eWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    eWrapper.fit(etrain_x, train_y)
    ensemble_preds = eWrapper.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Example #28
# Transform the training images into the features.
# Since we have 3,000 centroids with pooling over quadrants, we'll get 12,000 (3000 * 4) features
train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
train_y = classes.train_solutions.data

# Unload some objects for memory
del images
gc.collect()

# ModelWrapper is a convenience class that we created to automate some of the typical tasks
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator

# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
wrapper.fit(train_x, train_y)

test_x_crop_scale = CropScaleImageTransformer(training=False,
                                              result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                              crop_size=crop,
                                              scaled_size=s,
                                              n_jobs=-1,
                                              memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features
test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
Example #29
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info("Training with n_patches {}, with test images {}".format(n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                           result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                           crop_size=crop,
                                                           scaled_size=s,
                                                           n_jobs=-1,
                                                           memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                          result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                          crop_size=crop,
                                                          scaled_size=s,
                                                          n_jobs=-1,
                                                          memmap=True)

            kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                      rf_size=rf_size,
                                                      result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(n_patches, incl),
                                                      n_iterations=20,
                                                      n_jobs=-1,)

            patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                                 patch_size=rf_size,
                                                                 n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info("Extracting patches from images ndarray shape: {}".format(images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info("Generating features on images ndarray shape: {}".format(images.shape))
            train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl), memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
            wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
Example #30
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
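A note on reading the sweep above: cv_scores holds negative RMSEs, so the mean closest to zero wins. A minimal sketch of picking the winner from the accumulated scores list (the (n_centroids, cv_scores) tuple shape is assumed from the logging above; the helper itself is illustrative, not project code):

import numpy as np

def best_setting(scores):
    # Each entry is (n_centroids, cv_scores) with negative-RMSE folds,
    # so the largest mean (closest to zero) marks the best setting.
    return max(scores, key=lambda entry: np.mean(entry[1]))

# On the docstring results above this picks 3000 centroids, though the
# gain over 2500 is marginal.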
Example #31
def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # Combine the two prediction matrices column-wise so each row still maps
    # to one sample; np.vstack would double the row count and misalign with
    # train_y.
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
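Why hstack rather than vstack in the blending step above: each base model predicts an (n_samples, 37) matrix, and the second-level Ridge needs one row per sample with both models' outputs side by side as features. A self-contained sketch with made-up shapes (the 37 columns mirror the Galaxy Zoo solution format; everything else is illustrative):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
n_samples, n_targets = 100, 37
pred1 = rng.rand(n_samples, n_targets)   # base model 1 predictions
pred2 = rng.rand(n_samples, n_targets)   # base model 2 predictions
train_y = rng.rand(n_samples, n_targets)

stacked = np.hstack((pred1, pred2))      # (100, 74): one row per sample
# np.vstack would give (200, 37) and no longer align with train_y
blender = Ridge(alpha=1.0).fit(stacked, train_y)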
Example #32
File: run.py Project: lixunlove/galaxy-zoo
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  Scaling to 75 took forever to transform, so it was killed.
    [(30, array([-0.11374265, -0.1134896 ])),
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150 here, so this is not really an apples-to-apples comparison with kmeans_003.

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    crops = [200]  # Unused below (crop is hard-coded to 200); should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info(
            "Training with crop {}, scale {}, patch size {}, patches {}, centroids {}"
            .format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(
                s, rf_size),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.
            format(s, rf_size),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()
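One way to see why scale and rf size interact in the timings above: with stride-1 convolution of the centroids over an s x s image, there are (s - rf + 1)^2 window positions per image before pooling. A quick sketch of that arithmetic (standard for Coates/Ng-style k-means feature pipelines, which this code resembles; not taken from the project itself):

def window_positions(scaled_size, rf_size):
    # Number of rf_size x rf_size windows that fit in a scaled_size x
    # scaled_size image with stride 1.
    per_side = scaled_size - rf_size + 1
    return per_side * per_side

for s, rf in [(30, 5), (30, 10), (50, 10)]:
    print(s, rf, window_positions(s, rf))
# (30, 5) -> 676, (30, 10) -> 441, (50, 10) -> 1681

The roughly 4x jump from (30, 10) to (50, 10) lines up with the docstring's 25-minute vs 90-minute feature extraction times.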
Example #33
def kmeans_003():
    """
    Grid search for Ridge RF parameters
    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_150_scale_15.npy',
                                                   crop_size=150,
                                                   scaled_size=15,
                                                   n_jobs=-1,
                                                   memmap=True)

    # spherical generator
    kmeans_generator = KMeansFeatureGenerator(n_centroids=1600,
                                              rf_size=5,
                                              result_path='data/mdl_kmeans_002_new',
                                              n_iterations=20,
                                              n_jobs=-1,)

    # minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1,)

    # Don't need to fit, as the centroids are already cached at result_path;
    # presumably fit() just reloads the cached model, so a dummy value stands
    # in for real patches.
    patches = ''
    kmeans_generator.fit(patches)
    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~11GB when the worker threads return
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()
    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 14, 'n_estimators': 500}, n_jobs=-1)
    params = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250]
    }

    # 500 trees with alpha 25 gives a CV score of .10972 on 2-fold CV, but 25 was at the upper end of the search space,
    # so it needs re-running with a larger range of alpha.
    # Will hit 30GB of RAM with 500 trees.
    wrapper.grid_search(train_x, train_y, params, refit=False, parallel_estimator=True)

    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_150_scale_15.npy',
                                                  crop_size=150,
                                                  scaled_size=15,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_kmeans_test_features_003_new.npy', memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_003.csv')
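The grid above bottoms out around alpha 500 and is nearly flat from 250 to 1000, which is why the final fit above uses alpha=500. A small illustrative helper for pulling the best alpha out of results shaped like that comment block (the list literal is transcribed from the comment; the helper is not project code):

alpha_results = [
    (20.0, -0.11024), (25.0, -0.11000), (35, -0.10969), (50, -0.10934),
    (75, -0.10892), (100, -0.10860), (150, -0.10828), (250, -0.10789),
    (500, -0.10775), (750, -0.10779), (1000, -0.10784),
]
best_alpha, best_mean = max(alpha_results, key=lambda r: r[1])
# best_alpha == 500, best_mean == -0.10775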
Example #35
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_008_rf10',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_rf10.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)