Example #1
def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # Stack the two models' predictions side by side (column-wise) so the meta-model sees both
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
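The hstack call is the whole trick: each base model contributes its predictions as extra feature columns, and a plain Ridge meta-model is cross-validated on them. Below is a minimal, self-contained sketch of the same stacking idea using current scikit-learn, since ModelWrapper, models.Ridge.RidgeExtraTreesEstimator/RidgeRFEstimator, and classes.train_solutions are project-specific; the estimator choices and the stack_and_score helper are illustrative only. Like the snippet above, it stacks in-sample predictions, so out-of-fold predictions would be the safer choice in practice.

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

def stack_and_score(train_x, train_y):
    # Two base models standing in for the Ridge/ExtraTrees and Ridge/RF estimators above
    base1 = ExtraTreesRegressor(n_estimators=100, n_jobs=-1)
    base2 = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    base1.fit(train_x, train_y)
    base2.fit(train_x, train_y)

    # Each predict() returns (n_samples, n_outputs); hstack keeps the row count and
    # concatenates the two prediction blocks column-wise
    meta_features = np.hstack((base1.predict(train_x), base2.predict(train_x)))

    # Cross-validate a Ridge meta-model on the stacked predictions
    meta = Ridge(alpha=500)
    return cross_val_score(meta, meta_features, train_y, cv=2,
                           scoring='neg_mean_squared_error')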
Example #2
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891.  Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='mean')

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_meanpool.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #3
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
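Since ModelWrapper and classes are project-specific, here is a hedged, standalone sketch of the same preprocessing chain with plain scikit-learn: pixels scaled into [0, 1] (the range BernoulliRBM expects), 256 RBM components (the default used above), and the hidden-unit activations fed to a Ridge regressor. flattened_images is a placeholder for the (n_samples, s*s*3) array built above.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import BernoulliRBM
from sklearn.linear_model import Ridge

rbm_pipeline = Pipeline([
    ('scale', MinMaxScaler()),                                      # map raw pixels to [0, 1]
    ('rbm', BernoulliRBM(n_components=256, n_iter=10, verbose=1)),  # slow: ~80 s per iteration above
    ('ridge', Ridge(alpha=500)),                                    # regressor on the RBM activations
])
# rbm_pipeline.fit(flattened_images, train_y)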
Example #4
def extratress():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801.  Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]

    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #5
def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348.  Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images,
                                              n_centroids=n_centroids,
                                              pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_maxpool.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #6
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_007',
        n_iterations=20,
        n_jobs=-1,
    )
    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_007.npy',
        stride_size=stride,
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 250
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
    """
Example #7
def gradient_boosting_grid_search():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # This took maybe 30 minutes to run, 1200 components gives .9999999 variance explained.  Can maybe get
    # away with 200 - .9937
    pca = RandomizedPCA(n_components=200)
    pca.fit(train_x)
    pca_train_x = pca.transform(train_x)

    # We'll focus on the columns that have high errors based on the analysis in kmeans_006.py
    params = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [0.01, 0.1, 1, 5, 10],
        'n_estimators': [100, 250, 500, 1000],
        'max_depth': [2, 3, 5, 10],
        'subsample': [0.2, 0.5, 1]
    }
    # not sure why it just dies here, on CV too
    #   File "/usr/lib/python2.7/multiprocessing/pool.py", line 319, in _handle_tasks
    # put(task)
    # SystemError: NULL result without error in PyObject_Call
    # seems like the parallelization is broken

    # Without parallelization, will run, but is super slow, probably because of the high dimensionality
    # of the train_x, which is n_centroids * 4 dimensions (12000), because of the pooling
    # It says something like 300 minutes to train 100 iterations
    # After PCA, takes about 15 minutes on 15% of the dataset with 1200 features
    # But RMSE is .20
    # Grid search the gradient boosting params above on the PCA-reduced features
    gbm_wrapper = ModelWrapper(GradientBoostingRegressor, {'verbose': 2}, n_jobs=1)
    gbm_wrapper.grid_search(pca_train_x, train_y[:, 0], params, sample=0.3, refit=False)

    # SVR takes about 30 minutes on 15% of the sample, and score is .19 on 0th class, compared to .15 on RidgeRFE
    # Didn't scale/center at first; after scaling to (-1, 1), still the same RMSE
    svr_wrapper = ModelWrapper(SVR, {}, n_jobs=1)
    scale = MinMaxScaler((-1, 1))
    scaled_train_x = scale.fit_transform(train_x)

    svr_wrapper.cross_validation(scaled_train_x, train_y[:, 0], sample=0.3)
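For reference, a hedged sketch of the PCA-then-grid-search idea with plain, current scikit-learn (RandomizedPCA was later folded into PCA); the pca_gbm_grid_search name and the reduced grid are illustrative only. It fits a single output column, as above, after cutting the ~12,000 pooled k-means features down to 200 components.

from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

def pca_gbm_grid_search(train_x, train_y_col):
    # Reduce the pooled k-means features before the (slow) gradient boosting fit
    pca = PCA(n_components=200, svd_solver='randomized')
    reduced_x = pca.fit_transform(train_x)

    # A much smaller grid than the one above, just to show the mechanics
    grid = {'learning_rate': [0.01, 0.1],
            'max_depth': [2, 3],
            'n_estimators': [100, 250]}
    search = GridSearchCV(GradientBoostingRegressor(), grid,
                          scoring='neg_mean_squared_error', cv=2, n_jobs=1)
    search.fit(reduced_x, train_y_col)
    return search.best_params_, search.best_score_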
Example #8
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10  # the larger receptive field this experiment tests (cf. the _rf10 result paths below)

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #9
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1,)
    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)

    """
Example #10
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #11
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891.  Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='mean')

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_meanpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #12
def extratress():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801.  Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]

    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #13
def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348.  Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_maxpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Example #14
def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # Stack the two models' predictions side by side (column-wise) so the meta-model sees both
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
Example #15
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info("Training with n_patches {}, with test images {}".format(n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                           result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                           crop_size=crop,
                                                           scaled_size=s,
                                                           n_jobs=-1,
                                                           memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                          result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                          crop_size=crop,
                                                          scaled_size=s,
                                                          n_jobs=-1,
                                                          memmap=True)

            kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                      rf_size=rf_size,
                                                      result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(n_patches, incl),
                                                      n_iterations=20,
                                                      n_jobs=-1,)

            patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                                 patch_size=rf_size,
                                                                 n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info("Extracting patches from images ndarray shape: {}".format(images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info("Generating features on images ndarray shape: {}".format(images.shape))
            train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl), memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
            wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
Example #16
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(
            training=False,
            result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.
            format(n_centroids),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
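The loop above only logs and appends (n_centroids, cv_scores) tuples; a small helper (illustrative, not part of run.py) can reduce each tuple to its mean CV score and pick the best setting. The scores are negative RMSE, so less negative is better; on the results quoted in the docstring this picks 3000 centroids.

def best_setting(scores):
    # scores: list of (n_centroids, cv_scores) pairs as collected in kmeans_006
    summary = [(n, float(sum(cv)) / len(cv)) for n, cv in scores]
    return max(summary, key=lambda pair: pair[1])

# best_setting(scores) -> (3000, ~-0.10692) for the docstring results above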
Example #17
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
Example #18
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10  # the larger receptive field this experiment tests (cf. the _rf10 result paths below)

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_008_rf10',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_rf10.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Example #19
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info(
                "Training with n_patches {}, with test images {}".format(
                    n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(
                training=True,
                result_path='data/data_train_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(
                training=False,
                result_path='data/data_test_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)

            kmeans_generator = KMeansFeatureGenerator(
                n_centroids=n_centroids,
                rf_size=rf_size,
                result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(
                    n_patches, incl),
                n_iterations=20,
                n_jobs=-1,
            )

            patch_extractor = models.KMeansFeatures.PatchSampler(
                n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info(
                "Extracting patches from images ndarray shape: {}".format(
                    images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info(
                "Generating features on images ndarray shape: {}".format(
                    images.shape))
            train_x = kmeans_generator.transform(
                images,
                save_to_file=
                'data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(
                    n_patches, incl),
                memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
                'alpha': 500,
                'n_estimators': 250
            },
                                   n_jobs=-1)
            wrapper.cross_validation(train_x,
                                     train_y,
                                     n_folds=2,
                                     parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
Example #20
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  Scaling to 75 took forever to transform, so it was killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    crops = [200]  # Should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info(
            "Training with crop {}, scale {}, patch size {}, patches {}, centroids {}"
            .format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(
                s, rf_size),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.
            format(s, rf_size),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()
Example #21
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  Scaling to 75 took forever to transform, so it was killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    crops = [200]  # Should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()