def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)

    # Stack the two base-model predictions column-wise so each row still lines up with a row
    # of train_y; np.vstack would double the number of rows and no longer match train_y.
    wrapper3 = ModelWrapper(Ridge)
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
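# ensemble() above scores the meta-model on predictions the base models made on their own
# training data, which is optimistic. A minimal sketch of out-of-fold stacking instead, using
# plain scikit-learn estimators (RandomForestRegressor / ExtraTreesRegressor stand in for the
# Ridge* wrappers) and a newer scikit-learn than the rest of this file otherwise targets; it is
# illustrative only and not the ModelWrapper API.
def stacking_oof_sketch(train_x, train_y):
    import numpy as np
    from sklearn.model_selection import cross_val_predict  # sklearn.cross_validation in releases >= 0.16
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
    from sklearn.linear_model import Ridge

    base_models = [
        RandomForestRegressor(n_estimators=100, n_jobs=-1),
        ExtraTreesRegressor(n_estimators=100, n_jobs=-1),
    ]
    # Out-of-fold predictions: each row of meta_x comes from a model that never saw that row
    oof_preds = [cross_val_predict(m, train_x, train_y, cv=2) for m in base_models]
    meta_x = np.hstack(oof_preds)

    meta_model = Ridge(alpha=1.0)
    meta_model.fit(meta_x, train_y)
    return meta_model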
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891. Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='mean')

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_meanpool.npy', memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    # Not actually used in this experiment; the RBM works on the raw scaled pixels
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
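# The "256 components" note in rbm_001() matches scikit-learn's BernoulliRBM default
# (n_components=256). A minimal sketch that makes those choices explicit and chains the
# scaling and RBM steps in a Pipeline; the hyperparameter values are illustrative, not
# tuned for this dataset.
def rbm_pipeline_sketch(images_flat):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.neural_network import BernoulliRBM

    pipeline = Pipeline([
        ('scale', MinMaxScaler()),  # RBM expects inputs in [0, 1]
        ('rbm', BernoulliRBM(n_components=256,
                             learning_rate=0.06,
                             n_iter=10,
                             verbose=1)),
    ])
    # fit_transform returns the hidden-unit activations to use as features downstream
    return pipeline.fit_transform(images_flat)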
def extratrees():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801. Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348. Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_maxpool.npy', memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
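# The pool_method experiments above (sum / mean / max) only change how per-patch centroid
# activations are aggregated into one feature vector per image. A minimal NumPy sketch of
# quadrant pooling in the Coates & Ng style; the actual KMeansFeatureGenerator internals may
# differ, so treat the activation shape and the 2x2 grid here as assumptions.
def pool_activations_sketch(activations, pool_method='sum'):
    # activations: (n_rows, n_cols, n_centroids) activation map for a single image
    import numpy as np
    reducer = {'sum': np.sum, 'mean': np.mean, 'max': np.max}[pool_method]
    n_rows, n_cols, _ = activations.shape
    half_r, half_c = n_rows // 2, n_cols // 2
    quadrants = [activations[:half_r, :half_c],   # top-left
                 activations[:half_r, half_c:],   # top-right
                 activations[half_r:, :half_c],   # bottom-left
                 activations[half_r:, half_c:]]   # bottom-right
    # One pooled vector of length n_centroids per quadrant -> 4 * n_centroids features per image,
    # which is consistent with the "n_centroids * 4 dimensions" note in the grid search below.
    return np.hstack([reducer(q, axis=(0, 1)) for q in quadrants])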
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399. Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 = 256 patches, only twice as many patches
    stride = 2

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
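# The 961 / 256 patch counts in the kmeans_007 comments come from the usual sliding-window
# count, assuming the generator slides an rf_size x rf_size window with the given stride over
# the s x s scaled image (an assumption about KMeansFeatureGenerator, but the numbers line up).
def patches_per_image(s, rf_size, stride=1):
    per_side = (s - rf_size) // stride + 1
    return per_side * per_side

# patches_per_image(50, 20, stride=1) -> 31 * 31 = 961
# patches_per_image(50, 20, stride=2) -> 16 * 16 = 256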
def gradient_boosting_grid_search():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    # This took maybe 30 minutes to run; 1200 components gives .9999999 variance explained.
    # Can maybe get away with 200 (.9937)
    pca = RandomizedPCA(n_components=200)
    pca.fit(train_x)
    pca_train_x = pca.transform(train_x)

    # We'll focus on the columns that have high errors based on the analysis in kmeans_006.py
    params = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [0.01, 0.1, 1, 5, 10],
        'n_estimators': [100, 250, 500, 1000],
        'max_depth': [2, 3, 5, 10],
        'subsample': [0.2, 0.5, 1]
    }

    # Not sure why it just dies here, on CV too:
    #   File "/usr/lib/python2.7/multiprocessing/pool.py", line 319, in _handle_tasks
    #     put(task)
    #   SystemError: NULL result without error in PyObject_Call
    # Seems like the parallelization is broken.
    # Without parallelization it will run, but is super slow, probably because of the high
    # dimensionality of train_x, which is n_centroids * 4 dimensions (12000) because of the pooling.
    # It says something like 300 minutes to train 100 iterations.
    # After PCA, takes about 15 minutes on 15% of the dataset with 1200 features, but RMSE is .20
    wrapper = ModelWrapper(GradientBoostingRegressor, {'verbose': 2}, n_jobs=1)

    # SVR takes about 30 minutes on 15% of the sample, and score is .19 on 0th class, compared to .15 on RidgeRFE
    # Didn't Scale/Center, so maybe need to do that?
    # After scaling to -1, 1, still same RMSE
    # Note: this reassignment shadows the GradientBoostingRegressor wrapper above, and
    # pca_train_x / scaled_train_x are left unused -- this is scratch experimentation code
    wrapper = ModelWrapper(SVR, {}, n_jobs=1)
    scale = MinMaxScaler((-1, 1))
    scaled_train_x = scale.fit_transform(train_x)

    wrapper.grid_search(train_x, train_y[:, 0], params, sample=0.3, refit=False)
    wrapper.cross_validation(train_x, train_y[:, 0], params, sample=0.3)
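# The comments in gradient_boosting_grid_search() describe PCA-compressing the 12,000-dimensional
# pooled features before a gradient-boosting grid search on one output column
# (GradientBoostingRegressor only handles a single target at a time, hence train_y[:, 0]).
# A minimal sketch with plain scikit-learn and a deliberately small grid; this stands in for
# ModelWrapper.grid_search and is not its API.
def gbr_grid_search_sketch(train_x, train_y_col):
    from sklearn.decomposition import RandomizedPCA  # PCA(svd_solver='randomized') in newer scikit-learn
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions

    # Reduce dimensionality first so the boosting grid search is tractable
    pca = RandomizedPCA(n_components=200)
    reduced_x = pca.fit_transform(train_x)

    param_grid = {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 250],
        'max_depth': [2, 3],
    }
    search = GridSearchCV(GradientBoostingRegressor(),
                          param_grid,
                          scoring='mean_squared_error',  # 'neg_mean_squared_error' in newer scikit-learn
                          cv=2,
                          verbose=1)
    search.fit(reduced_x, train_y_col)
    return search.best_params_, search.best_score_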
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999. Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1)
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
     (500000, True, array([-0.10790803, -0.10733288])),
     (600000, False, array([-0.10812188, -0.10735988])),
     (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info("Training with n_patches {}, with test images {}".format(n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                           result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                           crop_size=crop,
                                                           scaled_size=s,
                                                           n_jobs=-1,
                                                           memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                          result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                          crop_size=crop,
                                                          scaled_size=s,
                                                          n_jobs=-1,
                                                          memmap=True)

            kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                      rf_size=rf_size,
                                                      result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(n_patches, incl),
                                                      n_iterations=20,
                                                      n_jobs=-1)
            patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                                 patch_size=rf_size,
                                                                 n_jobs=-1)

            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info("Extracting patches from images ndarray shape: {}".format(images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)
            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info("Generating features on images ndarray shape: {}".format(images.shape))

            train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl), memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
            wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1)
        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)
        kmeans_generator.fit(patches)
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5. 75 scaling took forever to transform, so killed.
    [(30, array([-0.11374265, -0.1134896 ])),
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set.
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
     (50, array([-0.10840914, -0.10868195]))]

    Interesting that scale size of 50 does worse.
    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003.
    It is possibly worth making a submission with scale 30 and rf size 10.
    """
    crops = [200]  # Should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1)
        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))

        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()