def max_pooling():
    # Seems to be a lot worse than the sum pooling
    # 2014-03-28 10:26:28 - Base - INFO - Cross validation completed in 1433.7291348. Scores:
    # 2014-03-28 10:26:28 - Base - INFO - [-0.11968588 -0.12018345]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='max')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_maxpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
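
# NOTE: get_images and train_kmeans_generator are helpers used throughout this file but are not
# defined in this section.  The sketch below is an assumption about what they do, based on how
# the longer functions (e.g. kmeans_006_submission) build the same pipeline by hand: crop/scale
# the training images, sample patches, and fit a KMeansFeatureGenerator.  The default values
# (400,000 patches, rf_size 5, 20 iterations) are taken from those functions and may not match
# the real helpers exactly; pool_method is assumed to be forwarded to the generator.
def get_images_sketch(crop=150, s=15):
    # Crop the raw training images to `crop` pixels and scale them down to `s` pixels
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    return train_x_crop_scale.transform()


def train_kmeans_generator_sketch(images, n_centroids=3000, pool_method='sum', n_patches=400000, rf_size=5):
    # Sample random patches from the images and learn `n_centroids` centroids with k-means
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                              n_iterations=20,
                                              pool_method=pool_method,  # assumed to be accepted here
                                              n_jobs=-1)
    kmeans_generator.fit(patches)
    return kmeans_generator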
def kmeans_006_colwise_rmse():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size=0.2, test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500, 'verbose': 1}, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    # About 11 minutes to train the ridge regression on an m2.4xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall RMSE was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with RMSE of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
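
# classes.rmse and classes.colwise_rmse come from the shared classes module, which isn't shown
# here.  A minimal sketch of what they are assumed to compute: the competition metric is RMSE
# over all 37 solution columns, and the column-wise variant is used above to see which classes
# the model does worst on.  These implementations are illustrative, not the actual code.
def rmse_sketch(pred, actual):
    # Overall root-mean-squared error across every prediction
    return np.sqrt(np.mean((pred - actual) ** 2))


def colwise_rmse_sketch(pred, actual):
    # RMSE computed separately for each output column
    return np.sqrt(np.mean((pred - actual) ** 2, axis=0))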
def mean_pooling():
    # Wow mean pooling is really bad
    # 2014-03-28 11:28:42 - Base - INFO - Cross validation completed in 1523.09399891. Scores:
    # 2014-03-28 11:28:42 - Base - INFO - [-0.13083991 -0.12989765]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='mean')

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_meanpool.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def extra_trees_submission():
    # Somehow the submission on the leaderboard scores 0.22
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_008.csv')
def kmeans_006_submission():
    # 2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487. Scores:
    # 2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]
    # Final submission
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
def extratress():
    # 2014-03-28 13:24:22 - Base - INFO - Cross validation completed in 1139.1731801. Scores:
    # 2014-03-28 13:24:22 - Base - INFO - [-0.11048638 -0.11060714]
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids, pool_method='sum')

    # Need something larger than the 15G RAM, since RAM usage seems to spike when recombining from parallel
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def rbm_001():
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    images = images.reshape((images.shape[0], 15 * 15 * 3))

    # RBM needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399. Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
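
# Quick sanity check of the patch counts mentioned in the comments above.  This assumes the
# feature transform slides an rf_size x rf_size window over the scaled image with the given
# stride, which is how the counts 31 x 31 = 961 and 16 x 16 = 256 come about.
def patches_per_image(scaled_size, rf_size, stride):
    per_side = (scaled_size - rf_size) // stride + 1
    return per_side * per_side

assert patches_per_image(50, 20, 1) == 31 * 31  # 961 windows at stride 1
assert patches_per_image(50, 20, 2) == 16 * 16  # 256 windows at stride 2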
def kmeans_006_submission():
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                              n_iterations=20,
                                              n_jobs=-1)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
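
# classes.Submission is part of the shared classes module and isn't shown here.  Based on how it
# is used above (constructed from the prediction matrix, then written with to_file), it is
# assumed to attach the test set galaxy IDs to the predicted columns and dump a CSV with a
# header row.  The attribute and column names below are illustrative assumptions, not the real
# implementation.
class SubmissionSketch(object):
    def __init__(self, predictions):
        self.predictions = predictions  # shape: (n_test_images, n_solution_columns)

    def to_file(self, filename):
        # Assumed format: one GalaxyID column followed by the predicted probability columns
        header = 'GalaxyID,' + ','.join(classes.train_solutions.class_names)  # hypothetical attribute
        ids = classes.test_ids.reshape(-1, 1)                                 # hypothetical attribute
        rows = np.hstack((ids, self.predictions))
        np.savetxt(filename, rows, delimiter=',', header=header, comments='')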
def rf_size_10():
    # Pretty bad as well
    # 2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999. Scores:
    # 2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 10

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def gradient_boosting_grid_search():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # This took maybe 30 minutes to run, 1200 components gives .9999999 variance explained.
    # Can maybe get away with 200 - .9937
    pca = RandomizedPCA(n_components=200)
    pca.fit(train_x)
    pca_train_x = pca.transform(train_x)

    # We'll focus on the columns that have high errors based on the analysis in kmeans_006.py
    params = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [0.01, 0.1, 1, 5, 10],
        'n_estimators': [100, 250, 500, 1000],
        'max_depth': [2, 3, 5, 10],
        'subsample': [0.2, 0.5, 1]
    }

    # Not sure why it just dies here, on CV too:
    #   File "/usr/lib/python2.7/multiprocessing/pool.py", line 319, in _handle_tasks
    #     put(task)
    #   SystemError: NULL result without error in PyObject_Call
    # Seems like the parallelization is broken.
    # Without parallelization it will run, but it is super slow, probably because of the high
    # dimensionality of train_x, which is n_centroids * 4 dimensions (12,000) because of the pooling.
    # It says something like 300 minutes to train 100 iterations.
    # After PCA, takes about 15 minutes on 15% of the dataset with 1200 features, but RMSE is .20
    wrapper = ModelWrapper(GradientBoostingRegressor, {'verbose': 2}, n_jobs=1)

    # SVR takes about 30 minutes on 15% of the sample, and the score is .19 on the 0th class,
    # compared to .15 on RidgeRFE.
    # Didn't scale/center, so maybe need to do that?
    # After scaling to (-1, 1), still the same RMSE
    wrapper = ModelWrapper(SVR, {}, n_jobs=1)
    scale = MinMaxScaler((-1, 1))
    scaled_train_x = scale.fit_transform(train_x)

    wrapper.grid_search(train_x, train_y[:, 0], params, sample=0.3, refit=False)
    wrapper.cross_validation(train_x, train_y[:, 0], params, sample=0.3)
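
# The comments in gradient_boosting_grid_search note that SVR was tried with and without
# scaling to (-1, 1).  Below is a small sketch of how the scaler could be chained with the
# estimator so that train and test data go through the same transform.  This uses sklearn's
# Pipeline and is a suggestion, not what the experiment above actually ran (it scaled train_x
# separately and fit on a single output column).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

svr_pipeline = Pipeline([
    ('scale', MinMaxScaler((-1, 1))),  # squash each k-means feature into [-1, 1]
    ('svr', SVR()),                    # fit one output column at a time, e.g. train_y[:, 0]
])
# Example usage (hypothetical): svr_pipeline.fit(train_x, train_y[:, 0])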
def ensemble():
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)

    # Stack the two prediction matrices side by side as features for the second-stage model.
    # hstack keeps one row per image; vstack would double the number of rows and no longer
    # line up with train_y.
    wrapper3 = ModelWrapper(Ridge)
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Ensembled error is .1149.  Kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_ensemble_001',
                                              n_iterations=20,
                                              n_jobs=-1)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    X = kmeans_generator.transform(images, save_to_file='data/data_ensemble_001.npy', memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # Get the input for the RF so that we can split together
    sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pX = sampler.transform()

    # Manual split of train and test
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = train_test_split(X, pX, Y, test_size=0.5)

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    kmeans_preds = wrapper.predict(test_x)

    pWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    pWrapper.fit(ptrain_x, train_y)
    pixel_preds = pWrapper.predict(ptest_x)

    logger.info('Kmeans')
    classes.colwise_rmse(kmeans_preds, test_y)
    classes.rmse(kmeans_preds, test_y)
    logger.info('Pixel RF')
    classes.colwise_rmse(pixel_preds, test_y)
    classes.rmse(pixel_preds, test_y)

    logger.info("Ensembling predictions")
    etrain_x = np.hstack((wrapper.predict(train_x), pWrapper.predict(ptrain_x)))
    etest_x = np.hstack((kmeans_preds, pixel_preds))

    eWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    eWrapper.fit(etrain_x, train_y)
    ensemble_preds = eWrapper.predict(etest_x)

    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    [(500000, False, array([-0.10799986, -0.10744586])),
     (500000, True, array([-0.10790803, -0.10733288])),
     (600000, False, array([-0.10812188, -0.10735988])),
     (600000, True, array([-0.10778652, -0.10752664]))]
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            s = 15
            crop = 150
            n_centroids = 1600
            rf_size = 5
            logger.info("Training with n_patches {}, with test images {}".format(n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                           result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                           crop_size=crop,
                                                           scaled_size=s,
                                                           n_jobs=-1,
                                                           memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                          result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                          crop_size=crop,
                                                          scaled_size=s,
                                                          n_jobs=-1,
                                                          memmap=True)

            kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                      rf_size=rf_size,
                                                      result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(n_patches, incl),
                                                      n_iterations=20,
                                                      n_jobs=-1)
            patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                                 patch_size=rf_size,
                                                                 n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
            logger.info("Extracting patches from images ndarray shape: {}".format(images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)
            del patches
            gc.collect()

            # Reload the original images
            images = train_x_crop_scale.transform()
            logger.info("Generating features on images ndarray shape: {}".format(images.shape))
            train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl), memmap=True)
            train_y = classes.train_solutions.data

            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
            wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    The first number in each tuple is the scaling; cropped to 200, with rf size of 5.
    75 scaling took forever to transform, so killed it.
    [(30, array([-0.11374265, -0.1134896 ])),
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set.
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
     (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse.

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003.

    It is possibly worth making a submission with scale 30 and rf size 10.
    """
    crops = [200]  # Should probably also add 250
    scales = [30, 50]  # Scaling is probably the most important part here

    scores = []
    for s in scales:
        crop = 200
        n_centroids = 1600
        n_patches = 400000
        # rf_size = int(round(s * .2))
        rf_size = 10
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        # Spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data

        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))

        del wrapper
        gc.collect()
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    for n_centroids in n_centroids_vals:
        s = 15
        crop = 150
        n_patches = 400000
        rf_size = 5
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1)
        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        patches = patch_extractor.transform(images)
        kmeans_generator.fit(patches)
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data

        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()
# Transform the training images into the features.
# Since we have 3,000 centroids with pooling over quadrants, we'll get 12,000 (3000 * 4) features
train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
train_y = classes.train_solutions.data

# Unload some objects for memory
del images
gc.collect()

# ModelWrapper is a convenience class that we created to automate some of the typical tasks
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator.
# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
wrapper.fit(train_x, train_y)

test_x_crop_scale = CropScaleImageTransformer(training=False,
                                              result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                              crop_size=crop,
                                              scaled_size=s,
                                              n_jobs=-1,
                                              memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features
test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
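
# The comment above explains that the 3,000 centroid activations are pooled over the four
# quadrants of each image, giving 3000 * 4 = 12,000 features.  Below is a minimal sketch of
# that pooling step, assuming the per-image activations come as an array of shape
# (rows, cols, n_centroids); the function name and the use of sum pooling (the default used
# elsewhere in this file) are assumptions about the internals of KMeansFeatureGenerator.
def pool_quadrants_sketch(activation_map):
    rows, cols, n_centroids = activation_map.shape
    half_r, half_c = rows // 2, cols // 2
    quadrants = [
        activation_map[:half_r, :half_c],   # top-left
        activation_map[:half_r, half_c:],   # top-right
        activation_map[half_r:, :half_c],   # bottom-left
        activation_map[half_r:, half_c:],   # bottom-right
    ]
    # Sum-pool each quadrant over its spatial dimensions, then concatenate:
    # the result has n_centroids * 4 values per image
    return np.hstack([q.sum(axis=(0, 1)) for q in quadrants])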
def kmeans_003():
    """
    Grid search for Ridge RF parameters
    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_150_scale_15.npy',
                                                   crop_size=150,
                                                   scaled_size=15,
                                                   n_jobs=-1,
                                                   memmap=True)

    # Spherical generator
    kmeans_generator = KMeansFeatureGenerator(n_centroids=1600,
                                              rf_size=5,
                                              result_path='data/mdl_kmeans_002_new',
                                              n_iterations=20,
                                              n_jobs=-1)
    # Minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1)

    # Don't need to fit, as already cached
    patches = ''
    kmeans_generator.fit(patches)

    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~11GB when threads return
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 14, 'n_estimators': 500}, n_jobs=-1)
    params = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250]
    }

    # 500 trees and alpha 25 gives CV of .10972 on 2-fold CV, but 25 was on the upper range of the
    # search space, so need to re-run with a larger range of alpha.
    # Will hit 30GB of RAM with 500 trees.
    wrapper.grid_search(train_x, train_y, params, refit=False, parallel_estimator=True)

    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    #  mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    #  mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    #  mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    #  mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    #  mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    #  mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    #  mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    #  mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    #  mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    #  mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_150_scale_15.npy',
                                                  crop_size=150,
                                                  scaled_size=15,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_kmeans_test_features_003_new.npy', memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_003.csv')