def assess(estimator,X,y):
    """Return the overall RMSE of ``estimator`` on ``(X, y)``.

    Both the predictions and ``y`` are passed through ``inverse_transform``
    of the scaler stored as "scaler1" in the galaxy data folder before the
    error is computed.  Inverse-transformed predictions below 1e-7 are set
    to zero.

    As a diagnostic, the five samples with the largest mean squared error
    are inspected and those whose per-sample RMSE exceeds 0.1 are printed
    together with ``int(X[i, 0])``.

    Returns the square root of the mean of the per-sample mean squared
    errors.
    """
    raw_predictions = estimator.predict(X)

    # The scaler is reloaded from disk on every call.
    response_scaler = joblib.load(galaxy.get_data_folder()+"/scaler1")
    predicted = response_scaler.inverse_transform(raw_predictions)
    predicted[predicted < 1e-7] = 0
    expected = response_scaler.inverse_transform(y)

    # One mean squared error per sample (averaged over axis 1).
    per_sample_mse = np.mean((predicted - expected)**2, axis=1)
    overall_rmse = math.sqrt(per_sample_mse.mean())
    per_sample_rmse = np.sqrt(per_sample_mse)

    # Report the worst offenders, largest error first.
    worst_first = np.argsort(per_sample_mse)[::-1]
    for idx in worst_first[:5]:
        if per_sample_rmse[idx] > 0.1:
            # NOTE(review): X[idx, 0] is presumably the galaxy id - confirm.
            print("%s %s" % (per_sample_rmse[idx], int(X[idx, 0])))

    return overall_rmse
folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_training_rev1/" else: folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_test_rev1/" f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv" files = glob(folder + "/*") print "will predict", len(files), "galaxies" print "from", folder points = Parallel(n_jobs=-1)(delayed(galaxy.get_features)(f) for f in files) mapping = galaxy.get_fieldnames() forest = joblib.load(galaxy.get_data_folder() + "/galaxy_forest") forest.set_params(n_jobs=1) scaler2 = joblib.load(galaxy.get_data_folder() + "/scaler2") points = scaler2.transform(points) predictions = forest.predict(points) scaler1 = joblib.load(galaxy.get_data_folder() + "/scaler1") predictions = scaler1.inverse_transform(predictions) # sparsify # min on training responses: 2.9099999e-06 predictions[predictions < 1e-7] = 0 #f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
print "will learn from", len(selected_ids), "galaxies for Class", Class,c,svm_class for transform in [0]:#xrange(3): points = Parallel(n_jobs=-1)(delayed(process_galaxy)(galaxy_id,transform=transform) for galaxy_id in selected_ids ) X.extend(points) Y.extend([svm_class for i in xrange(len(points))]) svm_class += 1 X = np.array(X,dtype='float') Y = np.array(Y,dtype='int') scaler = StandardScaler() X = scaler.fit_transform(X) joblib.dump( scaler, galaxy.get_data_folder()+"/scaler_statistics_Class"+ str(Class)+"_") print "got",len(X),"points" print np.bincount(Y,minlength=svm_class) # train best SVM clf = SVC( kernel='rbf', class_weight='auto', probability=True, C=10.0, gamma=0.005 ) clf.fit(X, Y) joblib.dump(clf, galaxy.get_data_folder()+"/svm_statistics_Class"+ str(Class)+"_") exit(0)
for transform in [0]: #xrange(3): points = Parallel(n_jobs=-1)( delayed(process_galaxy)(galaxy_id, transform=transform) for galaxy_id in selected_ids) X.extend(points) Y.extend([svm_class for i in xrange(len(points))]) svm_class += 1 X = np.array(X, dtype='float') Y = np.array(Y, dtype='int') scaler = StandardScaler() X = scaler.fit_transform(X) joblib.dump( scaler, galaxy.get_data_folder() + "/scaler_statistics_Class" + str(Class) + "_") print "got", len(X), "points" print np.bincount(Y, minlength=svm_class) # train best SVM clf = SVC(kernel='rbf', class_weight='auto', probability=True, C=10.0, gamma=0.005) clf.fit(X, Y) joblib.dump( clf, galaxy.get_data_folder() + "/svm_statistics_Class" + str(Class) + "_")
else: folder = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/images_test_rev1/" f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv" files = glob(folder+"/*") print "will predict", len(files), "galaxies" print "from", folder points = Parallel(n_jobs=-1)(delayed(galaxy.get_features)(f) for f in files ) mapping = galaxy.get_fieldnames() forest = joblib.load(galaxy.get_data_folder()+"/galaxy_forest") forest.set_params(n_jobs=1) scaler2 = joblib.load(galaxy.get_data_folder()+"/scaler2") points = scaler2.transform(points) predictions = forest.predict( points ) scaler1 = joblib.load(galaxy.get_data_folder()+"/scaler1") predictions = scaler1.inverse_transform(predictions) # sparsify # min on training responses: 2.9099999e-06 predictions[predictions<1e-7] = 0 #f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
return rmse def shuffle(a, b): """ http://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison """ assert len(a) == len(b) p = np.random.permutation(len(a)) return a[p], b[p] f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv" responses, ids = galaxy.read_responses( f ) scaler1 = StandardScaler() responses = scaler1.fit_transform(responses) joblib.dump( scaler1, galaxy.get_data_folder()+"/scaler1" ) # mapping = galaxy.get_classes() # selection = {} # for Class in xrange(1,12): # classes = np.nonzero(mapping==Class)[0] # for c in classes: # q = 0.95 # threshold = mquantiles( responses[:,c], q ) # tmp_selection = np.nonzero(responses[:,c]>=threshold)[0] # for i in tmp_selection: # selection[i] = 1 # tmp_responses = [] # tmp_ids = [] # for i in selection.keys():
def shuffle(a, b):
    """Return ``a`` and ``b`` reordered by one shared random permutation.

    Both arguments must have the same length and must accept an index
    array as subscript.  Approach taken from
    http://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    """
    size = len(a)
    assert size == len(b)
    order = np.random.permutation(size)
    return a[order], b[order]


# Read the responses and ids from the training solutions CSV.
f = "/vol/biomedic/users/kpk09/kaggle/galaxy/data/training_solutions_rev1.csv"
responses, ids = galaxy.read_responses(f)

# Fit a StandardScaler on the responses, replace them with the scaled
# values and store the fitted scaler as "scaler1" in the data folder.
scaler1 = StandardScaler()
responses = scaler1.fit_transform(responses)
joblib.dump(scaler1, galaxy.get_data_folder() + "/scaler1")

# Disabled selection code, kept as found:
# mapping = galaxy.get_classes()
# selection = {}
# for Class in xrange(1,12):
#     classes = np.nonzero(mapping==Class)[0]
#     for c in classes:
#         q = 0.95
#         threshold = mquantiles( responses[:,c], q )
#         tmp_selection = np.nonzero(responses[:,c]>=threshold)[0]
#         for i in tmp_selection:
#             selection[i] = 1
# tmp_responses = []
# tmp_ids = []
# for i in selection.keys():
rng = RandomState(0)

###############################################################################
def plot_gallery(title, images, n_col=n_col, n_row=n_row):
    """Draw ``images`` on an ``n_row`` x ``n_col`` grid and save the figure.

    Each image is reshaped to ``image_shape`` and shown in grey scale with
    a colour range symmetric around zero.  The figure is written to
    ``title + '.png'``.
    """
    pl.figure(figsize=(2. * n_col, 2.26 * n_row))
    pl.suptitle(title, size=16)

    # Subplot positions are 1-based, hence the enumerate start value.
    for position, component in enumerate(images, 1):
        pl.subplot(n_row, n_col, position)
        limit = max(component.max(), -component.min())
        pl.imshow(
            component.reshape(image_shape),
            cmap=pl.cm.gray,
            interpolation='nearest',
            vmin=-limit,
            vmax=limit,
        )
        pl.xticks(())
        pl.yticks(())

    pl.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
    # NOTE(review): the figure is not closed after saving.
    pl.savefig(title+'.png')

###############################################################################
# For i = 1..11: load the stored PCA, print its statistics, save a gallery
# of its leading components and save its mean as an image.
for i in xrange(1,12):
    #pca = joblib.load(galaxy.get_data_folder()+"/pca_"+str(i)+"_")
    pca_file = galaxy.get_data_folder()+"/pca_"+thumbnail+"_Class"+ str(i)+"_"
    pca = joblib.load(pca_file)

    print(pca.components_.shape)
    print(pca.explained_variance_ratio_)

    plot_gallery("Class_"+thumbnail+str(i), pca.components_[:n_components])

    mean_image = np.reshape(pca.mean_, image_shape)
    cv2.imwrite("mean_Class_"+thumbnail+str(i)+".png", mean_image)

#pl.show()
n_components = 50

# Fit a whitened randomized PCA on X and report how long it took.
print("Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                         X.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True)
pca.fit(X)
print("done in %0.3fs" % (time() - t0))
print(pca.explained_variance_ratio_)

# Project X onto the fitted components.
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_pca = pca.transform(X)
print("done in %0.3fs" % (time() - t0))

# save PCA
pca_path = galaxy.get_data_folder()+"/pca_color_Class"+ str(Class)+"_"
joblib.dump(pca, pca_path)

# train best SVM
svm_settings = dict(
    kernel='rbf',
    class_weight='auto',
    probability=True,
    C=5000.0,
    gamma=0.0001,
)
clf = SVC(**svm_settings)
clf.fit(X_pca, Y)

svm_path = galaxy.get_data_folder()+"/pca_color_SVM_Class" + str(Class)+"_"
joblib.dump(clf, svm_path)

# The script stops here; statements after this call are not executed.
exit(0)

###############################################################################
# Train a SVM classification model
n_components = 50

# Fit a whitened randomized PCA on X and report how long it took.
print("Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                         X.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True)
pca.fit(X)
print("done in %0.3fs" % (time() - t0))
print(pca.explained_variance_ratio_)

# Project X onto the fitted components.
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_pca = pca.transform(X)
print("done in %0.3fs" % (time() - t0))

# save PCA
pca_path = galaxy.get_data_folder()+"/pca_"+thumbnail+"_Class"+ str(Class)+"_"
joblib.dump(pca, pca_path)

# train best SVM
svm_settings = dict(
    kernel='rbf',
    class_weight='auto',
    probability=True,
    C=50000.0,
    gamma=5e-05,
)
clf = SVC(**svm_settings)
clf.fit(X_pca, Y)

svm_path = galaxy.get_data_folder()+"/pca_"+thumbnail+"_SVM_Class" + str(Class)+"_"
joblib.dump(clf, svm_path)

# The script stops here; statements after this call are not executed.
exit(0)

###############################################################################
# Train a SVM classification model
X.shape[0]) t0 = time() pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X) print "done in %0.3fs" % (time() - t0) print pca.explained_variance_ratio_ print "Projecting the input data on the eigenfaces orthonormal basis" t0 = time() X_pca = pca.transform(X) print "done in %0.3fs" % (time() - t0) # save PCA joblib.dump( pca, galaxy.get_data_folder() + "/pca_color_Class" + str(Class) + "_") # train best SVM clf = SVC(kernel='rbf', class_weight='auto', probability=True, C=5000.0, gamma=0.0001) clf.fit(X_pca, Y) joblib.dump( clf, galaxy.get_data_folder() + "/pca_color_SVM_Class" + str(Class) + "_") exit(0) ###############################################################################