def predict_example():
    """Train an SVR photo-z model on one catalog and evaluate it on another.

    NOTE: the original author flagged this example as having unresolved
    issues with the prediction; treat its results with caution.

    Steps performed:
      1. Read the test catalog (``DR7DR14Q_flux_cat.hdf5``) and the training
         catalog (``brightqsos_2.hdf5``) from ``../class_photoz/data/``.
      2. Rename the training catalog's ``obsFlux_<band>`` /
         ``obsFluxErr_<band>`` columns to ``<band>`` / ``sigma_<band>``.
      3. Replace ``+inf`` by NaN in both catalogs and pass them through
         ``qs.prepare_flux_ratio_catalog``.
      4. Fit/predict with ``svr.svm_reg_predict`` (label column ``'z'``),
         storing the prediction in the ``'svm_photoz'`` column.
      5. Print summary statistics, evaluate against the test column ``'Z'``
         and show the diagnostic plots.

    Takes no arguments and returns ``None``; it ends by calling
    ``plt.show()``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_test = pd.read_hdf('../class_photoz/data/DR7DR14Q_flux_cat.hdf5',
                          'data')
    # Alternative training set tried before: DR7DR14Q_flux_cat.hdf5
    df_train = pd.read_hdf('../class_photoz/data/brightqsos_2.hdf5', 'data')

    # Bands currently disabled: TMASS_j, TMASS_h, TMASS_ks, WISE_w3
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # frac=1.0 keeps every row (sample() still shuffles the row order);
    # lower the fraction to try the pipeline on a subset first.
    df_train = df_train.sample(frac=1.0)

    # Optional redshift cuts that were tried: test 'Z > 1.1', train 'z > 1.1'

    # Build one rename mapping for all bands instead of renaming in place
    # twice per band.
    flux_renames = {}
    for band in passband_names:
        flux_renames['obsFlux_' + band] = band
        flux_renames['obsFluxErr_' + band] = 'sigma_' + band
    df_train = df_train.rename(columns=flux_renames)

    # Only +inf is replaced here; -inf values (if any) are left untouched.
    df_test = df_test.replace(np.inf, np.nan)
    df_train = df_train.replace(np.inf, np.nan)

    # The feature list returned by the helper is not used: an explicit
    # feature list is chosen below.
    df_test, _ = qs.prepare_flux_ratio_catalog(df_test, passband_names)
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s %s' % (df_test.shape, df_train.shape))

    # ----------------------------------------------------------------------
    # Support Vector Regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    # Alternative feature set including 2MASS bands:
    # ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz', 'zj', 'jh',
    #  'hks', 'ksw1', 'w1w2']

    label = 'z'

    # NOTE(review): presumably forwarded to scikit-learn's SVR -- confirm in
    # svr.svm_reg_predict.
    params = {
        'kernel': 'rbf',
        'C': 1.0,
        'gamma': 0.001,
        'epsilon': 0.2,
        'cache_size': 1200,
    }

    df_test = svr.svm_reg_predict(df_train, df_test, features, label,
                                  params, 'svm_photoz')

    print(df_test['svm_photoz'].describe())

    # The test catalog stores the reference redshift in column 'Z'.
    ml_an.evaluate_regression(df_test['Z'], df_test['svm_photoz'])
    pz_an.plot_redshifts(df_test['Z'], df_test['svm_photoz'])
    pz_an.plot_error_hist(df_test['Z'], df_test['svm_photoz'])

    plt.show()
def simqso_predict_dr7dr12():
    """Train a random forest on ``brightqsos_sim_2k_new`` and test on DR7DR12Q.

    Steps performed:
      1. Read the test catalog (``DR7DR12Q_clean_flux_cat.hdf5``) and the
         training catalog (``brightqsos_sim_2k_new.hdf5`` -- presumably
         simulated quasars, judging by the file and function names) from
         ``../class_photoz/data/``.
      2. Keep only test rows with ``0.3 < Z_VI < 5.5``.
      3. Rename the training catalog's ``obsFlux_<band>`` /
         ``obsFluxErr_<band>`` columns to ``<band>`` / ``sigma_<band>``.
      4. Replace ``+inf`` by NaN in both catalogs and pass them through
         ``qs.prepare_flux_ratio_catalog``.
      5. Fit/predict with ``rf.rf_reg_predict`` (label column ``'z'``),
         storing the prediction in the ``'rf_photoz'`` column.
      6. Evaluate against the test column ``'Z_VI'`` and show the plots.

    Takes no arguments and returns ``None``; it ends by calling
    ``plt.show()``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_test = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    df_train = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Bands currently disabled: TMASS_j, TMASS_h, TMASS_ks, WISE_w3
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    df_test = df_test.query('0.3 < Z_VI < 5.5')

    # Optional cuts that were tried:
    #   train: 'obsMag_SDSS_i <= 18.5', 'z > 1.1'
    #   test:  'SDSS_mag_i <= 18.5'

    # Build one rename mapping for all bands instead of renaming in place
    # twice per band.
    flux_renames = {}
    for band in passband_names:
        flux_renames['obsFlux_' + band] = band
        flux_renames['obsFluxErr_' + band] = 'sigma_' + band
    df_train = df_train.rename(columns=flux_renames)

    # Reassign instead of mutating in place: df_test is the result of a
    # query(), and in-place edits on such a frame may be flagged by pandas
    # with SettingWithCopyWarning. Only +inf is replaced; -inf is untouched.
    df_test = df_test.replace(np.inf, np.nan)
    df_train = df_train.replace(np.inf, np.nan)

    # The feature list returned by the helper is not used: an explicit
    # feature list is chosen below.
    df_test, _ = qs.prepare_flux_ratio_catalog(df_test, passband_names)
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s %s' % (df_test.shape, df_train.shape))

    # ----------------------------------------------------------------------
    # Random Forest Regression (single fit with fixed hyper-parameters)
    # ----------------------------------------------------------------------
    # Alternative feature sets that were tried:
    # ['SDSS_u', 'SDSS_i', 'SDSS_r', 'SDSS_z', 'SDSS_g', 'WISE_w1', 'WISE_w2']
    # ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    # ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz', 'zj', 'jh',
    #  'hks', 'ksw1', 'w1w2']
    features = ['ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    label = 'z'
    rand_state = 1

    # NOTE(review): presumably forwarded to scikit-learn's
    # RandomForestRegressor -- confirm in rf.rf_reg_predict.
    params = {
        'n_estimators': 300,
        'max_depth': 30,
        'min_samples_split': 4,
        'n_jobs': 2,
        'random_state': rand_state,
    }

    df_test = rf.rf_reg_predict(df_train, df_test, features, label,
                                params, 'rf_photoz')

    # The test catalog stores the reference redshift in column 'Z_VI'.
    ml_an.evaluate_regression(df_test['Z_VI'], df_test['rf_photoz'])
    pz_an.plot_redshifts(df_test['Z_VI'], df_test['rf_photoz'])
    pz_an.plot_error_hist(df_test['Z_VI'], df_test['rf_photoz'])

    plt.show()
def dr7dr12_predict_simqso():
    """Train an SVR on DR7DR12Q and test on ``brightqsos_sim_2k_new``.

    Steps performed:
      1. Read the training catalog (``DR7DR12Q_clean_flux_cat.hdf5``) and the
         test catalog (``brightqsos_sim_2k_new.hdf5`` -- presumably simulated
         quasars, judging by the file and function names) from
         ``../class_photoz/data/``.
      2. Keep only training rows with ``0.3 < Z_VI < 5.5`` and
         ``SDSS_mag_i <= 18.5``, and test rows with
         ``obsMag_SDSS_i <= 18.5``.
      3. Rename the test catalog's ``obsFlux_<band>`` /
         ``obsFluxErr_<band>`` columns to ``<band>`` / ``sigma_<band>``.
      4. Replace ``+inf`` by NaN in both catalogs and pass them through
         ``qs.prepare_flux_ratio_catalog``.
      5. Fit/predict with ``svr.svm_reg_predict`` (label column ``'Z_VI'``),
         storing the prediction in the ``'svm_photoz'`` column.
      6. Evaluate against the test column ``'z'`` and show the plots.

    Takes no arguments and returns ``None``; it ends by calling
    ``plt.show()``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    df_test = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Bands currently disabled: TMASS_j, TMASS_h, TMASS_ks, WISE_w3
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Reassign the filtered frames instead of using query(inplace=True) on
    # an already-filtered frame, which pandas may flag with
    # SettingWithCopyWarning. The selected rows are the same.
    df_train = df_train.query('0.3 < Z_VI < 5.5')
    df_test = df_test.query('obsMag_SDSS_i <= 18.5')
    df_train = df_train.query('SDSS_mag_i <= 18.5')

    # Optional cut that was tried on the training set: 'z > 1.1'

    # Here the *test* catalog carries the obsFlux_/obsFluxErr_ column names,
    # so it is the one being renamed. One mapping covers all bands.
    flux_renames = {}
    for band in passband_names:
        flux_renames['obsFlux_' + band] = band
        flux_renames['obsFluxErr_' + band] = 'sigma_' + band
    df_test = df_test.rename(columns=flux_renames)

    # Only +inf is replaced here; -inf values (if any) are left untouched.
    df_test = df_test.replace(np.inf, np.nan)
    df_train = df_train.replace(np.inf, np.nan)

    # The feature list returned by the helper is not used: an explicit
    # feature list is chosen below.
    df_test, _ = qs.prepare_flux_ratio_catalog(df_test, passband_names)
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s %s' % (df_test.shape, df_train.shape))

    # ----------------------------------------------------------------------
    # Support Vector Regression
    # ----------------------------------------------------------------------
    # Alternative feature sets that were tried:
    # ['SDSS_u', 'SDSS_i', 'SDSS_r', 'SDSS_z', 'SDSS_g', 'WISE_w1', 'WISE_w2']
    # ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz', 'zj', 'jh',
    #  'hks', 'ksw1', 'w1w2']
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    # The training catalog stores its redshift in column 'Z_VI'.
    label = 'Z_VI'

    # NOTE(review): presumably forwarded to scikit-learn's SVR -- confirm in
    # svr.svm_reg_predict. (The previously defined, unused `rand_state`
    # local was dropped: none of these parameters reference it.)
    params = {
        'kernel': 'rbf',
        'C': 1.0,
        'gamma': 0.001,
        'epsilon': 0.2,
        'cache_size': 1200,
    }

    df_test = svr.svm_reg_predict(df_train, df_test, features, label,
                                  params, 'svm_photoz')

    # The test catalog stores the reference redshift in column 'z'.
    ml_an.evaluate_regression(df_test['z'], df_test['svm_photoz'])
    pz_an.plot_redshifts(df_test['z'], df_test['svm_photoz'])
    pz_an.plot_error_hist(df_test['z'], df_test['svm_photoz'])

    plt.show()