def simqso_predict_dr7dr12():
    """Predict DR7/DR12 quasar redshifts from a simulated-quasar training set.

    Reads the observed DR7DR12Q flux catalog (test set) and the simulated
    bright-quasar catalog (training set), builds flux-ratio features for
    both with ``qs.prepare_flux_ratio_catalog``, runs
    ``rf.rf_reg_predict`` with the simulated redshift column ``'z'`` as the
    label, and then compares the ``'rf_photoz'`` column against ``'Z_VI'``
    through the ``ml_an`` / ``pz_an`` helpers before calling ``plt.show()``.

    Takes no arguments and returns nothing.
    """
    # ----------------------------------------------------------------------
    # Feature matrix
    # ----------------------------------------------------------------------
    observed = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    simulated = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Bands currently in use. Disabled alternatives:
    # 'TMASS_j', 'TMASS_h', 'TMASS_ks', 'WISE_w3'.
    bands = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Restrict the observed sample in redshift. Disabled alternative cuts:
    #   simulated.query('obsMag_SDSS_i <= 18.5', inplace=True)
    #   observed.query('SDSS_mag_i <= 18.5', inplace=True)
    #   simulated.query('z > 1.1', inplace=True)
    observed = observed.query('0.3 < Z_VI < 5.5')

    # Give the simulated flux / flux-error columns the same names the
    # observed catalog uses, in a single rename call.
    column_map = {}
    for band in bands:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    simulated.rename(columns=column_map, inplace=True)

    # Infinite values are treated as missing.
    observed.replace(np.inf, np.nan, inplace=True)
    simulated.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is not used; an explicit
    # list is chosen below.
    observed, _ = qs.prepare_flux_ratio_catalog(observed, bands)
    simulated, _ = qs.prepare_flux_ratio_catalog(simulated, bands)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("{} {}".format(observed.shape, simulated.shape))

    # ----------------------------------------------------------------------
    # Random forest regression
    # ----------------------------------------------------------------------
    # Disabled alternative feature sets:
    #   ['SDSS_u','SDSS_i','SDSS_r','SDSS_z','SDSS_g','WISE_w1','WISE_w2']
    #   ['SDSS_i','WISE_w1','ug','gr','ri','iz','zw1','w1w2']
    #   ['SDSS_i','WISE_w1','TMASS_j','ug','gr','ri','iz','zj','jh',
    #    'hks','ksw1','w1w2']
    flux_ratio_features = ['ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    redshift_label = 'z'
    seed = 1

    forest_params = dict(
        n_estimators=300,
        max_depth=30,
        min_samples_split=4,
        n_jobs=2,
        random_state=seed,
    )

    observed = rf.rf_reg_predict(simulated, observed, flux_ratio_features,
                                 redshift_label, forest_params, 'rf_photoz')

    # Compare the catalog redshift with the random forest estimate.
    z_true = observed['Z_VI']
    z_pred = observed['rf_photoz']

    ml_an.evaluate_regression(z_true, z_pred)
    pz_an.plot_redshifts(z_true, z_pred)
    pz_an.plot_error_hist(z_true, z_pred)

    plt.show()
def _rf_emp_photoz(df_pred):
    """Run the empirical random forest redshift regression for df_pred.

    Trains on the DR7DR12Q flux catalog (0 < Z_VI < 10, SDSS_mag_i <= 18.5)
    with 'Z_VI' as the regression label and returns whatever
    ``rf_reg.rf_reg_predict`` returns for ``df_pred`` (the last argument,
    'rf_emp_photoz', is presumably the name of the prediction column).
    """
    # Disabled alternative bands: 'TMASS_j', 'TMASS_h', 'TMASS_k', 'WISE_w3'.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    # Infinite values are treated as missing.
    df_train.replace(np.inf, np.nan, inplace=True)
    df_train = df_train.query('0 < Z_VI < 10 and SDSS_mag_i <= 18.5')

    # The feature list returned by the helper is not used; an explicit
    # list is chosen below.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    # The training catalog stores its redshift in 'Z_VI' (see the query
    # above). The original code passed a not-yet-assigned local here,
    # which raised UnboundLocalError.
    label = 'Z_VI'
    params = {
        'n_estimators': 200,
        'max_depth': 25,
        'min_samples_split': 2,
        'n_jobs': 4,
        'random_state': 1,
    }

    return rf_reg.rf_reg_predict(df_train, df_pred, features, label,
                                 params, 'rf_emp_photoz')


def _rf_emp_classify(df_pred):
    """Run the empirical random forest quasar/star classification for df_pred.

    Trains on the DR13 star and DR7DR12Q quasar flux catalogs
    (SDSS_mag_i <= 18.5) using the 'mult_class_true' labels, then writes
    two columns into ``df_pred``:

    * 'rf_emp_mult_class_pred' -- the predicted multi-class label
    * 'rf_emp_bin_class_pred'  -- 'QSO' if the multi-class label is one of
      'vlowz', 'lowz', 'midz', 'highz', otherwise 'STAR'

    Returns the modified ``df_pred``.
    """
    # Disabled alternative bands: 'TMASS_j', 'TMASS_h', 'TMASS_k'.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    df_stars = df_stars.query('SDSS_mag_i <= 18.5')
    df_quasars = df_quasars.query('SDSS_mag_i <= 18.5')

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("{} {}".format("Stars: ", df_stars.shape))
    print("{} {}".format("Quasars: ", df_quasars.shape))

    # Detailed (multi-class) labels.
    # NOTE(review): 'z' is passed for the DR7DR12Q quasars although the
    # rest of this file reads their redshift from 'Z_VI' -- confirm which
    # column qs.create_qso_labels expects here.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'z')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Binary labels.
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Combined training set (the original referenced an undefined
    # 'df_star' here).
    df_train = pd.concat([df_stars, df_quasars])

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'mult_class_true'
    rand_state = 1
    params = {
        'n_estimators': 300,
        'max_depth': 25,
        'min_samples_split': 4,
        'n_jobs': 4,
        'random_state': 1,
    }

    # NOTE(review): rf_class_predict is called unqualified, unlike
    # rf_reg.rf_reg_predict -- confirm it is imported under this name.
    # The fitted classifier is returned as well but is not needed here.
    _, y_pred = rf_class_predict(df_train, df_pred, features, label,
                                 params, rand_state)

    # Store the multi-class prediction under the same column name that the
    # binary mapping below reads (the original wrote
    # 'rf_emp_mult_label_pred' but queried 'rf_emp_mult_class_pred').
    df_pred['rf_emp_mult_class_pred'] = y_pred

    # Everything is a star unless it was assigned a quasar redshift class.
    df_pred['rf_emp_bin_class_pred'] = 'STAR'
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    is_qso = df_pred['rf_emp_mult_class_pred'].isin(qso_classes)
    df_pred.loc[is_qso, 'rf_emp_bin_class_pred'] = 'QSO'

    return df_pred


def rf_full_emp(df_pred):
    """Add empirical random forest photo-z and class predictions to df_pred.

    Two stages are run in order, both trained on empirical catalogs read
    from '../class_photoz/data/':

    1. Photometric redshift regression (see ``_rf_emp_photoz``).
    2. Quasar/star classification (see ``_rf_emp_classify``), which adds
       'rf_emp_mult_class_pred' and 'rf_emp_bin_class_pred'.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Catalog to predict on. It must provide the feature columns
        'SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2' that
        are handed to the prediction helpers.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the regression stage with the two
        classification columns added.
    """
    # ----------------------------------------------------------------------
    # PHOTOMETRIC REDSHIFT ESTIMATION
    # ----------------------------------------------------------------------
    df_pred = _rf_emp_photoz(df_pred)

    # ----------------------------------------------------------------------
    # QSO-STAR-CLASSIFICATION
    # ----------------------------------------------------------------------
    return _rf_emp_classify(df_pred)