Exemplo n.º 1
0
def simqso_predict_dr7dr12():
    """Train a random forest photo-z regressor and test it on DR7/DR12 quasars.

    The training set is read from ``brightqsos_sim_2k_new.hdf5`` (regression
    label column ``z``) and the test set from
    ``DR7DR12Q_clean_flux_cat.hdf5``, restricted to ``0.3 < Z_VI < 5.5``.
    Both catalogs are passed through ``qs.prepare_flux_ratio_catalog`` and the
    regressor is run on the flux-ratio features only.  The predictions are
    stored in the ``rf_photoz`` column of the test frame and compared against
    ``Z_VI`` with the evaluation and plotting helpers.

    The function takes no arguments and returns nothing; it prints the
    catalog shapes and opens the matplotlib figures via ``plt.show()``.
    """
    # --------------------------------------------------------------------------
    # Preparing the feature matrix
    # --------------------------------------------------------------------------
    df_test = pd.read_hdf('../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5',
                          'data')
    df_train = pd.read_hdf('../class_photoz/data/brightqsos_sim_2k_new.hdf5',
                           'data')

    # Pass bands handed to the flux-ratio preparation.  Currently disabled
    # bands: 'TMASS_j', 'TMASS_h', 'TMASS_ks', 'WISE_w3'.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    df_test = df_test.query('0.3 < Z_VI < 5.5')
    # Optional cuts, currently disabled:
    # df_train.query('obsMag_SDSS_i <= 18.5',inplace=True)
    # df_test.query('SDSS_mag_i <= 18.5',inplace=True)
    # df_train.query('z > 1.1',inplace=True)

    # Give the training catalog the same flux / flux-error column names that
    # are passed as pass band names for the test catalog.
    column_map = {}
    for name in passband_names:
        column_map['obsFlux_' + name] = name
        column_map['obsFluxErr_' + name] = 'sigma_' + name
    df_train.rename(columns=column_map, inplace=True)

    # Assign instead of replacing in place: df_test is the result of a
    # query(), and in-place modification of such a frame can trigger pandas'
    # SettingWithCopyWarning.
    df_test = df_test.replace(np.inf, np.nan)
    df_train = df_train.replace(np.inf, np.nan)

    # The feature list returned by the helper is not used; an explicit
    # feature selection is made below.
    df_test, _ = qs.prepare_flux_ratio_catalog(df_test, passband_names)
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # Single-argument print: same output on Python 2 and Python 3.
    print("%s %s" % (df_test.shape, df_train.shape))

    # --------------------------------------------------------------------------
    # Random Forest Regression
    # --------------------------------------------------------------------------

    # Alternative feature sets that were tried:
    # features = ['SDSS_u','SDSS_i','SDSS_r','SDSS_z','SDSS_g','WISE_w1','WISE_w2']
    # features = ['SDSS_i','WISE_w1','ug','gr','ri','iz','zw1','w1w2']
    # features = ['SDSS_i','WISE_w1','TMASS_j','ug','gr','ri','iz','zj','jh', 'hks', 'ksw1', 'w1w2']
    features = ['ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'z'
    rand_state = 1

    params = {
        'n_estimators': 300,
        'max_depth': 30,
        'min_samples_split': 4,
        'n_jobs': 2,
        'random_state': rand_state,
    }

    df_test = rf.rf_reg_predict(df_train, df_test, features, label, params,
                                'rf_photoz')

    # Compare the predicted redshifts against the catalog values.
    z_true = df_test['Z_VI']
    z_pred = df_test['rf_photoz']
    ml_an.evaluate_regression(z_true, z_pred)
    pz_an.plot_redshifts(z_true, z_pred)
    pz_an.plot_error_hist(z_true, z_pred)
    plt.show()
def rf_full_emp(df_pred):
    """Add empirical random forest photo-z and class predictions to a catalog.

    Two random forest models are trained on the empirical catalogs stored
    under ``../class_photoz/data`` and applied to ``df_pred``:

    1. A regressor trained on ``DR7DR12Q_clean_flux_cat.hdf5`` (restricted to
       ``0 < Z_VI < 10`` and ``SDSS_mag_i <= 18.5``), run with the output
       name ``'rf_emp_photoz'``.
    2. A multi-class classifier trained on the concatenation of
       ``DR13_stars_clean_flux_cat.hdf5`` and the quasar catalog (both cut at
       ``SDSS_mag_i <= 18.5``) using the ``mult_class_true`` label.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Catalog to predict on.  It must contain the feature columns
        ``SDSS_i``, ``WISE_w1``, ``ug``, ``gr``, ``ri``, ``iz``, ``zw1`` and
        ``w1w2``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the regression helper with two added columns:
        ``rf_emp_mult_class_pred`` (predicted multi-class label) and
        ``rf_emp_bin_class_pred`` (``'QSO'`` or ``'STAR'``).
    """
    quasar_catalog = '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5'
    star_catalog = '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5'

    # Pass bands handed to the flux-ratio preparation.  Currently disabled
    # bands: 'TMASS_j', 'TMASS_h', 'TMASS_k', 'WISE_w3'.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # The same features are used for the regression and the classification.
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    rand_state = 1

    # --------------------------------------------------------------------------
    # PHOTOMETRIC REDSHIFT ESTIMATION
    # --------------------------------------------------------------------------

    # Preparing the feature matrix
    df_train = pd.read_hdf(quasar_catalog, 'data')
    df_train = df_train.replace(np.inf, np.nan)
    df_train = df_train.query('0 < Z_VI < 10 and SDSS_mag_i <= 18.5')
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # Random Forest Regression
    reg_params = {
        'n_estimators': 200,
        'max_depth': 25,
        'min_samples_split': 2,
        'n_jobs': 4,
        'random_state': rand_state,
    }
    # The regression label used to be read from a local variable that was only
    # assigned further down, which raised UnboundLocalError.  Z_VI is the
    # redshift column this training set is filtered on above.
    # NOTE(review): confirm Z_VI is the intended regression target.
    reg_label = 'Z_VI'

    df_pred = rf_reg.rf_reg_predict(df_train, df_pred, features, reg_label,
                                    reg_params, 'rf_emp_photoz')

    # --------------------------------------------------------------------------
    # QSO-STAR-CLASSIFICATION
    # --------------------------------------------------------------------------

    # Loading and preparing the data files
    df_stars = pd.read_hdf(star_catalog, 'data')
    df_quasars = pd.read_hdf(quasar_catalog, 'data')

    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    df_stars.query('SDSS_mag_i <= 18.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 18.5', inplace=True)

    # Single-argument prints: same output on Python 2 and Python 3.
    print("Stars:  %s" % (df_stars.shape,))
    print("Quasars:  %s" % (df_quasars.shape,))

    # Preparing the training set

    # Create detailed classes
    # NOTE(review): the quasar labels are built from column 'z' although this
    # catalog is filtered on 'Z_VI' elsewhere in this function -- confirm the
    # catalog provides a 'z' column.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'z')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true', 'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Concatenate training set
    df_train = pd.concat([df_stars, df_quasars])

    # Running the Random Forest method
    class_label = 'mult_class_true'
    class_params = {
        'n_estimators': 300,
        'max_depth': 25,
        'min_samples_split': 4,
        'n_jobs': 4,
        'random_state': 1,
    }

    # NOTE(review): rf_class_predict is called unqualified, unlike
    # rf_reg.rf_reg_predict above -- confirm it is imported by name.
    clf, y_pred = rf_class_predict(df_train, df_pred, features, class_label,
                                   class_params, rand_state)

    # Store the multi-class prediction under the same column name that the
    # binary classification below reads.
    df_pred['rf_emp_mult_class_pred'] = y_pred

    # Every object is a star unless its multi-class label is one of the
    # quasar classes.
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    is_qso = df_pred['rf_emp_mult_class_pred'].isin(qso_classes)
    df_pred['rf_emp_bin_class_pred'] = 'STAR'
    df_pred.loc[is_qso, 'rf_emp_bin_class_pred'] = 'QSO'

    return df_pred