def full_test():
    """Run the random forest star/quasar classification on the full catalogs.

    Reads the DR13 star and DR7DR12Q quasar flux catalogs, passes both
    through ``qs.prepare_flux_ratio_catalog``, keeps objects with
    ``SDSS_mag_i <= 21.5``, attaches multi-class and binary class columns
    and builds the training/prediction sets with ``qs.make_train_pred_set``.
    The multi-class labels are then classified with
    ``rf_class.rf_class_example``; the result is collapsed to a binary
    QSO/STAR classification, evaluated with
    ``pf_an.classification_analysis`` and written to
    ``fitted_classes.hdf5`` in the current working directory.
    """
    # ----------------------------------------------------------------------
    # Loading and preparing the data files
    # ----------------------------------------------------------------------
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    # Alternative inputs kept for reference:
    # df_quasars = pd.read_hdf(
    #     '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')
    # df_stars.drop(df_stars.query('star_class == "null"').index,
    #               inplace=True)

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        # 'TMASS_j',
        # 'TMASS_h',
        # 'TMASS_k',
        'WISE_w1',
        'WISE_w2',
    ]

    # TODO: embed this in the sim qso conversion file!
    # Map the simulated-catalog column names onto the plain passband names.
    # One combined mapping is equivalent to renaming band by band.
    column_map = {}
    for name in passband_names:
        column_map['obsFlux_' + name] = name
        column_map['obsFluxErr_' + name] = 'sigma_' + name
    df_quasars.rename(columns=column_map, inplace=True)

    # The feature list returned here is not used; an explicit list is
    # chosen further down.
    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    # Introducing selection criteria
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)
    # df_quasars.query('obsMag_SDSS_i <= 18.5', inplace=True)

    # Single-argument print calls: same output as the former Python 2
    # print statements, and valid syntax on Python 3 as well.
    print("Stars:  {}".format(df_stars.shape))
    print("Quasars:  {}".format(df_quasars.shape))

    # ----------------------------------------------------------------------
    # Preparing test and training sets
    # ----------------------------------------------------------------------
    rand_state = 1

    # Create detailed classes
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(
        df_stars, df_quasars, 0.2, rand_state=rand_state)

    print("{} {}".format(df_train.shape, df_pred.shape))

    # ----------------------------------------------------------------------
    # Running the Random Forest method
    # ----------------------------------------------------------------------
    # Alternative feature sets:
    # features = ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz',
    #             'zj', 'jh', 'hk', 'kw1', 'w1w2']
    # features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']

    label = 'mult_class_true'

    params = {
        'n_estimators': 300,
        'max_depth': 20,
        'min_samples_split': 2,
        'n_jobs': 2,
        'random_state': 1,
    }

    print(features)
    print(params)

    y_true, y_pred, df_prob = rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)

    # ----------------------------------------------------------------------
    # Additional analysis
    # ----------------------------------------------------------------------
    df = pd.DataFrame({'mult_class_true': y_true, 'mult_class_pred': y_pred})

    # Every object starts out as a star; the multi-class labels listed
    # below are the ones collapsed to 'QSO'.
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    df['bin_class_pred'] = 'STAR'
    df['bin_class_true'] = 'STAR'

    # Boolean masks touch exactly the matching rows. Selecting through the
    # index labels of a query result would also relabel other rows sharing
    # a label whenever the index is not unique.
    df.loc[df['mult_class_pred'].isin(qso_classes), 'bin_class_pred'] = 'QSO'
    df.loc[df['mult_class_true'].isin(qso_classes), 'bin_class_true'] = 'QSO'

    labels = ('QSO', 'STAR')
    y_true = df.bin_class_true.values
    y_pred = df.bin_class_pred.values

    pf_an.classification_analysis(y_true, y_pred, labels)

    df.to_hdf('fitted_classes.hdf5', 'data')
def sim_test_full_fit():
    """Run the photometric fitting on DR13 stars and simulated quasars.

    Reads the DR13 star catalog and the simulated bright-quasar catalog,
    applies the i-band magnitude cuts, attaches the true binary and
    multi-class labels and builds separate star/quasar training sets plus a
    test set with ``qs.make_train_pred_set``. The test set is then passed
    through ``photoz_fit`` and ``star_fit``, classified via
    ``pf_an.set_redshift_classes`` / ``pf_an.set_pred_classes``, written to
    ``photofit_SDSSW1W2_bin50_sim_i18_5.hdf5`` and handed to
    ``full_analysis_sim``.
    """
    # Column names and fit configuration
    z_label = 'z'
    star_label = 'class_label'
    rand_state = 1
    params = {
        'binning': 'minimum',
        'bin_param': 50,
        'model_type': 'median',
    }

    # Passbands and other column names for the model file
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        # 'TMASS_j',
        # 'TMASS_h',
        # 'TMASS_k',
        'WISE_w1', 'WISE_w2',
    ]

    # Catalog from which the star model is made; rows whose star_class is
    # the string "null" are removed.
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    null_rows = df_stars.query('star_class == "null"').index
    df_stars.drop(null_rows, inplace=True)

    # Catalog from which the quasar model is made
    df_qsos = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Magnitude cuts.
    # NOTE(review): quasars use "<=" while stars use "<" - confirm whether
    # the difference is intended.
    df_qsos.query('obsMag_SDSS_i <= 18.5', inplace=True)
    df_stars = df_stars.query('SDSS_mag_i < 18.5')

    df_stars = qs.create_star_labels(df_stars, star_label, 'star_class')

    # Binary and multi-class columns for the evaluation routines
    df_stars['bin_class_true'] = 'STAR'
    df_stars['mult_class_true'] = df_stars[star_label]
    df_qsos['bin_class_true'] = 'QSO'
    df_qsos = pf_an.set_redshift_classes(df_qsos, z_label, 'mult_class_true')

    # TODO: embed this in the sim qso conversion file!
    # Map the simulated-catalog column names onto the plain passband names.
    flux_columns = {}
    for band in passband_names:
        flux_columns['obsFlux_' + band] = band
        flux_columns['obsFluxErr_' + band] = 'sigma_' + band
    df_qsos.rename(columns=flux_columns, inplace=True)

    df_stars, features = qs.prepare_flux_ratio_catalog(
        df_stars, passband_names, sigma=True)
    df_qsos, features = qs.prepare_flux_ratio_catalog(
        df_qsos, passband_names, sigma=True)

    df_train_stars, df_train_qsos, df_test = qs.make_train_pred_set(
        df_stars, df_qsos, 0.2, rand_state, 'SDSSW1W2_sim_i18_5_',
        concat=False, save=True)

    # Class populations of the three sets
    print(df_train_stars.mult_class_true.value_counts())
    print(df_train_qsos.mult_class_true.value_counts())
    print(df_test.mult_class_true.value_counts())

    # Only the updated test frame is used below; the remaining return
    # values of both fits are not needed here.
    df_test, qso_prob, qso_chisq = photoz_fit(
        df_train_qsos, df_test, features, z_label, params)
    df_test, star_prob, star_chisq, star_model = star_fit(
        df_train_stars, df_test, features, star_label, params)

    # Classify the test set according to the lowest chi-squared value
    df_test = pf_an.set_redshift_classes(df_test, 'pf_photoz', 'qso_class')
    df_test = pf_an.set_pred_classes(df_test)

    df_test.to_hdf('photofit_SDSSW1W2_bin50_sim_i18_5.hdf5', 'data')

    full_analysis_sim(df_test)
def _make_grid_search_sets(df_stars, df_quasars, passband_names, rand_state,
                           *extra_args):
    """Build the training and prediction sets for one passband selection.

    Works on deep copies so the caller's catalogs stay untouched, passes
    both copies through ``qs.prepare_flux_ratio_catalog`` and returns
    whatever ``qs.make_train_pred_set`` returns for a 0.2 split parameter.
    Any ``extra_args`` are forwarded positionally after ``rand_state``.
    """
    df_stars_train, _ = qs.prepare_flux_ratio_catalog(
        df_stars.copy(deep=True), passband_names)
    df_qsos_train, _ = qs.prepare_flux_ratio_catalog(
        df_quasars.copy(deep=True), passband_names)

    return qs.make_train_pred_set(
        df_stars_train, df_qsos_train, 0.2, rand_state, *extra_args)


def dr7dr12q_grid_search():
    """Prepare the DR13 star / DR7DR12Q quasar sets for the grid search.

    Reads both flux catalogs, drops stars without a ``star_class``, keeps
    objects with ``SDSS_mag_i <= 21.5``, attaches the binary (``label``) and
    multi-class (``class_label``) columns and then builds training and
    prediction sets for three passband selections: SDSS only, SDSS + WISE,
    and SDSS + 2MASS + WISE.

    The ``rf_class.rf_class_grid_search`` calls are disabled, exactly as
    they were before this function was restructured; re-enable them to run
    the actual search.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    df_stars.dropna(subset=['star_class'], inplace=True)

    # Only referenced by the disabled grid-search calls below.
    param_grid = [{
        'n_estimators': [100, 200, 300],
        'min_samples_split': [2, 3, 4],
        'max_depth': [15, 20, 25],
    }]
    # param_grid = [{'n_estimators': [100], 'min_samples_split': [2],
    #                'max_depth': [20]}]

    rand_state = 1
    scores = ['f1_weighted']

    # Choose label: 'label' = 2 classes, 'class_label' = multiple classes
    label = 'class_label'

    # Restrict the data set
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Create basic classes
    df_quasars['label'] = 'QSO'
    df_stars['label'] = 'STAR'

    # Create more detailed classes
    df_quasars = qs.create_qso_labels(df_quasars, 'class_label', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'class_label', 'star_class')

    # FOR TESTING PURPOSES
    # df_stars = df_stars.sample(frac=0.2)
    # df_quasars = df_quasars.sample(frac=0.2)

    sdss_bands = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    tmass_bands = ['TMASS_j', 'TMASS_h', 'TMASS_k']
    wise_bands = ['WISE_w1', 'WISE_w2']

    # ----------------------------------------------------------------------
    # Training set 1: SDSS passbands only
    # ----------------------------------------------------------------------
    passband_names = sdss_bands
    # NOTE(review): 'i19_5_' is forwarded as an extra positional argument
    # although the magnitude cut above is 21.5 - confirm its meaning against
    # qs.make_train_pred_set.
    df_train, df_pred = _make_grid_search_sets(
        df_stars, df_quasars, passband_names, rand_state, 'i19_5_')

    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']

    # Random Forest classification grid search (disabled)
    # rf_class.rf_class_grid_search(df_train, df_pred, features, label,
    #                               param_grid, rand_state, scores, 'test')

    # ----------------------------------------------------------------------
    # Training set 2: SDSS + WISE passbands
    # ----------------------------------------------------------------------
    passband_names = sdss_bands + wise_bands
    df_train, df_pred = _make_grid_search_sets(
        df_stars, df_quasars, passband_names, rand_state)

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    # Single-argument print call: same output as the former Python 2 print
    # statement, and valid syntax on Python 3 as well.
    print("{} {}".format(df_train.shape, df_pred.shape))

    # Random Forest classification grid search (disabled)
    # rf_class.rf_class_grid_search(df_train, df_pred, features, label,
    #                               param_grid, rand_state, scores, 'test')

    # ----------------------------------------------------------------------
    # Training set 3: SDSS + 2MASS + WISE passbands
    # ----------------------------------------------------------------------
    passband_names = sdss_bands + tmass_bands + wise_bands
    df_train, df_pred = _make_grid_search_sets(
        df_stars, df_quasars, passband_names, rand_state)

    # No grid-search call followed this feature set in the original either.
    features = ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz',
                'zj', 'jh', 'hk', 'kw1', 'w1w2']
def test_example():
    """Run the random forest classification example on SDSS + WISE features.

    Reads the DR13 star and DR7DR12Q quasar flux catalogs, passes both
    through ``qs.prepare_flux_ratio_catalog``, keeps objects with
    ``SDSS_mag_i <= 21.5``, attaches multi-class and binary class columns,
    builds the training/prediction sets with ``qs.make_train_pred_set`` and
    classifies the multi-class labels with ``rf_class.rf_class_example``.
    """
    rand_state = 1

    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        # 'TMASS_j',
        # 'TMASS_h',
        # 'TMASS_k',
        'WISE_w1',
        'WISE_w2',
        # 'WISE_w3',
        # 'WISE_w4',
    ]

    # The feature list returned here is not used; an explicit list is
    # chosen further down.
    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    # Reduce the total set of objects for testing the routines
    # df_stars = df_stars.sample(frac=0.2)
    # df_quasars = df_quasars.sample(frac=0.2)

    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Single-argument print calls: same output as the former Python 2
    # print statements, and valid syntax on Python 3 as well.
    print("Stars:  {}".format(df_stars.shape))
    print("Quasars:  {}".format(df_quasars.shape))

    # Create detailed classes
    # NOTE(review): the other routines in this file that read the DR7DR12Q
    # catalog pass 'Z_VI' as the redshift column; confirm that 'z' exists
    # in this catalog.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'z')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(
        df_stars, df_quasars, 0.2, rand_state=rand_state)

    # Alternative feature sets:
    # features = ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz',
    #             'zj', 'jh', 'hk', 'kw1', 'w1w2']
    # features = ['SDSS_i', 'TMASS_j', 'ug', 'gr', 'ri', 'iz', 'zj', 'jh',
    #             'hk']
    # features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    label = 'mult_class_true'

    params = {
        'n_estimators': 300,
        'max_depth': 25,
        'min_samples_split': 3,
        'n_jobs': 2,
        'random_state': 1,
    }

    y_true, y_pred, df_prob = rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)