Python DataSampling 예제들, data_sampling.DataSampling Python 예제들

예제 #1

0

파일 보기

    def SKRandomForest_Category(self, train_set_x, train_set_y):
        """Fit a scikit-learn random forest classifier and print training stats.

        Args:
            train_set_x: Feature matrix handed to ``fit`` and ``score``.
            train_set_y: Labels handed to ``fit`` and ``score``.

        Returns:
            The fitted ``RandomForestClassifier``.
        """
        print("SKRandomForest_Category ...")
        NUM_TREE = 3000
        # 'sqrt' is what the legacy 'auto' alias resolved to for classifiers;
        # newer scikit-learn releases no longer accept 'auto'.
        NUM_FEATURES = 'sqrt'
        MAX_DEPTH = None
        my_model = RandomForestClassifier(n_estimators=NUM_TREE,
                                          max_features=NUM_FEATURES,
                                          min_samples_split=2,
                                          max_depth=MAX_DEPTH,
                                          oob_score=False)

        ### training
        print("Before fitting")
        my_model.fit(train_set_x, train_set_y)
        # Accuracy on the training data itself, so expect an optimistic value.
        train_score = my_model.score(train_set_x, train_set_y)
        print("After fitting")

        # ``n_features_`` was replaced by ``n_features_in_`` in newer
        # scikit-learn; fall back to the old name only on old installs.
        n_features = getattr(my_model, "n_features_in_", None)
        if n_features is None:
            n_features = my_model.n_features_

        print("train_score = {} ".format(train_score))
        print("feature_importances_ = {} ".format(
            my_model.feature_importances_))
        print("n_features_ =  {} ".format(n_features))

        return my_model

예제 #2

0

파일 보기

파일: CytoOA_training_main.py 프로젝트: myfirstjump/Statistical_Modeling_Template

    def cyto_cnn_training_pipeline(self, data_ary, test_ary):
        """Prepare image arrays and train the CNN via ``DataTrainning.cnn_apply``.

        Both input arrays are shuffled IN PLACE (the caller's arrays are
        reordered), split into features/labels with
        ``DataSampling.get_x_y_from_data_ary`` and reshaped to
        ``(n, 432, 1220, 1)``. ``test_ary`` is then divided into a held-out
        test part and a validation part.

        Args:
            data_ary: Training samples; mutated by the in-place shuffle.
            test_ary: Samples used for validation and testing; mutated by the
                in-place shuffle.

        Returns:
            Whatever ``cnn_apply`` returns (presumably the trained model).
        """
        sampling_obj = DataSampling()
        train_obj = DataTrainning()

        TEST_VALIDATION_SPLIT = 0.33
        IMAGE_SHAPE = (432, 1220, 1)

        #### data preprocessing
        np.random.shuffle(data_ary)
        train_set_x, train_set_y = sampling_obj.get_x_y_from_data_ary(data_ary)
        print('train set shape', train_set_x.shape)
        train_set_x = train_set_x.reshape(train_set_x.shape[0], *IMAGE_SHAPE)

        np.random.shuffle(test_ary)
        test_X_ary, test_Y_ary = sampling_obj.get_x_y_from_data_ary(test_ary)
        test_X_ary = test_X_ary.reshape(test_X_ary.shape[0], *IMAGE_SHAPE)
        # The first/third outputs are the larger (1 - split) share, kept as the
        # held-out test set; the second/fourth are the validation share.
        (test_set_x, valid_set_x, test_set_y,
         valid_set_y) = train_test_split(test_X_ary,
                                         test_Y_ary,
                                         test_size=TEST_VALIDATION_SPLIT)

        # Pass the held-out part only: the full test array also contains the
        # validation rows, so using it as the test set would double-count
        # samples the training loop has already been tuned against.
        my_model = train_obj.cnn_apply(train_set_x, train_set_y, valid_set_x,
                                       valid_set_y, test_set_x, test_set_y)

        return my_model

예제 #3

0

파일 보기

파일: CytoOA_training_main.py 프로젝트: myfirstjump/Statistical_Modeling_Template

    def cyto_ai_training_pipeline_log_df(self, data_df):
        """Cast ``data_df`` to numeric and dump the result to a CSV file.

        The output path comes from ``SysConfig.get_df_output_file()``; the
        conversion is delegated to ``DataProcessing.cast_all_to_numeric``.
        The frame is printed before and after conversion for inspection.

        Args:
            data_df: Input DataFrame.

        Returns:
            The converted DataFrame that was written to disk.
        """
        data_processing_obj = DataProcessing()
        df_output_file = SysConfig().get_df_output_file()

        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print(data_df)
        print("end cast_all_to_numeric.")

        # Persist without the index column so the file holds data columns only.
        data_df.to_csv(df_output_file, index=False)
        print("DF output file = %s" % (df_output_file))

        return data_df

예제 #4

0

파일 보기

파일: CytoOA_training_main.py 프로젝트: myfirstjump/Statistical_Modeling_Template

    def cyto_ai_training_pipeline(self, data_df):
        """Train one random forest on ``data_df`` and print test accuracies.

        Steps: cast everything to numeric, turn the label column into a
        pandas category, split into train/test with
        ``DataSampling.sk_sampling``, fit through
        ``DataTrainning.SKRandomForest_Category`` and print the score on the
        whole test set as well as on the label==1 and label==0 subsets.

        Args:
            data_df: Input DataFrame containing the label column named by
                ``SysConfig.get_y_key()``.

        Returns:
            The numeric-cast DataFrame (not the model).
        """
        processor = DataProcessing()
        sampler = DataSampling()
        config = SysConfig()
        trainer = DataTrainning()

        holdout_ratio = config.get_test_set_ratio()
        y_key = config.get_y_key()

        print(data_df)

        # --- preprocessing ---
        print("start cast_all_to_numeric.")
        data_df = processor.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")

        # Label column becomes categorical before the split.
        data_df[y_key] = data_df[y_key].astype('category')

        # --- train/test split ---
        train_part, test_part = sampler.sk_sampling(data_df, holdout_ratio)

        # Snapshot of the test rows taken before x/y extraction; the
        # per-class subsets below are cut from this snapshot.
        test_snapshot = test_part.copy()

        train_set_x, train_set_y = sampler.get_x_y_from_dataframe(
            train_part, y_key)
        test_set_x, test_set_y = sampler.get_x_y_from_dataframe(
            test_part, y_key)

        # --- per-class test subsets (1 = disease, 0 = normal) ---
        disease_rows = test_snapshot.loc[test_snapshot[y_key] == 1, :]
        normal_rows = test_snapshot.loc[test_snapshot[y_key] == 0, :]

        disease_x, disease_y = sampler.get_x_y_from_dataframe(
            disease_rows, y_key)
        normal_x, normal_y = sampler.get_x_y_from_dataframe(
            normal_rows, y_key)

        disease_count = len(disease_y)
        normal_count = len(normal_y)

        print(
            "######################## test_set_disease_len, test_set_normal_len"
        )
        print("{}, {}".format(disease_count, normal_count))

        # --- training and evaluation ---
        forest = trainer.SKRandomForest_Category(train_set_x, train_set_y)

        overall_score = forest.score(test_set_x, test_set_y)
        disease_score = forest.score(disease_x, disease_y)
        normal_score = forest.score(normal_x, normal_y)

        print("Test score = {}".format(overall_score))
        print("test_set_diseas_score score = {}".format(disease_score))
        print("test_set_normal_score score = {}".format(normal_score))

        return data_df

예제 #5

0

파일 보기

파일: CytoOA_training_main.py 프로젝트: myfirstjump/Statistical_Modeling_Template

    def cyto_ai_balance_training_pipeline(self, data_df):
        """Train a series of random forests on resampled data and log scores.

        For model numbers ``1 .. model_threshold - 1`` the pipeline draws a
        fresh train/test split with
        ``DataSampling.category2_simple_sampling_pipeline``, fits a forest via
        ``DataTrainning.SKRandomForest_Category``, appends one tab-separated
        line (overall, label==1, label==0 test scores) to the log file and
        dumps the model to ``<model_output_dir><n>.pkl`` with ``joblib``.

        Args:
            data_df: Input DataFrame containing the label column named by
                ``SysConfig.get_y_key()``.

        Returns:
            The numeric-cast DataFrame (not the models).
        """
        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        model_output_dir = sys_obj.get_model_output_dir()
        log_file = sys_obj.get_log_file()

        # Model numbers run from 1 up to (but excluding) this value.
        model_threshold = 200
        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")

        ### convert to category
        data_df[y_key] = data_df[y_key].astype('category')

        ### split the full data set by label (1 = disease, 0 = normal)
        disease_df = data_df.loc[data_df[y_key] == 1, :]
        normal_df = data_df.loc[data_df[y_key] == 0, :]

        print("### disease_df")
        print(disease_df)

        print("### normal_df")
        print(normal_df)

        print(len(disease_df), len(normal_df))

        # The context manager guarantees the log is flushed and closed even
        # when sampling, training or dumping raises part-way through the run.
        with open(log_file, 'w') as fh_writer:
            for model_count in range(1, model_threshold):
                (train_set,
                 test_set) = sampling_obj.category2_simple_sampling_pipeline(
                     normal_df, disease_df, y_key, test_set_ratio)

                print("###############################")
                print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                print(model_count)

                # Snapshot taken before x/y extraction; the per-class test
                # subsets are cut from this copy.
                test_set_bak = test_set.copy()

                ### get x,y
                (train_set_x,
                 train_set_y) = sampling_obj.get_x_y_from_dataframe(
                     train_set, y_key)
                (test_set_x, test_set_y) = sampling_obj.get_x_y_from_dataframe(
                    test_set, y_key)

                ### separate by disease
                test_set_disease = test_set_bak.loc[
                    test_set_bak[y_key] == 1, :]
                test_set_normal = test_set_bak.loc[
                    test_set_bak[y_key] == 0, :]

                (test_set_disease_x,
                 test_set_disease_y) = sampling_obj.get_x_y_from_dataframe(
                     test_set_disease, y_key)
                (test_set_normal_x,
                 test_set_normal_y) = sampling_obj.get_x_y_from_dataframe(
                     test_set_normal, y_key)

                test_set_disease_len = len(test_set_disease_y)
                test_set_normal_len = len(test_set_normal_y)

                print(
                    "######################## test_set_disease_len, test_set_normal_len"
                )
                print("{}, {}".format(test_set_disease_len,
                                      test_set_normal_len))

                ### training the model
                my_model = train_obj.SKRandomForest_Category(
                    train_set_x, train_set_y)

                test_score = my_model.score(test_set_x, test_set_y)
                test_set_diseas_score = my_model.score(test_set_disease_x,
                                                       test_set_disease_y)
                test_set_normal_score = my_model.score(test_set_normal_x,
                                                       test_set_normal_y)

                print("Test score = {}".format(test_score))
                print("test_set_diseas_score score = {}".format(
                    test_set_diseas_score))
                print("test_set_normal_score score = {}".format(
                    test_set_normal_score))

                ### log file: overall, disease-only, normal-only scores
                fh_writer.write("{}\t{}\t{}\n".format(test_score,
                                                      test_set_diseas_score,
                                                      test_set_normal_score))

                ### model: the output dir is used as a plain string prefix
                model_file = model_output_dir + str(model_count) + '.pkl'
                joblib.dump(my_model, model_file)

        return data_df

예제 #6

0

파일 보기

파일: CytoOA_training_main.py 프로젝트: myfirstjump/Statistical_Modeling_Template

    def cyto_xgboost_balance_training_pipeline(self, data_df):
        """Run one resampled train/test split through the XGBoost trainer.

        The frame is cast to numeric, split by label value (1 / 0), resampled
        with ``DataSampling.category2_simple_sampling_pipeline`` and handed to
        ``DataTrainning.xgboot_training`` together with the test split. The
        sizes of the label==1 and label==0 test subsets are printed.

        Args:
            data_df: Input DataFrame containing the label column named by
                ``SysConfig.get_y_key()``.

        Returns:
            The numeric-cast DataFrame (the trained model is not returned).
        """
        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        # Only a single model is trained here; the number is printed so the
        # console output matches the multi-model pipelines.
        model_count = 1
        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")

        ### split the full data set by label (1 = disease, 0 = normal)
        disease_df = data_df.loc[data_df[y_key] == 1, :]
        normal_df = data_df.loc[data_df[y_key] == 0, :]

        (train_set,
         test_set) = sampling_obj.category2_simple_sampling_pipeline(
             normal_df, disease_df, y_key, test_set_ratio)

        print("###############################")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(model_count)

        # Snapshot taken before x/y extraction; the per-class test subsets
        # are cut from this copy.
        test_set_bak = test_set.copy()

        ### get x,y
        (train_set_x,
         train_set_y) = sampling_obj.get_x_y_from_dataframe(train_set, y_key)
        (test_set_x,
         test_set_y) = sampling_obj.get_x_y_from_dataframe(test_set, y_key)

        ### separate by disease (only the subset sizes are reported)
        test_set_disease = test_set_bak.loc[test_set_bak[y_key] == 1, :]
        test_set_normal = test_set_bak.loc[test_set_bak[y_key] == 0, :]

        _, test_set_disease_y = sampling_obj.get_x_y_from_dataframe(
            test_set_disease, y_key)
        _, test_set_normal_y = sampling_obj.get_x_y_from_dataframe(
            test_set_normal, y_key)

        test_set_disease_len = len(test_set_disease_y)
        test_set_normal_len = len(test_set_normal_y)

        print(
            "######################## test_set_disease_len, test_set_normal_len"
        )
        print("{}, {}".format(test_set_disease_len, test_set_normal_len))

        ### training the model (evaluation is left to the trainer itself)
        train_obj.xgboot_training(train_set_x, train_set_y, test_set_x,
                                  test_set_y)

        return data_df

예제 #7

0

파일 보기

    def LinearRegressor(self, df_1, df_2, target_name, feature_dict):
        """Train and evaluate a ``tf.estimator.LinearRegressor``.

        The two frames are sampled into train/test sets through
        ``DataSampling.cbc_sampling_pipeline`` (called with
        ``sample_ratio=0.8``), restricted to the columns listed in
        ``feature_dict``, and fed to the estimator through ``self.data_func``
        / ``self.test_func``. The average loss and its square root (RMSE) on
        the test set are printed.

        Args:
            df_1: First input DataFrame passed to the sampling pipeline.
            df_2: Second input DataFrame passed to the sampling pipeline.
            target_name: Target column name forwarded to the sampling
                pipeline.
            feature_dict: Dict with ``'numeric'`` and ``'category'`` keys,
                each holding a list of column names.

        Returns:
            Nothing in the visible code (the snippet ends after the
            prediction generator is created).
        """
        # Earlier hard-coded numeric feature list, kept for reference:
        # train_numeric_list = ["Age", "Blast",
        # 	"WBC", "RBC", "HGB", "HCT", "MCV", "MCH", "MCHC", "RDW", "PLT", "MPV", "NE", "LY",
        # 	"MO", "EO", "BA", "NE_c", "LY_c", "MO_c", "EO_c", "BA_c", "NRBC", "NRBC_c",
        # 	"Left Shift 3"]
        NUM_STEPS = 200
        MINIBATCH_SIZE = 20

        sampling_obj = DataSampling()

        (train_set, train_label, test_set,
         test_label) = sampling_obj.cbc_sampling_pipeline(df_1,
                                                          df_2,
                                                          target_name,
                                                          sample_ratio=0.8)

        # train_label = train_label.to_frame()
        # print(train_label)
        # print(type(train_label))

        ### feature columns
        numeric_fields = feature_dict['numeric']
        category_fields = feature_dict['category']
        all_features = numeric_fields + category_fields

        train_set_feature = train_set.loc[:, all_features]
        train_set_feature = self.data_selection_obj.cast_to_numeric(
            train_set_feature, numeric_fields)
        # NOTE(review): reindex() without arguments does not renumber rows;
        # if a fresh 0..n-1 index was intended, reset_index(drop=True) is the
        # call to use -- confirm against data_func before changing.
        train_set_feature = train_set_feature.reindex()

        test_set_feature = test_set.loc[:, all_features]
        test_set_feature = self.data_selection_obj.cast_to_numeric(
            test_set_feature, numeric_fields)
        test_set_feature = test_set_feature.reindex()

        train_feature_columns = self.build_feature_columns(feature_dict)

        print(train_feature_columns)

        print("train_set_feature = {0}".format(train_set_feature.columns))

        print("train_set_feature len = {0}".format(len(train_set_feature)))
        print("train_label len = {0}".format(len(train_label)))

        # Prints True if any cell of the training features is NaN.
        print("DF NA checking ...")
        print(train_set_feature.isnull().any().any())
        # print(train_set_feature['BA_c'])

        # Variant with an explicit optimizer, currently disabled:
        # reg = tf.estimator.LinearRegressor(
        # 	feature_columns=train_feature_columns,
        # 	optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.1)
        # )

        reg = tf.estimator.LinearRegressor(
            feature_columns=train_feature_columns)

        # Estimators take zero-argument input functions, hence the lambdas.
        train_fn = lambda: self.data_func(
            train_set_feature, train_label, batch_size=MINIBATCH_SIZE)

        test_fn = lambda: self.test_func(test_set_feature, test_label)

        ### training
        reg.train(input_fn=train_fn, steps=NUM_STEPS)

        ## evaluate
        eval_result = reg.evaluate(input_fn=test_fn)
        average_loss = eval_result["average_loss"]
        print("average_loss = {0}".format(average_loss))
        print("\nRMSE for the test set: {:.2f}".format(average_loss**0.5))

        ## prediction on the first 10 test rows, one numpy array per column
        predict_set = dict(test_set_feature.head(10))
        predict_set = {
            key: np.array(value)
            for key, value in predict_set.items()
        }
        predict_input_fn = tf.estimator.inputs.numpy_input_fn(predict_set,
                                                              shuffle=False)
        # NOTE(review): predict_results is never iterated or returned in the
        # visible code, so no prediction is actually consumed here.
        predict_results = reg.predict(input_fn=predict_input_fn)

예제 #8

0

파일 보기

    def RandomForest(self, df_1, df_2, target_name, feature_dict):
        """Fit a ``tf.contrib`` TensorForest regression estimator.

        The two frames get their ``'Blast_count'`` column cast to numeric,
        are sampled into train/test sets through
        ``DataSampling.cbc_sampling_pipeline`` (called with
        ``sample_ratio=0.8``) and restricted to the columns listed in
        ``feature_dict``. Only fitting is performed in the visible code.

        Args:
            df_1: First input DataFrame passed to the sampling pipeline.
            df_2: Second input DataFrame passed to the sampling pipeline.
            target_name: Target column name forwarded to the sampling
                pipeline.
            feature_dict: Dict with ``'numeric'`` and ``'category'`` keys,
                each holding a list of column names.

        Returns:
            Nothing in the visible code.
        """

        # NOTE(review): NUM_STEPS is defined but not used below (fit is
        # called with steps=None).
        NUM_STEPS = 1000
        MINIBATCH_SIZE = 20
        NUM_TREE = 2000
        MAX_NODE = 1000
        NUM_FEATURES = 24
        NUM_CLASS = 1
        sampling_obj = DataSampling()

        ### modify target
        # NOTE(review): the column name is hard-coded rather than taken from
        # target_name -- confirm they are meant to be the same column.
        df_1 = self.data_selection_obj.cast_to_numeric(df_1, ['Blast_count'])
        df_2 = self.data_selection_obj.cast_to_numeric(df_2, ['Blast_count'])

        ### Sampling
        (train_set, train_label, test_set,
         test_label) = sampling_obj.cbc_sampling_pipeline(df_1,
                                                          df_2,
                                                          target_name,
                                                          sample_ratio=0.8)

        ### feature columns
        numeric_fields = feature_dict['numeric']
        category_fields = feature_dict['category']
        all_features = numeric_fields + category_fields

        train_set_feature = train_set.loc[:, all_features]
        train_set_feature = self.data_selection_obj.cast_to_numeric(
            train_set_feature, numeric_fields)
        # NOTE(review): reindex() without arguments does not renumber rows;
        # reset_index(drop=True) may have been intended -- confirm.
        train_set_feature = train_set_feature.reindex()

        # The test features are prepared but not used in the visible code.
        test_set_feature = test_set.loc[:, all_features]
        test_set_feature = self.data_selection_obj.cast_to_numeric(
            test_set_feature, numeric_fields)
        test_set_feature = test_set_feature.reindex()

        train_feature_columns = self.build_feature_columns(feature_dict)

        print(train_feature_columns)
        print(len(train_feature_columns))

        print("train_set_feature = {0}".format(train_set_feature.columns))

        print("train_set_feature len = {0}".format(len(train_set_feature)))
        print("train_label len = {0}".format(len(train_label)))

        # Prints True if any cell of the training features is NaN.
        print("DF NA checking ...")
        print(train_set_feature.isnull().any().any())

        # Forest hyper-parameters; regression=True selects regression mode.
        params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
            num_classes=NUM_CLASS,
            num_features=NUM_FEATURES,
            num_trees=NUM_TREE,
            regression=True,
            max_nodes=MAX_NODE)

        graph_builder_class = tf.contrib.tensor_forest.python.tensor_forest.RandomForestGraphs
        my_model = tf.contrib.tensor_forest.client.random_forest.TensorForestEstimator(
            params, graph_builder_class=graph_builder_class)

        # train_fn = lambda: self.data_func(train_set_feature, train_label, batch_size=MINIBATCH_SIZE)

        # test_fn = lambda: self.test_func(test_set_feature, test_label)

        # Unlike the lambda variant above, the helper is called directly here;
        # presumably it returns an input function itself -- TODO confirm.
        train_fn = self.randomforest_data_func(train_set_feature,
                                               train_label,
                                               batch_size=MINIBATCH_SIZE)
        ### training
        print("Before fitting")
        my_model.fit(input_fn=train_fn, steps=None)
        print("After fitting")

예제 #9

0

파일 보기

    def DNNRegressor(self, df_1, df_2, target_name, feature_dict):
        """Train and evaluate a ``tf.estimator.DNNRegressor``.

        The two frames get their ``'Blast_count'`` column cast to numeric,
        are sampled into train/test sets through
        ``DataSampling.cbc_sampling_pipeline`` (called with
        ``sample_ratio=0.8``), restricted to the columns listed in
        ``feature_dict`` and fed to the estimator through ``self.data_func``
        / ``self.test_func``. The average loss and its square root (RMSE) on
        the test set are printed.

        Args:
            df_1: First input DataFrame passed to the sampling pipeline.
            df_2: Second input DataFrame passed to the sampling pipeline.
            target_name: Target column name forwarded to the sampling
                pipeline.
            feature_dict: Dict with ``'numeric'`` and ``'category'`` keys,
                each holding a list of column names.

        Returns:
            None.
        """

        print(" DNNRegressor ...")
        NUM_STEPS = 1000
        MINIBATCH_SIZE = 20
        sampling_obj = DataSampling()

        ### modify target
        # NOTE(review): the column name is hard-coded rather than taken from
        # target_name -- confirm they are meant to be the same column.
        df_1 = self.data_selection_obj.cast_to_numeric(df_1, ['Blast_count'])
        df_2 = self.data_selection_obj.cast_to_numeric(df_2, ['Blast_count'])

        ### Sampling
        (train_set, train_label, test_set,
         test_label) = sampling_obj.cbc_sampling_pipeline(df_1,
                                                          df_2,
                                                          target_name,
                                                          sample_ratio=0.8)

        ### feature columns
        numeric_fields = feature_dict['numeric']
        category_fields = feature_dict['category']
        all_features = numeric_fields + category_fields

        train_set_feature = train_set.loc[:, all_features]
        train_set_feature = self.data_selection_obj.cast_to_numeric(
            train_set_feature, numeric_fields)
        # NOTE(review): reindex() without arguments does not renumber rows;
        # reset_index(drop=True) may have been intended -- confirm against
        # data_func before changing.
        train_set_feature = train_set_feature.reindex()

        test_set_feature = test_set.loc[:, all_features]
        test_set_feature = self.data_selection_obj.cast_to_numeric(
            test_set_feature, numeric_fields)
        test_set_feature = test_set_feature.reindex()

        train_feature_columns = self.build_feature_columns(feature_dict)

        reg = tf.estimator.DNNRegressor(
            feature_columns=train_feature_columns,
            hidden_units=[30, 30, 30, 20, 10],
            optimizer=tf.train.ProximalAdagradOptimizer(
                learning_rate=0.01, l1_regularization_strength=0.001))

        # Estimators take zero-argument input functions, hence the lambdas.
        train_fn = lambda: self.data_func(
            train_set_feature, train_label, batch_size=MINIBATCH_SIZE)

        test_fn = lambda: self.test_func(test_set_feature, test_label)

        ### training
        reg.train(input_fn=train_fn, steps=NUM_STEPS)

        ## evaluate
        eval_result = reg.evaluate(input_fn=test_fn)
        average_loss = eval_result["average_loss"]
        print("average_loss = {0}".format(average_loss))
        print("\nRMSE for the test set: {:.2f}".format(average_loss**0.5))

예제 #10

0

파일 보기

    def SKRandomForest(self, df_1, df_2, target_name, feature_dict):
        """Fit a scikit-learn random forest regressor and print predictions.

        The two frames get their ``'Blast_count'`` column cast to numeric,
        are sampled into train/test sets through
        ``DataSampling.cbc_sampling_pipeline`` (called with
        ``sample_ratio=0.8``) and restricted to the columns listed in
        ``feature_dict``. After fitting, feature importances, the
        out-of-bag score and the training score are printed, followed by one
        "real,predict" line per test sample.

        Args:
            df_1: First input DataFrame passed to the sampling pipeline.
            df_2: Second input DataFrame passed to the sampling pipeline.
            target_name: Target column name forwarded to the sampling
                pipeline.
            feature_dict: Dict with ``'numeric'`` and ``'category'`` keys,
                each holding a list of column names.

        Returns:
            None.
        """

        print("SKRandomForest ...")
        NUM_TREE = 2000
        # Features considered per split; scikit-learn rejects values larger
        # than the number of input columns.
        NUM_FEATURES = 24
        MAX_DEPTH = None
        sampling_obj = DataSampling()

        ### modify target
        # NOTE(review): the column name is hard-coded rather than taken from
        # target_name -- confirm they are meant to be the same column.
        df_1 = self.data_selection_obj.cast_to_numeric(df_1, ['Blast_count'])
        df_2 = self.data_selection_obj.cast_to_numeric(df_2, ['Blast_count'])

        ### Sampling
        (train_set, train_label, test_set,
         test_label) = sampling_obj.cbc_sampling_pipeline(df_1,
                                                          df_2,
                                                          target_name,
                                                          sample_ratio=0.8)

        ### feature columns
        numeric_fields = feature_dict['numeric']
        category_fields = feature_dict['category']
        all_features = numeric_fields + category_fields

        train_set_feature = train_set.loc[:, all_features]
        train_set_feature = self.data_selection_obj.cast_to_numeric(
            train_set_feature, numeric_fields)
        # NOTE(review): reindex() without arguments does not renumber rows;
        # reset_index(drop=True) may have been intended -- confirm.
        train_set_feature = train_set_feature.reindex()

        test_set_feature = test_set.loc[:, all_features]
        test_set_feature = self.data_selection_obj.cast_to_numeric(
            test_set_feature, numeric_fields)
        test_set_feature = test_set_feature.reindex()

        my_model = RandomForestRegressor(n_estimators=NUM_TREE,
                                         max_features=NUM_FEATURES,
                                         min_samples_split=2,
                                         max_depth=MAX_DEPTH,
                                         oob_score=True)

        ### training
        print("Before fitting")
        my_model.fit(train_set_feature, train_label)
        # Score on the training data itself, so expect an optimistic value.
        train_score = my_model.score(train_set_feature, train_label)
        print("After fitting")

        # ``n_features_`` was replaced by ``n_features_in_`` in newer
        # scikit-learn; fall back to the old name only on old installs.
        n_features = getattr(my_model, "n_features_in_", None)
        if n_features is None:
            n_features = my_model.n_features_

        print("feature_importances_ = {} ".format(
            my_model.feature_importances_))
        print("n_features_ =  {} ".format(n_features))
        print("oob_score_ = {} ".format(my_model.oob_score_))
        print("train_score = {} ".format(train_score))

        ### prediction
        test_prediction_result = my_model.predict(test_set_feature)

        for real_value, predicted_value in zip(test_label,
                                               test_prediction_result):
            print("real,predict = {},{} ".format(real_value, predicted_value))