def SKRandomForest_Category(self, train_set_x, train_set_y):
    """Fit a scikit-learn random-forest classifier and print training diagnostics.

    Args:
        train_set_x: Feature matrix handed to ``fit`` and ``score``.
        train_set_y: Class labels aligned with ``train_set_x``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("SKRandomForest_Category ...")

    NUM_TREE = 3000
    # 'sqrt' is exactly what the legacy 'auto' alias meant for classifiers;
    # 'auto' was deprecated and later removed from scikit-learn, while
    # 'sqrt' is accepted by old and new releases alike.
    NUM_FEATURES = 'sqrt'
    MAX_DEPTH = None

    my_model = RandomForestClassifier(n_estimators=NUM_TREE,
                                      max_features=NUM_FEATURES,
                                      min_samples_split=2,
                                      max_depth=MAX_DEPTH,
                                      oob_score=False)

    ### training
    print("Before fitting")
    my_model.fit(train_set_x, train_set_y)
    train_score = my_model.score(train_set_x, train_set_y)
    print("After fitting")

    # Newer scikit-learn exposes the feature count as ``n_features_in_``;
    # fall back to the legacy ``n_features_`` attribute on old releases.
    n_features = getattr(my_model, "n_features_in_", None)
    if n_features is None:
        n_features = my_model.n_features_

    print("train_score = {} ".format(train_score))
    print("feature_importances_ = {} ".format(
        my_model.feature_importances_))
    print("n_features_ = {} ".format(n_features))

    return my_model
def cyto_cnn_training_pipeline(self, data_ary, test_ary):
    """Shuffle, reshape and split the arrays, then train the CNN.

    Both input arrays are shuffled in place with ``np.random.shuffle``.
    The feature arrays are reshaped to ``(n_samples, 432, 1220, 1)``.

    Args:
        data_ary: Training data array; split into x/y by
            ``DataSampling.get_x_y_from_data_ary``.
        test_ary: Test data array; split the same way, then partitioned
            into test and validation parts.

    Returns:
        Whatever ``DataTrainning.cnn_apply`` returns.
    """
    IMAGE_ROWS = 432
    IMAGE_COLS = 1220
    TEST_VALIDATION_SPLIT = 0.33

    # Helper objects are constructed in the same order as before; the
    # config and processing objects are not used further in this method.
    sampler = DataSampling()
    config = SysConfig()
    trainer = DataTrainning()
    processor = DataProcessing()

    #### training data
    np.random.shuffle(data_ary)
    train_x, train_y = sampler.get_x_y_from_data_ary(data_ary)
    print('train set shape', train_x.shape)
    train_x = train_x.reshape(train_x.shape[0], IMAGE_ROWS, IMAGE_COLS, 1)

    #### test data
    np.random.shuffle(test_ary)
    all_test_x, all_test_y = sampler.get_x_y_from_data_ary(test_ary)
    all_test_x = all_test_x.reshape(all_test_x.shape[0], IMAGE_ROWS,
                                    IMAGE_COLS, 1)

    split = train_test_split(all_test_x, all_test_y,
                             test_size=TEST_VALIDATION_SPLIT)
    held_out_x, valid_x, held_out_y, valid_y = split

    # NOTE(review): the full test arrays (which still contain the
    # validation rows) are passed as the test set, while the held-out
    # split above is unused -- confirm this is intentional.
    return trainer.cnn_apply(train_x, train_y, valid_x, valid_y,
                             all_test_x, all_test_y)
def cyto_ai_training_pipeline_log_df(self, data_df):
    """Cast a data frame to numeric and write it to the configured CSV file.

    Args:
        data_df: Input data frame; printed before and after the cast.

    Returns:
        The data frame returned by ``DataProcessing.cast_all_to_numeric``.
    """
    # Helper objects and config values are created/read in the original
    # order; several of them are not used further in this method.
    caster = DataProcessing()
    sampler = DataSampling()
    config = SysConfig()
    trainer = DataTrainning()
    holdout_ratio = config.get_test_set_ratio()
    label_key = config.get_y_key()
    csv_path = config.get_df_output_file()

    print(data_df)

    #### data preprocessing
    print("start cast_all_to_numeric.")
    numeric_df = caster.cast_all_to_numeric(data_df)
    print(numeric_df)
    print("end cast_all_to_numeric.")

    #### persist the numeric frame
    numeric_df.to_csv(csv_path, index=False)
    print("DF output file = %s" % (csv_path))

    return numeric_df
def cyto_ai_training_pipeline(self, data_df):
    """Train one random-forest classifier and print its test scores.

    The frame is cast to numeric, the label column is converted to a
    pandas category, the data is split with ``DataSampling.sk_sampling``
    and a model is trained via ``DataTrainning.SKRandomForest_Category``.
    Scores are printed for the whole test set and separately for the rows
    labelled 1 and 0.

    Args:
        data_df: Input data frame containing the configured label column.

    Returns:
        The numeric-cast data frame (label column as category).
    """
    caster = DataProcessing()
    sampler = DataSampling()
    config = SysConfig()
    trainer = DataTrainning()
    holdout_ratio = config.get_test_set_ratio()
    y_key = config.get_y_key()

    def split_xy(frame):
        # Thin wrapper so every x/y extraction uses the same label key.
        return sampler.get_x_y_from_dataframe(frame, y_key)

    print(data_df)

    #### data preprocessing
    print("start cast_all_to_numeric.")
    data_df = caster.cast_all_to_numeric(data_df)
    print("end cast_all_to_numeric.")

    ### convert to category
    data_df[y_key] = data_df[y_key].astype('category')

    ### sampling
    train_part, test_part = sampler.sk_sampling(data_df, holdout_ratio)

    # Snapshot taken before the x/y extraction below, exactly as the
    # original code did.
    test_snapshot = test_part.copy()

    ### get x,y
    train_x, train_y = split_xy(train_part)
    test_x, test_y = split_xy(test_part)

    ### separate by disease
    is_disease = test_snapshot[y_key] == 1
    disease_rows = test_snapshot.loc[is_disease, :]
    is_normal = test_snapshot[y_key] == 0
    normal_rows = test_snapshot.loc[is_normal, :]
    disease_x, disease_y = split_xy(disease_rows)
    normal_x, normal_y = split_xy(normal_rows)

    print(
        "######################## test_set_disease_len, test_set_normal_len"
    )
    print("{}, {}".format(len(disease_y), len(normal_y)))

    ### training the model
    model = trainer.SKRandomForest_Category(train_x, train_y)

    # Compute all three scores first, then report them.
    evaluations = ((test_x, test_y), (disease_x, disease_y),
                   (normal_x, normal_y))
    scores = [model.score(x_part, y_part) for x_part, y_part in evaluations]
    templates = ("Test score = {}",
                 "test_set_diseas_score score = {}",
                 "test_set_normal_score score = {}")
    for template, score in zip(templates, scores):
        print(template.format(score))

    return data_df
def cyto_ai_balance_training_pipeline(self, data_df):
    """Repeatedly resample, train and persist random-forest classifiers.

    The frame is cast to numeric, the label column is converted to a
    pandas category, and rows are divided by label value (1 vs 0). For
    each model index from 1 up to (but not including) 200, a fresh
    train/test split is drawn with
    ``DataSampling.category2_simple_sampling_pipeline``, a model is
    trained via ``DataTrainning.SKRandomForest_Category``, its three test
    scores are appended to the log file as one tab-separated line, and
    the model is dumped with ``joblib``.

    Args:
        data_df: Input data frame containing the configured label column.

    Returns:
        The numeric-cast data frame (label column as category).
    """
    data_processing_obj = DataProcessing()
    sampling_obj = DataSampling()
    sys_obj = SysConfig()
    train_obj = DataTrainning()
    test_set_ratio = sys_obj.get_test_set_ratio()
    y_key = sys_obj.get_y_key()
    model_output_dir = sys_obj.get_model_output_dir()
    log_file = sys_obj.get_log_file()
    model_threshold = 200

    print(data_df)

    #### data preprocessing
    print("start cast_all_to_numeric.")
    data_df = data_processing_obj.cast_all_to_numeric(data_df)
    print("end cast_all_to_numeric.")

    ### convert to category
    data_df[y_key] = data_df[y_key].astype('category')

    ### sampling
    disease_df = data_df.loc[data_df[y_key] == 1, :]
    normal_df = data_df.loc[data_df[y_key] == 0, :]
    print("### disease_df")
    print(disease_df)
    print("### normal_df")
    print(normal_df)
    print(len(disease_df), len(normal_df))

    # The context manager guarantees the log is flushed and closed even
    # if sampling, training or dumping raises part-way through the run.
    with open(log_file, 'w') as fh_writer:
        for model_count in range(1, model_threshold):
            (train_set, test_set
             ) = sampling_obj.category2_simple_sampling_pipeline(
                 normal_df, disease_df, y_key, test_set_ratio)
            print("###############################")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(model_count)

            #### temp data: snapshot taken before the x/y extraction
            test_set_bak = test_set.copy()

            ### get x,y
            (train_set_x,
             train_set_y) = sampling_obj.get_x_y_from_dataframe(
                 train_set, y_key)
            (test_set_x, test_set_y) = sampling_obj.get_x_y_from_dataframe(
                test_set, y_key)

            ### separate by disease
            test_set_disease = test_set_bak.loc[test_set_bak[y_key] == 1, :]
            test_set_normal = test_set_bak.loc[test_set_bak[y_key] == 0, :]
            (test_set_disease_x,
             test_set_disease_y) = sampling_obj.get_x_y_from_dataframe(
                 test_set_disease, y_key)
            (test_set_normal_x,
             test_set_normal_y) = sampling_obj.get_x_y_from_dataframe(
                 test_set_normal, y_key)

            test_set_disease_len = len(test_set_disease_y)
            test_set_normal_len = len(test_set_normal_y)
            print(
                "######################## test_set_disease_len, test_set_normal_len"
            )
            print("{}, {}".format(test_set_disease_len,
                                  test_set_normal_len))

            ### training the model
            my_model = train_obj.SKRandomForest_Category(
                train_set_x, train_set_y)
            test_score = my_model.score(test_set_x, test_set_y)
            test_set_diseas_score = my_model.score(test_set_disease_x,
                                                   test_set_disease_y)
            test_set_normal_score = my_model.score(test_set_normal_x,
                                                   test_set_normal_y)
            print("Test score = {}".format(test_score))
            print("test_set_diseas_score score = {}".format(
                test_set_diseas_score))
            print("test_set_normal_score score = {}".format(
                test_set_normal_score))

            ### log file
            fh_writer.write("{}\t{}\t{}\n".format(test_score,
                                                  test_set_diseas_score,
                                                  test_set_normal_score))

            ### model
            # Plain string concatenation: presumably model_output_dir
            # already ends with a path separator or filename prefix --
            # TODO confirm against SysConfig.
            model_file = model_output_dir + str(model_count) + '.pkl'
            joblib.dump(my_model, model_file)

    return data_df
def cyto_xgboost_balance_training_pipeline(self, data_df):
    """Draw one train/test split and train a model via ``xgboot_training``.

    The frame is cast to numeric, rows are divided by label value
    (1 vs 0), one split is drawn with
    ``DataSampling.category2_simple_sampling_pipeline`` and the resulting
    x/y sets are handed to ``DataTrainning.xgboot_training``.

    Args:
        data_df: Input data frame containing the configured label column.

    Returns:
        The numeric-cast data frame.
    """
    caster = DataProcessing()
    sampler = DataSampling()
    config = SysConfig()
    trainer = DataTrainning()

    # Config values are read in the original order; the output directory
    # and log file are fetched but not used in this method.
    holdout_ratio = config.get_test_set_ratio()
    y_key = config.get_y_key()
    output_dir = config.get_model_output_dir()
    log_path = config.get_log_file()
    run_index = 1

    def split_xy(frame):
        # Thin wrapper so every x/y extraction uses the same label key.
        return sampler.get_x_y_from_dataframe(frame, y_key)

    print(data_df)

    #### data preprocessing
    print("start cast_all_to_numeric.")
    data_df = caster.cast_all_to_numeric(data_df)
    print("end cast_all_to_numeric.")

    ### sampling
    disease_df = data_df.loc[data_df[y_key] == 1, :]
    normal_df = data_df.loc[data_df[y_key] == 0, :]
    train_part, test_part = sampler.category2_simple_sampling_pipeline(
        normal_df, disease_df, y_key, holdout_ratio)
    print("###############################")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(run_index)

    # Snapshot taken before the x/y extraction below, as in the original.
    test_snapshot = test_part.copy()

    ### get x,y
    train_x, train_y = split_xy(train_part)
    test_x, test_y = split_xy(test_part)

    ### separate by disease
    is_disease = test_snapshot[y_key] == 1
    disease_rows = test_snapshot.loc[is_disease, :]
    is_normal = test_snapshot[y_key] == 0
    normal_rows = test_snapshot.loc[is_normal, :]
    disease_x, disease_y = split_xy(disease_rows)
    normal_x, normal_y = split_xy(normal_rows)

    print(
        "######################## test_set_disease_len, test_set_normal_len"
    )
    print("{}, {}".format(len(disease_y), len(normal_y)))

    ### training the model (the returned model is not used afterwards)
    model = trainer.xgboot_training(train_x, train_y, test_x, test_y)

    return data_df
def LinearRegressor(self, df_1, df_2, target_name, feature_dict):
    """Train and evaluate a ``tf.estimator.LinearRegressor``.

    Samples train/test sets with ``DataSampling.cbc_sampling_pipeline``
    (``sample_ratio=0.8``), trains for 200 steps with batches of 20,
    prints the evaluation loss and RMSE, and finally requests predictions
    for the first ten test rows (the result is neither used nor returned).

    Args:
        df_1: First input data frame for the sampling pipeline.
        df_2: Second input data frame for the sampling pipeline.
        target_name: Name of the target passed to the sampling pipeline.
        feature_dict: Dict with ``'numeric'`` and ``'category'`` lists of
            feature names.

    Returns:
        None.
    """
    TRAIN_STEPS = 200
    BATCH_SIZE = 20

    sampler = DataSampling()
    train_set, train_label, test_set, test_label = (
        sampler.cbc_sampling_pipeline(df_1, df_2, target_name,
                                      sample_ratio=0.8))

    ### feature columns
    numeric_fields = feature_dict['numeric']
    category_fields = feature_dict['category']
    all_features = numeric_fields + category_fields

    def select_features(frame):
        # Keep the model features, cast the numeric ones, then reindex().
        subset = frame.loc[:, all_features]
        subset = self.data_selection_obj.cast_to_numeric(
            subset, numeric_fields)
        return subset.reindex()

    train_set_feature = select_features(train_set)
    test_set_feature = select_features(test_set)

    feature_columns = self.build_feature_columns(feature_dict)
    print(feature_columns)

    print("train_set_feature = {0}".format(train_set_feature.columns))
    print("train_set_feature len = {0}".format(len(train_set_feature)))
    print("train_label len = {0}".format(len(train_label)))
    print("DF NA checking ...")
    print(train_set_feature.isnull().any().any())

    estimator = tf.estimator.LinearRegressor(
        feature_columns=feature_columns)

    def train_input():
        return self.data_func(train_set_feature, train_label,
                              batch_size=BATCH_SIZE)

    def test_input():
        return self.test_func(test_set_feature, test_label)

    ### training
    estimator.train(input_fn=train_input, steps=TRAIN_STEPS)

    ## evaluate
    metrics = estimator.evaluate(input_fn=test_input)
    average_loss = metrics["average_loss"]
    print("average_loss = {0}".format(average_loss))
    print("\nRMSE for the test set: {:.2f}".format(average_loss**0.5))

    ## prediction on the first ten test rows
    head_columns = dict(test_set_feature.head(10))
    predict_set = {
        name: np.array(column)
        for name, column in head_columns.items()
    }
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(predict_set,
                                                          shuffle=False)
    predict_results = estimator.predict(input_fn=predict_input_fn)
def RandomForest(self, df_1, df_2, target_name, feature_dict):
    """Fit a ``tf.contrib`` TensorForest regression estimator.

    Casts ``'Blast_count'`` to numeric in both frames, samples train/test
    sets with ``DataSampling.cbc_sampling_pipeline`` (``sample_ratio=0.8``),
    prints a few diagnostics and fits the estimator on the training
    features.

    Args:
        df_1: First input data frame.
        df_2: Second input data frame.
        target_name: Name of the target passed to the sampling pipeline.
        feature_dict: Dict with ``'numeric'`` and ``'category'`` lists of
            feature names.

    Returns:
        None.
    """
    BATCH_SIZE = 20
    TREE_COUNT = 2000
    NODE_LIMIT = 1000
    FEATURE_COUNT = 24
    CLASS_COUNT = 1

    sampler = DataSampling()
    selector = self.data_selection_obj

    ### modify target
    df_1 = selector.cast_to_numeric(df_1, ['Blast_count'])
    df_2 = selector.cast_to_numeric(df_2, ['Blast_count'])

    ### Sampling
    train_set, train_label, test_set, test_label = (
        sampler.cbc_sampling_pipeline(df_1, df_2, target_name,
                                      sample_ratio=0.8))

    ### feature columns
    numeric_fields = feature_dict['numeric']
    category_fields = feature_dict['category']
    all_features = numeric_fields + category_fields

    def select_features(frame):
        # Keep the model features, cast the numeric ones, then reindex().
        subset = frame.loc[:, all_features]
        subset = self.data_selection_obj.cast_to_numeric(
            subset, numeric_fields)
        return subset.reindex()

    train_set_feature = select_features(train_set)
    test_set_feature = select_features(test_set)

    feature_columns = self.build_feature_columns(feature_dict)
    print(feature_columns)
    print(len(feature_columns))

    print("train_set_feature = {0}".format(train_set_feature.columns))
    print("train_set_feature len = {0}".format(len(train_set_feature)))
    print("train_label len = {0}".format(len(train_label)))
    print("DF NA checking ...")
    print(train_set_feature.isnull().any().any())

    hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
        num_classes=CLASS_COUNT,
        num_features=FEATURE_COUNT,
        num_trees=TREE_COUNT,
        regression=True,
        max_nodes=NODE_LIMIT)
    graph_builder = (
        tf.contrib.tensor_forest.python.tensor_forest.RandomForestGraphs)
    estimator = (
        tf.contrib.tensor_forest.client.random_forest.TensorForestEstimator(
            hparams, graph_builder_class=graph_builder))

    # The helper is called directly (not wrapped in a lambda); its return
    # value is what gets passed as ``input_fn``.
    train_input = self.randomforest_data_func(train_set_feature,
                                              train_label,
                                              batch_size=BATCH_SIZE)

    ### training
    print("Before fitting")
    estimator.fit(input_fn=train_input, steps=None)
    print("After fitting")
def DNNRegressor(self, df_1, df_2, target_name, feature_dict):
    """Train and evaluate a ``tf.estimator.DNNRegressor``.

    Casts ``'Blast_count'`` to numeric in both frames, samples train/test
    sets with ``DataSampling.cbc_sampling_pipeline`` (``sample_ratio=0.8``),
    trains a network with hidden units ``[30, 30, 30, 20, 10]`` for 1000
    steps with batches of 20, then prints the evaluation loss and RMSE.

    Args:
        df_1: First input data frame.
        df_2: Second input data frame.
        target_name: Name of the target passed to the sampling pipeline.
        feature_dict: Dict with ``'numeric'`` and ``'category'`` lists of
            feature names.

    Returns:
        None.
    """
    print(" DNNRegressor ...")

    TRAIN_STEPS = 1000
    BATCH_SIZE = 20
    HIDDEN_LAYERS = [30, 30, 30, 20, 10]

    sampler = DataSampling()
    selector = self.data_selection_obj

    ### modify target
    df_1 = selector.cast_to_numeric(df_1, ['Blast_count'])
    df_2 = selector.cast_to_numeric(df_2, ['Blast_count'])

    ### Sampling
    train_set, train_label, test_set, test_label = (
        sampler.cbc_sampling_pipeline(df_1, df_2, target_name,
                                      sample_ratio=0.8))

    ### feature columns
    numeric_fields = feature_dict['numeric']
    category_fields = feature_dict['category']
    all_features = numeric_fields + category_fields

    def select_features(frame):
        # Keep the model features, cast the numeric ones, then reindex().
        subset = frame.loc[:, all_features]
        subset = self.data_selection_obj.cast_to_numeric(
            subset, numeric_fields)
        return subset.reindex()

    train_set_feature = select_features(train_set)
    test_set_feature = select_features(test_set)

    feature_columns = self.build_feature_columns(feature_dict)

    optimizer = tf.train.ProximalAdagradOptimizer(
        learning_rate=0.01, l1_regularization_strength=0.001)
    estimator = tf.estimator.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=HIDDEN_LAYERS,
        optimizer=optimizer)

    def train_input():
        return self.data_func(train_set_feature, train_label,
                              batch_size=BATCH_SIZE)

    def test_input():
        return self.test_func(test_set_feature, test_label)

    ### training
    estimator.train(input_fn=train_input, steps=TRAIN_STEPS)

    ## evaluate
    metrics = estimator.evaluate(input_fn=test_input)
    average_loss = metrics["average_loss"]
    print("average_loss = {0}".format(average_loss))
    print("\nRMSE for the test set: {:.2f}".format(average_loss**0.5))
def SKRandomForest(self, df_1, df_2, target_name, feature_dict):
    """Fit a scikit-learn random-forest regressor and print its predictions.

    Casts ``'Blast_count'`` to numeric in both frames, samples train/test
    sets with ``DataSampling.cbc_sampling_pipeline`` (``sample_ratio=0.8``),
    fits a ``RandomForestRegressor`` with out-of-bag scoring enabled,
    prints training diagnostics and then prints each test label next to
    its prediction.

    Args:
        df_1: First input data frame.
        df_2: Second input data frame.
        target_name: Name of the target passed to the sampling pipeline.
        feature_dict: Dict with ``'numeric'`` and ``'category'`` lists of
            feature names.

    Returns:
        None.
    """
    print("SKRandomForest ...")

    NUM_TREE = 2000
    NUM_FEATURES = 24
    MAX_DEPTH = None

    sampling_obj = DataSampling()

    ### modify target
    df_1 = self.data_selection_obj.cast_to_numeric(df_1, ['Blast_count'])
    df_2 = self.data_selection_obj.cast_to_numeric(df_2, ['Blast_count'])

    ### Sampling
    (train_set, train_label, test_set,
     test_label) = sampling_obj.cbc_sampling_pipeline(df_1,
                                                      df_2,
                                                      target_name,
                                                      sample_ratio=0.8)

    ### feature columns
    numeric_fields = feature_dict['numeric']
    category_fields = feature_dict['category']
    all_features = numeric_fields + category_fields

    # NOTE(review): reindex() without arguments does not renumber the
    # index; if a fresh 0..n-1 index was intended, reset_index(drop=True)
    # would be needed -- behaviour left unchanged here.
    train_set_feature = train_set.loc[:, all_features]
    train_set_feature = self.data_selection_obj.cast_to_numeric(
        train_set_feature, numeric_fields)
    train_set_feature = train_set_feature.reindex()

    test_set_feature = test_set.loc[:, all_features]
    test_set_feature = self.data_selection_obj.cast_to_numeric(
        test_set_feature, numeric_fields)
    test_set_feature = test_set_feature.reindex()

    my_model = RandomForestRegressor(n_estimators=NUM_TREE,
                                     max_features=NUM_FEATURES,
                                     min_samples_split=2,
                                     max_depth=MAX_DEPTH,
                                     oob_score=True)

    ### training
    print("Before fitting")
    my_model.fit(train_set_feature, train_label)
    train_score = my_model.score(train_set_feature, train_label)
    print("After fitting")

    # Newer scikit-learn exposes the feature count as ``n_features_in_``;
    # fall back to the legacy ``n_features_`` attribute on old releases.
    n_features = getattr(my_model, "n_features_in_", None)
    if n_features is None:
        n_features = my_model.n_features_

    print("feature_importances_ = {} ".format(
        my_model.feature_importances_))
    print("n_features_ = {} ".format(n_features))
    print("oob_score_ = {} ".format(my_model.oob_score_))
    print("train_score = {} ".format(train_score))

    ### prediction
    test_prediction_result = my_model.predict(test_set_feature)
    for (actual, predicted) in zip(test_label, test_prediction_result):
        print("real,predict = {},{} ".format(actual, predicted))