def test_verify_features_does_not_work_by_default():
    """Train a DeepLearningClassifier without feature scaling and check its score.

    The model is trained on the titanic training split, saved under a random
    file name, reloaded through ``utils_models.load_keras_model`` and scored
    on the test split. The score must fall inside a fixed range.

    NOTE(review): the test name mentions ``verify_features``, but nothing in
    the body exercises it -- confirm whether the name or the body is stale.
    """
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train,
                       perform_feature_scaling=False,
                       model_names=['DeepLearningClassifier'])

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = utils_models.load_keras_model(file_name)
    os.remove(file_name)

    # Best-effort removal of the companion keras file, using the same naming
    # pattern the other tests in this file clean up after loading a model.
    # Without this the test leaves a stray .h5 file behind on every run.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    test_score = saved_ml_pipeline.score(df_titanic_test, df_titanic_test.survived)

    assert -0.25 < test_score < -0.17
def test_verify_features_finds_no_missing_features_when_none_are_missing():
    """Check that ``verify_features`` reports nothing when columns match.

    A classifier is trained on the titanic training split with
    ``verify_features=True``, round-tripped through disk with ``dill``, and
    the final model's ``verify_features`` is run against the test split.
    Both reported collections must be empty.
    """
    np.random.seed(0)

    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'sex': 'categorical',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df, verify_features=True)

    # Save under a random name, load it straight back, then delete the file.
    saved_path = predictor.save(str(random.random()))
    with open(saved_path, 'rb') as handle:
        pipeline = dill.load(handle)
    os.remove(saved_path)

    final_model = pipeline.named_steps['final_model']
    missing_features = final_model.verify_features(test_df)

    # Debug output, shown by pytest when the test fails.
    print('missing_features')
    print(missing_features)
    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))

    assert len(missing_features['prediction_not_training']) == 0
    assert len(missing_features['training_not_prediction']) == 0
def test_perform_feature_scaling_false_regression():
    """Train a DeepLearningRegressor with feature scaling disabled and score it.

    The model is trained on the boston training split, saved under a random
    file name, reloaded through ``utils_models.load_keras_model`` and scored
    on the test split. The score must fall inside a fixed range.
    """
    np.random.seed(42)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {
        'MEDV': 'output',
        'CHAS': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_boston_train,
                       perform_feature_scaling=False,
                       model_names=['DeepLearningRegressor'])

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = utils_models.load_keras_model(file_name)
    os.remove(file_name)

    # Best-effort removal of the companion keras file, using the same naming
    # pattern the other tests in this file clean up after loading a model.
    # Without this the test leaves a stray .h5 file behind on every run.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    test_score = saved_ml_pipeline.score(df_boston_test, df_boston_test.MEDV)

    print('test_score')
    print(test_score)

    assert -24 < test_score < -2.8
def test_verify_features_finds_missing_training_features():
    """Check that ``verify_features`` flags a column absent from training.

    The ``sibsp`` column is dropped from the training data before training
    with ``verify_features=True``. After a save/load round trip, running
    ``verify_features`` on the untouched test split must report exactly one
    entry under ``prediction_not_training`` and none under
    ``training_not_prediction``.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    # Remove the "sibsp" column from our training data
    df_titanic_train = df_titanic_train.drop('sibsp', axis=1)

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, verify_features=True)

    file_name = ml_predictor.save(str(random.random()))
    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected here. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    missing_features = saved_ml_pipeline.named_steps['final_model'].verify_features(
        df_titanic_test)

    print('missing_features')
    print(missing_features)
    print("len(missing_features['prediction_not_training'])")
    print(len(missing_features['prediction_not_training']))
    print("len(missing_features['training_not_prediction'])")
    print(len(missing_features['training_not_prediction']))

    assert len(missing_features['prediction_not_training']) == 1
    assert len(missing_features['training_not_prediction']) == 0
def test_getting_single_predictions_nlp_date_multilabel_classification(model_name=None):
    """Check single-row predictions on the twitter sentiment dataset.

    Trains a classifier using categorical, nlp and date columns, round-trips
    it through ``save`` / ``load_ml_model``, then verifies three things:
    prediction accuracy on single dictionaries, prediction speed, and that a
    second scoring pass still lands in the same range.

    Args:
        model_name: forwarded to ``Predictor.train`` as ``model_names``. The
            test returns immediately for ``'DeepLearningClassifier'``.
    """
    # auto_ml does not support multilabel classification for deep learning at the moment
    if model_name == 'DeepLearningClassifier':
        return

    np.random.seed(0)

    df_twitter_train, df_twitter_test = (
        utils.get_twitter_sentiment_multilabel_classification_dataset())

    column_descriptions = {
        'airline_sentiment': 'output',
        'airline': 'categorical',
        'text': 'nlp',
        'tweet_location': 'categorical',
        'user_timezone': 'categorical',
        'tweet_created': 'date',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_twitter_train, model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_twitter_test_dictionaries = df_twitter_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_twitter_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = 0.73
    assert lower_bound < first_score < 0.79

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_twitter_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_twitter_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_twitter_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_twitter_test_dictionaries')
    print(df_twitter_test_dictionaries)

    second_score = accuracy_score(df_twitter_test.airline_sentiment, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < 0.79
'category': 'nlp', # 'amount': 'nlp', 'fraud': 'output' } column_descriptions_3 = { # 'step', 'type':'ignore', # 'amount', 'nameOrig':'nlp', # 'oldbalanceOrg', # 'newbalanceOrig':'ignore', 'nameDest':'nlp', # 'oldbalanceDest', # 'newbalanceDest', 'isFraud':'output', 'isFlaggedFraud':'ignore' } column_descriptions_2 = { 'Source': 'nlp', 'Target':'nlp', 'Weight': 'nlp', 'typeTrans': 'nlp', 'fraud': 'output'} ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions_1) ml_predictor.train(df_train, model_names='RandomForestClassifier') # ml_predictor.score(df_test, df_test.fraud) ml_predictor.score(df_test, df_test.fraud) ml_predictor.save(file_name="..\\mlweb\\trained_pipeline\\forest\\1.sav")
def test_feature_learning_categorical_ensembling_getting_single_predictions_regression(
        model_name=None):
    """Check single-row predictions from a categorical ensemble with feature learning.

    Trains with ``train_categorical_ensemble`` on the boston dataset, split on
    the ``CHAS`` column and with ``feature_learning=True``. After a save/load
    round trip it verifies prediction quality on single dictionaries,
    prediction speed, and that a second scoring pass lands in the same range.

    Args:
        model_name: forwarded to ``train_categorical_ensemble`` as
            ``model_names``.
    """
    from auto_ml.utils_models import load_ml_model

    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    # NOTE: carving fl_data out of the training set is bad practice, but we
    # don't have enough data to do it any other way
    df_boston_train, fl_data = train_test_split(df_boston_train, test_size=0.2)
    ml_predictor.train_categorical_ensemble(df_boston_train,
                                            model_names=model_name,
                                            feature_learning=True,
                                            fl_data=fl_data,
                                            categorical_column='CHAS')

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = -4.5
    assert lower_bound < first_score < -3.4

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_boston_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -3.4
def test_user_input_func_classification(model_name=None):
    """Check that a ``user_input_func`` is applied for single-row predictions.

    A helper that derives an ``age_bucket`` column from ``age`` is passed to
    ``Predictor.train`` as ``user_input_func``. After a save/load round trip
    the test verifies prediction quality on single dictionaries, prediction
    speed, and that a second scoring pass lands in the same range.

    Args:
        model_name: forwarded to ``Predictor.train`` as ``model_names``.
            ``'DeepLearningClassifier'`` gets a looser lower score bound.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    def age_bucketing(data):
        """Add an ``age_bucket`` entry/column to a dict or DataFrame-like object."""

        def define_buckets(age):
            if age <= 17:
                return 'youth'
            if age <= 40:
                return 'adult'
            if age <= 60:
                return 'adult2'
            # Anything else lands here, including values for which every
            # comparison above is False.
            return 'over_60'

        if isinstance(data, dict):
            data['age_bucket'] = define_buckets(data['age'])
        else:
            data['age_bucket'] = data.age.apply(define_buckets)

        return data

    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'age_bucket': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train,
                       perform_feature_scaling=False,
                       user_input_func=age_bucketing,
                       model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = -0.215
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.237
    assert lower_bound < first_score < -0.17

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_titanic_test_dictionaries')
    print(df_titanic_test_dictionaries)

    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -0.17
def getting_single_predictions_regression(model_name=None):
    """Check single-row regression predictions on the boston dataset.

    Trains a regressor with feature scaling disabled, round-trips it through
    ``save`` / ``load_ml_model``, then verifies prediction quality on single
    dictionaries, prediction speed, and that a second scoring pass lands in
    the same range.

    Args:
        model_name: forwarded to ``Predictor.train`` as ``model_names``. It
            also selects the lower score bound for a few named models.
    """
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_boston_train,
                       perform_feature_scaling=False,
                       model_names=model_name)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    print('predictions')
    print(predictions)
    print('predictions[0]')
    print(predictions[0])
    print('type(predictions)')
    print(type(predictions))

    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good.
    # Equality checks (rather than a dict lookup) keep this working when
    # model_name is an unhashable value such as a list.
    if model_name == 'DeepLearningRegressor':
        lower_bound = -8.8
    elif model_name == 'LGBMRegressor':
        lower_bound = -4.95
    elif model_name == 'XGBRegressor':
        lower_bound = -3.4
    else:
        lower_bound = -3.2
    assert lower_bound < first_score < -2.8

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_boston_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.1 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict(row)
                   for row in df_boston_test_dictionaries]

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -2.8
def test_feature_learning_getting_single_predictions_classification(model_name=None):
    """Check single-row classification predictions with feature learning enabled.

    Trains on the titanic dataset with ``feature_learning=True``, using a
    held-out slice of the training data as ``fl_data``. After a save/load
    round trip it verifies prediction quality on single dictionaries,
    prediction speed, and that a second scoring pass lands in the same range.

    Args:
        model_name: forwarded to ``Predictor.train`` as ``model_names``.
            ``'DeepLearningClassifier'`` gets a looser lower score bound.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)

    # NOTE: carving fl_data out of the training set is bad practice, but we
    # don't have enough data to do it any other way
    df_titanic_train, fl_data = train_test_split(df_titanic_train, test_size=0.2)
    ml_predictor.train(df_titanic_train,
                       model_names=model_name,
                       feature_learning=True,
                       fl_data=fl_data)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    lower_bound = -0.16
    if model_name == 'DeepLearningClassifier':
        lower_bound = -0.187
    assert lower_bound < first_score < -0.133

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_titanic_test_dictionaries')
    print(df_titanic_test_dictionaries)

    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -0.133
df_test = table.sample(frac=.5) rew_descriptions = { 'acts1': 'output', 'cur_action': 'categorical', 'prev_action': 'categorical' } ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=rew_descriptions) ml_predictor.train(df_train, model_names=['DeepLearningClassifier'], ml_for_analytics=True) # ml_predictor.score(df_test, df_test.acts1) ml_predictor.predict(table[-1:]) ml_predictor.save(file_name='reward.ml', verbose=True) etable = pd.read_sql_query("SELECT * from sarsa", conn) df_etrain = etable.sample(frac=.5) df_etest = etable.sample(frac=.5) esteem_descriptions = { 'esteem': 'output', 'cluster': 'categorical', } ml_predictor2 = Predictor(type_of_estimator='regressor', column_descriptions=esteem_descriptions) ml_predictor2.train(df_etrain, ml_for_analytics=True) # ml_predictor2.score(df_test, df_test.acts1)
# 'year': 'categorical', # 'month': 'categorical', } print(column_description1) # 合并两个字典 column_descriptions = dict(column_description1, **column_description2) ml_predictor = Predictor(type_of_estimator='Regressor', column_descriptions=column_descriptions) ml_predictor.train( df_train, model_names='XGBRegressor') # KerasRegressor XGBRegressor ml_predictor.save( model_path) # 这里保存模型之后可能存在某个问题就是可能会在原有模型的基础上进行训练; # 预测 pred1 = ml_predictor.predict(df_test) print(mean_absolute_error(df_test_label, pred1)) # 计算比例: # 合并保存测试数据预测结果 test_data_save_and_merge_data(pred1, origin_data, f) # 合并保存训练数据的预测结果 train_data_save_and_merge(ml_predictor, origin_data_train, df_train_prediction, f) time.sleep(delay_time) # 开始切分数据 process_train_merge_data_remove_some_data(f, i) is_first_train = 0
def getting_single_predictions_classifier_test():
    """Check single-row predictions from a two-model ensemble classifier.

    Trains on the titanic dataset with an ``ensemble_config`` naming
    ``LGBMClassifier`` and ``RandomForestClassifier``. After a save/load round
    trip it verifies prediction quality on single dictionaries, prediction
    speed, and that a second scoring pass lands in the same range.
    """
    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical',
        'age_bucket': 'categorical',
    }

    ensemble_config = [
        {'model_name': 'LGBMClassifier'},
        {'model_name': 'RandomForestClassifier'},
    ]

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train, ensemble_config=ensemble_config)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)
    os.remove(file_name)

    # The companion keras file only exists for deep learning models, so a
    # missing file is expected. Only filesystem errors are ignored.
    try:
        os.remove(file_name[:-5] + '_keras_deep_learning_model.h5')
    except OSError:
        pass

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('first_score')
    print(first_score)

    # Make sure our score is good, but not unreasonably good
    assert -0.15 < first_score < -0.135

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 60

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    print('predictions')
    print(predictions)
    print('df_titanic_test_dictionaries')
    print(df_titanic_test_dictionaries)

    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)
    print('second_score')
    print(second_score)

    # Make sure our score is good, but not unreasonably good
    assert -0.15 < second_score < -0.135
def train_old_model():
    """Train, save and sanity-check a model using auto_ml 2.1.6 or older.

    The trained pipeline is saved as ``trained_ml_model_v_2_1_6.dill`` and is
    deliberately NOT deleted afterwards. The reloaded model is then checked
    for prediction quality on single dictionaries, prediction speed, and a
    stable score on a second pass.

    Raises:
        TypeError: if the installed ``auto_ml_version`` is newer than 2.1.6.
    """
    import re  # local import: only needed for the version guard below

    print('auto_ml_version')
    print(auto_ml_version)

    # Compare versions numerically. Plain string comparison is lexicographic
    # and would rank '2.1.10' below '2.1.6'.
    installed_version = tuple(int(number) for number in re.findall(r'\d+', auto_ml_version))
    if installed_version > (2, 1, 6):
        raise TypeError(
            'train_old_model must be run with auto_ml <= 2.1.6, found '
            + str(auto_ml_version))

    np.random.seed(0)

    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    column_descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_titanic_train)

    file_name = ml_predictor.save('trained_ml_model_v_2_1_6.dill')

    saved_ml_pipeline = load_ml_model(file_name)

    df_titanic_test_dictionaries = df_titanic_test.to_dict('records')

    # 1. make sure the accuracy is the same
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    first_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)

    # Make sure our score is good, but not unreasonably good
    assert -0.16 < first_score < -0.135

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_titanic_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        saved_ml_pipeline.predict(df_titanic_test_dictionaries[idx % data_length])
    elapsed_seconds = (datetime.datetime.now() - start_time).total_seconds()

    print('duration.total_seconds()')
    print(elapsed_seconds)

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # The original author measured about 0.8 seconds for 1000 predictions
    # (roughly 1 millisecond each) on a 2013 MacBook Pro. The upper bound is
    # generous to allow for weak test boxes, and the lower bound makes sure
    # we're not running unreasonably quickly.
    assert 0.2 < elapsed_seconds < 15

    # 3. make sure we're not modifying the dictionaries (the score is the same
    # after running a few experiments as it is the first time)
    predictions = [saved_ml_pipeline.predict_proba(row)[1]
                   for row in df_titanic_test_dictionaries]

    second_score = utils.calculate_brier_score_loss(df_titanic_test.survived, predictions)

    # Make sure our score is good, but not unreasonably good
    assert -0.16 < second_score < -0.135
# #'inv', 'sin', 'cos', 'tan'), # p_crossover=0.7, p_subtree_mutation=0.1, max_samples=1., # p_hoist_mutation=0.05, p_point_mutation=0.1, # verbose=1, n_jobs=12, generations=20, metric='mean absolute error') # ridgealpha = 300 # print(ga) column_descriptions = {'output': 'output'} ml_predictor = Predictor(type_of_estimator='classifier', column_descriptions=column_descriptions) print("started training at", datetime.now()) ml_predictor.train(pd_train, compute_power=5, X_test=pd_testX, y_test=pd_testY, perform_feature_scaling=True, perform_feature_selection=False, optimize_entire_pipeline=True, optimize_final_model=False, #crashed on validation trying without op final model take_log_of_y=False, model_names=[#'Ridge', #'XGBRegressor', 'RANSACRegressor', 'RandomForestRegressor', 'AdaBoostRegressor',# 'LinearRegression', "DeepLearningClassifier",]) # 'ExtraTreesRegressor', "LGBMRegressor", 'LogisticRegression',]) ml_predictor.save() # base estimator, train score, test score, test accu % #TODO maybe add random noise to train data only # BaggingRegressor, .994, -4, .5284 # DecisionTreeRegressor, .996, -.72, .53 # ExtraTreeRegressor, .995, -1.43, .53 # ExtraTreesRegressor, .998, -.98, .51 # GradientBoostingRegressor,.90, -.17, .566 i liked this one, no multicore though # HuberRegressor .68, -71, .48 # KernelRidge total garbage # LinearRegression total garbage # LinearSVR .83, -108, .558 # NuSVR rbf, sigmoid, poly no work .808, -1.24, .566 # RANSACRegressor base Ridge .73, -9, .528 has potential # RandomForestRegressor .98, -1.5, .524 potential if able to regularize # Ridge .76, -70, .59
ml_predictor.train( df_houseprices_train, take_log_of_y=True, #compute_power=10, ) # Score the model on test data test_score = ml_predictor.score(df_houseprices_test, df_houseprices_test['SalePrice']) # auto_ml is specifically tuned for running in production # It can get predictions on an individual row (passed in as a dictionary) # A single prediction like this takes ~1 millisecond # Here we will demonstrate saving the trained model, and loading it again file_name = ml_predictor.save() # dill is a drop-in replacement for pickle that handles functions better with open(file_name, 'rb') as read_file: trained_model = dill.load(read_file) # .predict and .predict_proba take in either: # A pandas DataFrame # A list of dictionaries # A single dictionary (optimized for speed in production evironments) # predictions = trained_model.predict(df_houseprices_test) # print(predictions) #print(sb.test_df) predictions = trained_model.predict(sb.test_df)
class deepl(object):
    """Fraud-model helper wrapping an auto_ml ``Predictor`` and a keras ``Sequential``.

    The training data is read from a fixed CSV path at construction time. The
    class offers two independent training paths: the auto_ml predictor
    (``learn_model`` / ``sav_model``) and a hand-built keras network
    (``create_dl_model`` / ``train_dl_model`` / ``sav_dl_model``).
    """

    def __init__(self):
        # Training data; the other methods expect it to contain a 'fraud' column.
        self.train_df = pd.read_csv("..\\mlweb\\input\\bs140513_032310.csv")
        # self.data = pd.read_csv("..\\input\\bsNET140513_032310.csv")
        self.ml_predictor = Predictor(type_of_estimator='regressor',
                                      column_descriptions=column_descriptions)
        self.model = Sequential()

    def handle_non_numerical_data(self):
        """Replace every non-int64/float64 column of ``train_df`` with integer codes.

        Codes are assigned in order of first appearance within the column, so
        the encoding is the same on every run. (Iterating a ``set`` of strings
        is not: its order changes with the interpreter's hash seed.)
        The frame is modified in place; nothing is returned.
        """
        for column in self.train_df.columns.values:
            dtype = self.train_df[column].dtype
            if dtype == np.int64 or dtype == np.float64:
                continue

            column_contents = self.train_df[column].values.tolist()

            # Map each distinct value to the next free integer.
            text_digit_vals = {}
            for value in column_contents:
                if value not in text_digit_vals:
                    text_digit_vals[value] = len(text_digit_vals)

            # Look up the very same objects that were used to build the
            # mapping, so mapping and lookup can never disagree.
            self.train_df[column] = [text_digit_vals[value] for value in column_contents]

    def getX_Y(self):
        """Return a shuffled 80/20 ``train_test_split`` of ``train_df``."""
        return train_test_split(self.train_df, test_size=0.2, shuffle=True)

    def get_train_X_Y(self):
        """Split ``train_df`` into features and the ``fraud`` target.

        Prints the head of both frames as a side effect.

        Returns:
            ``(train_X, train_Y)``: ``train_df`` without the ``fraud`` column,
            and a single-column frame holding ``fraud``.
        """
        train_X = self.train_df.drop(columns=['fraud'])
        train_Y = self.train_df[['fraud']]
        print(train_X.head())
        print(train_Y.head())
        return train_X, train_Y

    def create_dl_model(self):
        """Add the layers to ``self.model`` and compile it."""
        train_X, _ = self.get_train_X_Y()
        # get number of columns in training data
        n_cols = train_X.shape[1]
        # add model layers
        self.model.add(Dense(10, activation='relu', input_shape=(n_cols, )))
        self.model.add(Dense(10, activation='relu'))
        self.model.add(Dense(1))
        self.model.compile(optimizer='adam', loss='mean_squared_error')

    def get_dl_model(self):
        """Return the keras model held by this instance."""
        return self.model

    def train_dl_model(self):
        """Fit ``self.model`` on the training data with early stopping."""
        train_X, train_Y = self.get_train_X_Y()
        # set early stopping monitor so the model stops training when it won't improve anymore
        early_stopping_monitor = EarlyStopping(patience=3)
        # train model
        self.model.fit(train_X,
                       train_Y,
                       validation_split=0.2,
                       epochs=30,
                       callbacks=[early_stopping_monitor])

    def sav_dl_model(self):
        """Save the keras model weights to the fixed pipeline path."""
        self.model.save_weights(
            '..\\mlweb\\trained_pipeline\\deepl\\deep_learning.h5')

    def get_features(self):
        """Return the column names of ``train_df`` as a list."""
        return list(self.train_df)

    def send_tojson(self, list):
        """Write the feature names to the module-level ``jsonpath`` as JSON.

        The output maps ``"feature<i>"`` to the i-th column name.

        Args:
            list: accepted for backward compatibility but not used; the
                feature names are always re-read via ``get_features``. The
                name shadows the builtin and is kept only so existing callers
                keep working.
        """
        names = self.get_features()
        features = {"feature" + str(index): name for index, name in enumerate(names)}
        with open(jsonpath, 'w') as outfile:
            json.dump(features, outfile)

    def learn_model(self):
        """Train the auto_ml predictor with feature learning and score it."""
        df_train, df_test = self.getX_Y()
        # self.ml_predictor.train(df_train, model_names='DeepLearningRegressor')
        self.ml_predictor.train(df_train,
                                feature_learning=True,
                                fl_data=df_test,
                                model_names='DeepLearningRegressor')
        self.ml_predictor.score(df_test, df_test.fraud)

    def sav_model(self):
        """Save the trained auto_ml predictor using its default file name."""
        self.ml_predictor.save()


# dmodel = deepl()
# dmodel.send_tojson(dmodel.get_features())
# # dmodel.getX_Y()
# dmodel.learn_model()
# dmodel.sav_model()

#2
# dmodel = deepl()
# dmodel.handle_non_numerical_data()
# dmodel.create_dl_model()
# dmodel.train_dl_model()
# dmodel.sav_dl_model()
# print(dmodel.get_features())
# SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
# pipline_path = os.path.join(SITE_ROOT, "trained_pipeline/deepl", "deepLearning.h5")
# print(pipline_path)
"tradeTypeId": 'categorical', # 'bedrooms': 'categorical', # 'year': 'categorical', # 'month': 'categorical', } print(column_description1) # 合并两个字典 column_descriptions = dict(column_description1, **column_description2) ml_predictor = Predictor(type_of_estimator='Regressor', column_descriptions=column_descriptions) ml_predictor.train(df_train, model_names='XGBRegressor' ) # KerasRegressor XGBRegressor DeepLearningRegressor ml_predictor.save('auto_ml_new.h5') # 预测预测数据 x = ml_predictor.predict(df_test) x_dataframe = pd.DataFrame(x, columns=['predictions']) merge_data = pd.concat((origin_data, x_dataframe), axis=1) merge_data_df = pd.DataFrame(merge_data) merge_data_df.to_csv('./merge_data_bak/merge_data_predictions_auto_ml.csv', index=False) print(x_dataframe.describe()) print(df_test_label.describe()) print(mean_absolute_error(df_test_label, x)) compute_ratio(merge_data_df) # compute_ratio2(merge_data_df)
'LGBMRegressor', "AdaBoostRegressor", "XGBRegressor", "ExtraTreesRegressor", "RANSACRegressor", "GradientBoostingRegressor", "DeepLearningRegressor", "RandomForestRegressor", "SGDRegressor", "PassiveAggressiveRegressor" ] # ml_predictor.train(df_train , model_names = model_names ) ml_predictor.train(df_train) # Score the model on test data test_score = ml_predictor.score(df_test, df_test.y) # auto_ml is specifically tuned for running in production # It can get predictions on an individual row (passed in as a dictionary) # A single prediction like this takes ~1 millisecond # Here we will demonstrate saving the trained model, and loading it again file_name = ml_predictor.save(file_name="d:/automl.saved") test_score = ml_predictor.score(df_test, df_test.y) # print(test_score) trained_model = load_ml_model(file_name) # .predict and .predict_proba take in either: # A pandas DataFrame # A list of dictionaries # A single dictionary (optimized for speed in production evironments) predictions = trained_model.predict(df_test) # print(df_test) # print(predictions) print("test:", len(df_test)) print("train:", len(df_train))
def test_ignores_new_invalid_features():
    """Check that unknown, unprocessable features are ignored at prediction time.

    Trains a regressor on the Boston training split, saves it, reloads it with
    ``load_ml_model`` and deletes the saved file(s). It then verifies three
    things against the test split, predicting one row (dict) at a time:

    1. The score from ``utils.calculate_rmse`` stays within
       ``(-3.0, -2.7)`` even though roughly one row in ten is given extra
       fields the model never saw (datetime, function, class, int, NaN, inf,
       list).
    2. 1000 single-row predictions take between 0.1 and 15 seconds.
    3. Re-scoring the same dictionaries afterwards still lands in the same
       range.
    """
    # One of the great unintentional features of auto_ml is that you can pass
    # in new features at prediction time, that weren't present at training
    # time, and they're silently ignored!
    # One edge case here is new features that are strange objects (lists,
    # datetimes, intervals, or anything else that we can't process in our
    # default data processing pipeline). Initially, we just ignored them in
    # dict_vectorizer, but we need to ignore them earlier.
    np.random.seed(0)

    df_boston_train, df_boston_test = utils.get_boston_regression_dataset()

    column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
    ml_predictor.train(df_boston_train)

    file_name = ml_predictor.save(str(random.random()))
    saved_ml_pipeline = load_ml_model(file_name)

    os.remove(file_name)
    # A companion Keras file may or may not have been written next to the
    # saved model; removing it is best-effort, so only filesystem errors
    # (e.g. the file not existing) are ignored.
    keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
    try:
        os.remove(keras_file_name)
    except OSError:
        pass

    df_boston_test_dictionaries = df_boston_test.to_dict('records')

    # 1. make sure the accuracy is the same
    # NOTE: the stdlib `random` module is not seeded by np.random.seed above,
    # so which rows receive the junk features varies from run to run. The
    # rows are mutated in place, so steps 2 and 3 see the junk features too.
    predictions = []
    for row in df_boston_test_dictionaries:
        if random.random() > 0.9:
            row['totally_new_feature'] = datetime.datetime.now()
            row['really_strange_feature'] = random.random
            row['we_should_really_ignore_this'] = Predictor
            row['pretty_vanilla_ignored_field'] = 8
            row['potentially_confusing_things_here'] = float('nan')
            row['potentially_confusing_things_again'] = float('inf')
            row['this_is_a_list'] = [1, 2, 3, 4, 5]
        predictions.append(saved_ml_pipeline.predict(row))

    print('predictions')
    print(predictions)
    print('predictions[0]')
    print(predictions[0])
    print('type(predictions)')
    print(type(predictions))
    first_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good
    lower_bound = -3.0
    assert lower_bound < first_score < -2.7

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_boston_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_boston_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across
    # all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8
    # seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # The upper bound is deliberately generous to allow for a weak test box.
    # Also make sure we're not running unreasonably quickly
    assert 0.1 < duration.total_seconds() < 15

    # 3. make sure we're not modifying the dictionaries in a way that changes
    # predictions (the score after the experiments above must fall in the
    # same range as it did the first time)
    predictions = [saved_ml_pipeline.predict(row) for row in df_boston_test_dictionaries]

    second_score = utils.calculate_rmse(df_boston_test.MEDV, predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good
    assert lower_bound < second_score < -2.7
'Stato':'categorical', 'Riscaldamento':'categorical', 'Climatizzatore':'categorical', 'Classe energetica':'categorical', 'Arredato S/N':'categorical' } df_train, df_test = train_test_split(dati,train_size=0.75, test_size=0.25) ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions) ml_predictor.train(df_train) # Score the model on test data test_score = ml_predictor.score(df_test, df_test.Price) test_modello = ml_predictor.save() trained_model = load_ml_model(test_modello) predictions = trained_model.predict(dati) #print(predictions) # In[12]: valutazione = pd.DataFrame() ground_truth = dati['Price'].values predictions = trained_model.predict(dati) valutazione['Reale'] = ground_truth valutazione['predictions'] = predictions with open('predictions_AUTOML.csv', 'w') as myfile: