def set_missing_ages(p_df):
    """Fill missing Age values by regressing on the other numeric features."""
    age_df = p_df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Standardize the Fare feature
    scaler = preprocessing.StandardScaler()
    age_df['Fare_scaled'] = scaler.fit_transform(age_df['Fare'].values.reshape(-1, 1))
    del age_df['Fare']
    # Split the rows with a known age from the rows whose age must be predicted
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    y_inner = known_age[:, 0]
    x_inner = known_age[:, 1:]
    rfr_inner = AbuML(x_inner, y_inner, age_df[age_df.Age.notnull()])
    rfr_inner.estimator.polynomial_regression(degree=1)
    reg_inner = rfr_inner.fit()
    predicted_ages = reg_inner.predict(unknown_age[:, 1:])
    p_df.loc[(p_df.Age.isnull()), 'Age'] = predicted_ages
    return p_df
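# A minimal usage sketch (not from the original source): assuming the raw
# training data has been loaded from ./data/titanic/train.csv as in the class
# below, the missing ages could be filled in before building the dummy and
# scaled features, e.g.:
#
#   data_train = pd.read_csv("./data/titanic/train.csv")
#   data_train = set_missing_ages(data_train)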
def sample_105_0():
    """
    10.5 AbuML
    :return:
    """
    global g_with_date_week_noise
    g_with_date_week_noise = True

    train_x, train_y_regress, train_y_classification, pig_three_feature, \
        test_x, test_y_regress, test_y_classification, kl_another_word_feature_test = sample_1031_1()

    from abupy import AbuML
    # Build an AbuML object from the x, y matrices and the feature DataFrame
    ml = AbuML(train_x, train_y_classification, pig_three_feature)
    # Use a random forest as the classifier
    _ = ml.estimator.random_forest_classifier()
    # Cross-validated accuracy score
    print('ml.cross_val_accuracy_score():\n', ml.cross_val_accuracy_score())
    # Feature selection
    print('ml.feature_selection():\n', ml.feature_selection())
def train_val(data):
    """Wrap all of the training preprocessing steps."""
    # One-hot encode the categorical features
    dummies__cabin = pd.get_dummies(data['Cabin'], prefix='Cabin')
    dummies__embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')
    dummies__sex = pd.get_dummies(data['Sex'], prefix='Sex')
    dummies__pclass = pd.get_dummies(data['Pclass'], prefix='Pclass')
    df = pd.concat([data, dummies__cabin, dummies__embarked, dummies__sex, dummies__pclass], axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

    # Standardize the numeric features
    scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = scaler.fit_transform(df['Age'].values.reshape(-1, 1))
    df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
    df['SibSp_scaled'] = scaler.fit_transform(df['SibSp'].astype(float).values.reshape(-1, 1))
    df['Parch_scaled'] = scaler.fit_transform(df['Parch'].astype(float).values.reshape(-1, 1))

    # Select the training features
    train_df = df.filter(regex='Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    train_np = train_df.values
    y = train_np[:, 0]
    x = train_np[:, 1:]
    titanic = AbuML(x, y, train_df)
    titanic.estimator.logistic_classifier()
    titanic.cross_val_accuracy_score()
def new(df):
    # Choose which features to use for training
    train_df = df.filter(
        regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    train_df.head(1)
    # Train the model with the newly added features
    train_np = train_df.values
    y = train_np[:, 0]
    x = train_np[:, 1:]
    new_titanic = AbuML(x, y, train_df)
    return new_titanic
def __init__(self):
    self.titanic = AbuML.create_test_more_fiter()
    self.data_train = pd.read_csv("./data/titanic/train.csv")
    self.df = None
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
        axis=1, inplace=True)
# Choose which features to use for training
train_df = df.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_df.head(1)

from abupy import AbuML

train_np = train_df.values
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)
titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

# Construct non-linear features
df['Child'] = (data_train['Age'] <= 10).astype(int)
df['Age*Age'] = data_train['Age'] * data_train['Age']
df['Age*Age_scaled'] = scaler.fit_transform(df['Age*Age'].values.reshape(-1, 1))
df['Age*Class'] = data_train['Age'] * data_train['Pclass']
df['Age*Class_scaled'] = scaler.fit_transform(df['Age*Class'].values.reshape(-1, 1))

# Re-run filter to include the newly added features
train_df = df.filter(
# Mean and standard deviation identities with numpy
x = np.array([1, 2, 3, 4, 5])
assert np.mean(x) == np.sum(x) / 5
assert np.std(x) == np.sqrt(np.mean((x - np.mean(x)) ** 2))

# Standardize two features by hand: subtract the mean, divide by the std
f1 = np.array([0.2, 0.5, 1.1]).reshape(-1, 1)
f2 = np.array([-100.0, 56.0, -77.0]).reshape(-1, 1)
f1_scaled = (f1 - np.mean(f1)) / np.std(f1)
f2_scaled = (f2 - np.mean(f2)) / np.std(f2)

# StandardScaler performs the same transformation
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()
f1_sk_scaled = scaler.fit_transform(f1)
f2_sk_scaled = scaler.fit_transform(f2)
assert np.allclose(f1_sk_scaled, f1_scaled) and np.allclose(f2_sk_scaled, f2_scaled)

from abupy import AbuML

# Iris classification with multinomial logistic regression
iris = AbuML.create_test_fiter()
iris.estimator.logistic_classifier(multi_class='multinomial', solver='lbfgs')
iris.cross_val_accuracy_score()
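# A small follow-up check (not in the original snippet): after standardization
# each feature has approximately zero mean and unit standard deviation, which is
# the property the manual formula and StandardScaler both produce.
assert np.allclose(np.mean(f1_scaled), 0.0) and np.allclose(np.std(f1_scaled), 1.0)
assert np.allclose(np.mean(f2_scaled), 0.0) and np.allclose(np.std(f2_scaled), 1.0)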
df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
df['SibSp_scaled'] = scaler.fit_transform(
    df['SibSp'].astype(float).values.reshape(-1, 1))
df['Parch_scaled'] = scaler.fit_transform(
    df['Parch'].astype(float).values.reshape(-1, 1))

# Select the training features
train_df = df.filter(
    regex='Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)
titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

from abupy import ABuMLGrid

# Switch to a decision tree
titanic.estimator.decision_tree_classifier(criterion='entropy')
# Grid search for the optimal tree depth
best_score_, best_params_ = ABuMLGrid.grid_search_init_kwargs(
    titanic.estimator.clf, titanic.x, titanic.y,
    param_name='max_depth', param_range=range(3, 10), show=True)
df.head(1)
# Check for missing values
df.info()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Standardize the data: fit on the training set, apply the same transform to the test set
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Train the model
columns = np.append(scikit_boston.feature_names, ['MEDV'])
df = pd.DataFrame(data=np.c_[x_train, y_train], columns=columns)
boston = AbuML(x_train, y_train, df)
boston.estimator.polynomial_regression(degree=1)
reg = boston.fit()

# Predict on the test set
y_pred = reg.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

# Add squared (degree 2) polynomial terms
boston.estimator.polynomial_regression(degree=2)
reg = boston.fit()
y_pred = reg.predict(x_test)
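# Not in the original snippet: to compare the two fits, the degree=2 score can
# be computed the same way as the degree=1 score above.
r2_score(y_test, y_pred)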
# -*- coding: utf-8 -*-
from abupy import AbuML

# Titanic survival prediction
titanic = AbuML.create_test_more_fiter()
titanic.plot_confusion_matrices()

from abupy import ABuMLExecute
from sklearn import metrics

titanic_y_pred = ABuMLExecute.run_cv_estimator(titanic.get_fiter(), titanic.x,
                                               titanic.y, n_folds=10)
confusion_matrix = metrics.confusion_matrix(titanic.y, titanic_y_pred)
TP = confusion_matrix[1, 1]
TN = confusion_matrix[0, 0]
FP = confusion_matrix[0, 1]
FN = confusion_matrix[1, 0]
print(TP, TN, FP, FN)

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
assert metrics.accuracy_score(
    titanic.y, titanic_y_pred) == (TP + TN) / float(TP + TN + FP + FN)
# Precision for the "survived" class
tit_precision = TP / float(TP + FP)
# Recall for the "survived" class
tit_recall = TP / float(TP + FN)
assert metrics.precision_score(titanic.y, titanic_y_pred) == tit_precision
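# A symmetric check (not in the original snippet): recall_score should match
# the TP / (TP + FN) value computed above.
assert metrics.recall_score(titanic.y, titanic_y_pred) == tit_recall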
ph3 = dummies_pclass.head(3)
print('ph3=', ph3)

dummies_embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
df = pd.concat([df, dummies_embarked, dummies_sex, dummies_pclass], axis=1)
# noinspection PyUnresolvedReferences
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
        axis=1, inplace=True)

# Choose which features to use for training
train_df = df.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
print(train_df.head(1))

train_np = train_df.values
y = train_np[:, 0]
x = train_np[:, 1:]

from abupy import AbuML

titanic = AbuML(x, y, train_df)
titanic.estimator.logistic_classifier()
s = titanic.cross_val_accuracy_score()
print(s)
# coding: utf-8
import pandas as pd  # pandas is Python's data wrangling library
from abupy import AbuML

# Titanic survival prediction
titanic = AbuML.create_test_more_fiter()
titanic.estimator.polynomial_regression()
scaler = preprocessing.StandardScaler()
df['Age_scaled'] = scaler.fit_transform(df['Age'].values.reshape(-1, 1))
df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
df['SibSp_scaled'] = scaler.fit_transform(
    df['SibSp'].astype(float).values.reshape(-1, 1))
df['Parch_scaled'] = scaler.fit_transform(
    df['Parch'].astype(float).values.reshape(-1, 1))

# Select the training features
train_df = df.filter(
    regex='Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)

from abupy import ABuMLGrid

# # Decision tree
# titanic.estimator.decision_tree_classifier()
# # Grid search for the optimal tree depth
# best_score_, best_params_ = ABuMLGrid.grid_search_init_kwargs(
#     titanic.estimator.clf, titanic.x, titanic.y,
#     param_name='max_depth', param_range=range(3, 10), show=True)
# titanic.estimator.decision_tree_classifier(**best_params_)
# titanic.cross_val_accuracy_score()

# Random forest
titanic.estimator.random_forest_classifier()
print('RFE selection')
print(pd.DataFrame(
    {
        'support': selector.support_,
        'ranking': selector.ranking_
    },
    index=fairy_tale_feature.columns[1:]))

feature_selection(estimator, train_x, train_y_classification)

# 3.3
from abupy import AbuML

# Build an AbuML object from the x, y matrices and the feature DataFrame
ml = AbuML(train_x, train_y_classification, fairy_tale_feature)
# Use a random forest as the classifier
_ = ml.estimator.random_forest_classifier()
# Cross-validated accuracy score
ml.cross_val_accuracy_score()
# Feature selection
ml.feature_selection()

abupy.env.g_enable_ml_feature = True
abupy.env.g_enable_train_test_split = True

# Initial capital: 2,000,000
read_cash = 2000000
# Base buy position per trade: 15/10000 (0.15%) of capital
abupy.beta.atr.g_atr_pos_base = 0.0015
# Choose which features to use for training
train_df = df.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_df.head(1)
print(train_df.head(1))

# Feed the features into the model and check the score
from abupy import AbuML

train_np = train_df.values
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)
titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

# Logistic classification is a linear model: it adds up each feature's
# contribution to the classification result.
# Non-linear feature expressions fall into two categories:
# (1) expressing the non-linearity of a numeric feature itself
# (2) expressing non-linear interactions between features, where the
#     interaction helps the classification result
# The first applies only to numeric features, and there are many ways to build
# such features, e.g. polynomial terms and discretization. Polynomial
# construction uses higher powers of the original value as features;
# discretization splits a continuous value into intervals and uses whether the
# value falls inside an interval as a feature. Higher powers make the value's
# internal expression richer and more descriptive, while discretization lets
# the model fit and approximate the true relationship.

# Split Age into intervals
df['Child'] = (data_train['Age'] <= 10).astype(int)
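# A minimal sketch (not from the original source) of the two constructions the
# comment above describes, assuming data_train still holds the raw Age column:
# pd.cut discretizes Age into interval-membership dummy features (hypothetical
# 'AgeBin' columns), and a squared term adds a polynomial feature.
age_bins = pd.cut(data_train['Age'], bins=[0, 10, 18, 40, 60, 100])
df = pd.concat([df, pd.get_dummies(age_bins, prefix='AgeBin')], axis=1)
df['Age^2'] = data_train['Age'] ** 2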