def main():
    """Train a Gaussian naive Bayes classifier and print accuracy and AUC.

    Obtains the train/test split from ``utils.prepare_data``, prints a dataset
    overview, builds the feature matrices with
    ``utils.do_feature_engineering``, fits ``GaussianNB`` on the training
    split and prints the accuracy and ROC AUC measured on the test split.
    """
    # Prepare the dataset
    train_data, test_data = utils.prepare_data()

    # Inspect the dataset
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering: build the train/test feature matrices
    X_train, X_test = utils.do_feature_engineering(train_data, test_data)
    print('共有{}维特征。'.format(X_train.shape[1]))

    # Labels
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # Modelling and evaluation
    print('\n===================== 数据建模及验证 =====================')
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Accuracy is computed on the hard class predictions.
    y_pred = nb_model.predict(X_test)
    print('准确率:', accuracy_score(y_test, y_pred))

    # AUC needs a continuous score; hard 0/1 predictions reduce the ROC curve
    # to a single operating point. Column 1 of predict_proba is the
    # probability of nb_model.classes_[1], the score roc_auc_score expects in
    # the binary case.
    # NOTE(review): assumes binary labels, as the previous label-based AUC
    # call also did - confirm.
    y_score = nb_model.predict_proba(X_test)[:, 1]
    print('AUC值:', roc_auc_score(y_test, y_score))
def main():
    """Compare the 'kNN' and 'LR' models on the charging-pile dataset.

    Reads ``charging_pile.csv`` from ``config.dataset_path``, holds out a
    quarter of the rows for testing, transforms both splits with
    ``utils.transform_data``, runs ``utils.train_test_model`` once per model
    name, then saves the comparison table (``model_comparison.csv``) and a
    two-panel bar chart (``pred_results.png``) under ``config.output_path``
    before showing the chart.
    """
    accuracy_col = 'Accuracy (%)'
    duration_col = 'Time (s)'

    # Load the data
    csv_path = os.path.join(config.dataset_path, 'charging_pile.csv')
    raw_data = pd.read_csv(csv_path, index_col='id')

    # Split into train and test sets
    train_data, test_data = train_test_split(
        raw_data, test_size=0.25, random_state=10)

    # Inspect the data
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering
    print('\n===================== 特征工程 =====================')
    X_train, y_train = utils.transform_data(train_data)
    X_test, y_test = utils.transform_data(test_data)

    # Modelling and evaluation
    print('\n===================== 数据建模及验证 =====================')
    model_name_param_dict = {'kNN': [5, 11, 15], 'LR': [0.1, 1, 10]}

    # One row per model; filled in as each model is evaluated
    results_df = pd.DataFrame(
        columns=[accuracy_col, duration_col],
        index=list(model_name_param_dict),
    )
    results_df.index.name = 'Model'

    for name in model_name_param_dict:
        _model, accuracy, seconds = utils.train_test_model(
            X_train, y_train, X_test, y_test,
            model_name_param_dict[name], name)
        # The returned accuracy is scaled by 100 to fill the percent column
        results_df.loc[name, accuracy_col] = accuracy * 100
        results_df.loc[name, duration_col] = seconds

    table_path = os.path.join(config.output_path, 'model_comparison.csv')
    results_df.to_csv(table_path)

    # Model and result comparison
    print('\n===================== 模型及结果比较 =====================')
    plt.figure(figsize=(10, 4))
    # (column, panel title, extra plot keywords) for the left and right panel
    panels = (
        (accuracy_col, 'Accuracy(%)', {'ylim': [60, 100]}),
        (duration_col, 'Time(s)', {}),
    )
    for position, (column, title, extra) in enumerate(panels, start=1):
        axis = plt.subplot(1, 2, position)
        results_df.plot(y=[column], kind='bar', ax=axis, title=title,
                        legend=False, **extra)

    plt.tight_layout()
    figure_path = os.path.join(config.output_path, 'pred_results.png')
    plt.savefig(figure_path)
    plt.show()
def main():
    """Compare regression models and predict the rows without a target.

    Reads ``cleaned.csv`` from ``config.dataset_path`` and keeps the columns
    listed in ``config.feat_cols``. Rows with a non-missing ``y`` are split
    3:1 into train/test sets; rows with a missing ``y`` form the set to be
    predicted. Each candidate regressor is evaluated through
    ``utils.train_test_model`` and the results table is written to
    ``table.csv``. Finally a ``Ridge`` model is fitted on the training split
    and its exponentiated predictions for the unlabeled rows are written to
    ``result.csv``. Both files go to ``config.output_path``.
    """
    # Loading Data
    cleaned = pd.read_csv(os.path.join(config.dataset_path, 'cleaned.csv'))
    cleaned = cleaned[config.feat_cols]
    has_target = cleaned['y'].notna()
    raw_data = cleaned[has_target]
    # Drop the (all-NaN) target so the unlabeled rows go through the feature
    # transform with the same columns as the train/test frames below.
    valid = cleaned[~has_target].drop('y', axis=1)

    # Splitting Data
    train_data, test_data = train_test_split(raw_data, test_size=1 / 4,
                                             random_state=10)
    y_train = train_data['y'].values
    y_test = test_data['y'].values
    train_data = train_data.drop('y', axis=1)
    test_data = test_data.drop('y', axis=1)

    # Checking Data
    utils.inspect_dataset(train_data, test_data)

    # Feature Engineering: fit the transformers on the training split only and
    # reuse them for the test and unlabeled rows.
    print('\n===================== Feature Engineering =====================')
    X_train, label_encs, onehot_enc, scaler, pca = \
        utils.transform_train_data(train_data)
    X_test = utils.transform_test_data(test_data, label_encs, onehot_enc,
                                       scaler, pca)
    X_valid = utils.transform_test_data(valid, label_encs, onehot_enc,
                                        scaler, pca)

    # Testing
    print('\n===================== Modeling =====================')
    model_name_param_dict = {
        'LR': LinearRegression(),
        'Lasso': Lasso(alpha=0.01),
        'Ridge': Ridge(alpha=0.01),
        'SVM': SVR(),
        'SGD': SGDRegressor(),
    }

    # Create tables: one row per model, filled in as each one is evaluated
    results_df = pd.DataFrame(columns=['MSE', 'Time (s)'],
                              index=list(model_name_param_dict))
    results_df.index.name = 'Model'
    for model_name, model in model_name_param_dict.items():
        _, best_score, mean_duration = utils.train_test_model(
            X_train, y_train, X_test, y_test, model_name, model)
        results_df.loc[model_name, 'MSE'] = best_score
        results_df.loc[model_name, 'Time (s)'] = mean_duration
    results_df.to_csv(os.path.join(config.output_path, 'table.csv'))

    # Final model used for the submission (a Ridge regressor).
    final_model = Ridge(alpha=0.01)
    final_model.fit(X_train, y_train)
    # NOTE(review): exp() presumably undoes a log transform applied to ``y``
    # upstream - confirm against the data-cleaning step.
    predictions = np.exp(final_model.predict(X_valid))
    result = pd.DataFrame(predictions)
    # Written next to table.csv instead of a machine-specific absolute path.
    result.to_csv(os.path.join(config.output_path, 'result.csv'))
def main():
    """Compare several classifiers on the German credit dataset.

    Reads ``german_credit_data.csv`` from ``config.dataset_path``, cleans it
    with ``utils.clean_data``, holds out a quarter of the rows for testing and
    transforms both splits. Every configured model is passed with its
    parameter grid to ``utils.train_test_model``; the second and third
    returned values are stored as the 'AUC' and 'Time (s)' columns. The table
    is saved as ``model_comparison.csv`` and a two-panel bar chart as
    ``pred_results.png`` under ``config.output_path``; the chart is then shown.
    """
    score_col = 'AUC'
    duration_col = 'Time (s)'

    # Load the data
    csv_path = os.path.join(config.dataset_path, 'german_credit_data.csv')
    raw_data = pd.read_csv(csv_path)

    # Clean the data
    cln_data = utils.clean_data(raw_data)

    # Split into train and test sets
    train_data, test_data = train_test_split(
        cln_data, test_size=0.25, random_state=10)
    y_train = train_data['Label'].values
    y_test = test_data['Label'].values

    # Inspect the data
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering: transformers are obtained from the training split
    # and reused for the test split.
    print('\n===================== 特征工程 =====================')
    train_outputs = utils.transform_train_data(train_data)
    X_train, label_encs, onehot_enc, scaler, pca = train_outputs
    X_test = utils.transform_test_data(
        test_data, label_encs, onehot_enc, scaler, pca)

    # Modelling and evaluation
    print('\n===================== 数据建模及验证 =====================')
    base_learners = [KNeighborsClassifier(), SVC(), DecisionTreeClassifier()]
    sclf = StackingClassifier(classifiers=base_learners,
                              meta_classifier=LogisticRegression())
    stacking_grid = {
        'kneighborsclassifier__n_neighbors': [5, 25, 55],
        'svc__C': [0.01, 1, 100],
        'decisiontreeclassifier__max_depth': [50, 100, 150],
        'meta-logisticregression__C': [0.01, 1, 100],
    }

    # model name -> (estimator, parameter grid); insertion order is the
    # evaluation order and the row order of the results table.
    model_name_param_dict = {
        'kNN': (KNeighborsClassifier(), {'n_neighbors': [5, 25, 55]}),
        'LR': (LogisticRegression(), {'C': [0.01, 1, 100]}),
        'SVM': (SVC(probability=True), {'C': [0.01, 1, 100]}),
        'DT': (DecisionTreeClassifier(), {'max_depth': [50, 100, 150]}),
        'Stacking': (sclf, stacking_grid),
        'AdaBoost': (AdaBoostClassifier(),
                     {'n_estimators': [50, 100, 150, 200]}),
        'GBDT': (GradientBoostingClassifier(),
                 {'learning_rate': [0.01, 0.1, 1, 10, 100]}),
        'RF': (RandomForestClassifier(),
               {'n_estimators': [100, 150, 200, 250]}),
    }

    # One row per model; filled in as each model is evaluated
    results_df = pd.DataFrame(
        columns=[score_col, duration_col],
        index=list(model_name_param_dict),
    )
    results_df.index.name = 'Model'

    for name, spec in model_name_param_dict.items():
        estimator, grid = spec
        _model, score, seconds = utils.train_test_model(
            X_train, y_train, X_test, y_test, name, estimator, grid)
        results_df.loc[name, score_col] = score
        results_df.loc[name, duration_col] = seconds

    table_path = os.path.join(config.output_path, 'model_comparison.csv')
    results_df.to_csv(table_path)

    # Model and result comparison
    print('\n===================== 模型及结果比较 =====================')
    plt.figure(figsize=(10, 4))
    # (column, panel title, extra plot keywords) for the left and right panel
    panels = (
        (score_col, 'AUC', {'ylim': [0, 1]}),
        (duration_col, 'Time(s)', {}),
    )
    for position, (column, title, extra) in enumerate(panels, start=1):
        axis = plt.subplot(1, 2, position)
        results_df.plot(y=[column], kind='bar', ax=axis, title=title,
                        legend=False, **extra)

    plt.tight_layout()
    figure_path = os.path.join(config.output_path, 'pred_results.png')
    plt.savefig(figure_path)
    plt.show()