def read_file():
    """Load train.csv / test.csv and build model-ready feature matrices.

    Steps:
      1. Read both CSVs; split off labels (``Response``) and test ``Id``s.
      2. Impute missing values with -1 (sentinel, per original design).
      3. Standard-scale the numeric columns (fit on train+test combined).
      4. One-hot encode via a single DictVectorizer fit on ALL rows so the
         train and test matrices share one feature space.

    Returns:
        tuple: (train_datas, train_lables, test_ids, test_datas) where
            train_datas / test_datas are dense 2-D numpy arrays,
            train_lables is the ``Response`` column values,
            test_ids is the ``Id`` column values of test.csv.

    Fixes vs. the previous version:
      * ``StandardScaler.fit_transform`` returns a new array — the old code
        discarded it, so scaling silently did nothing; we now assign it back.
      * One shared DictVectorizer replaces two independently-fit ones, which
        produced misaligned train/test feature columns.
      * ``DataFrame.ix`` (removed from pandas) replaced with ``.loc``;
        ``to_dict(outtype='records')`` (removed keyword) replaced with
        ``to_dict('records')``.
    """
    # Columns treated as categorical (everything else is numeric).
    categorical_columns = [
        "Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5",
        "Product_Info_6", "Product_Info_7", "Employment_Info_2",
        "Employment_Info_3", "Employment_Info_5", "InsuredInfo_1",
        "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4", "InsuredInfo_5",
        "InsuredInfo_6", "InsuredInfo_7", "Insurance_History_1",
        "Insurance_History_2", "Insurance_History_3", "Insurance_History_4",
        "Insurance_History_7", "Insurance_History_8", "Insurance_History_9",
        "Family_Hist_1", "Medical_History_2", "Medical_History_3",
        "Medical_History_4", "Medical_History_5", "Medical_History_6",
        "Medical_History_7", "Medical_History_8", "Medical_History_9",
        "Medical_History_10", "Medical_History_11", "Medical_History_12",
        "Medical_History_13", "Medical_History_14", "Medical_History_16",
        "Medical_History_17", "Medical_History_18", "Medical_History_19",
        "Medical_History_20", "Medical_History_21", "Medical_History_22",
        "Medical_History_23", "Medical_History_25", "Medical_History_26",
        "Medical_History_27", "Medical_History_28", "Medical_History_29",
        "Medical_History_30", "Medical_History_31", "Medical_History_33",
        "Medical_History_34", "Medical_History_35", "Medical_History_36",
        "Medical_History_37", "Medical_History_38", "Medical_History_39",
        "Medical_History_40", "Medical_History_41",
    ]

    train_file = pd.read_csv('train.csv')
    test_file = pd.read_csv('test.csv')

    train_lables = train_file['Response'].values
    test_ids = test_file['Id'].values

    # Feature frames: drop Id/Response from train, Id from test.
    train_feats = train_file.loc[
        :, [c for c in train_file.columns if c not in (u'Id', u'Response')]
    ]
    test_feats = test_file.loc[
        :, [c for c in test_file.columns if c != u'Id']
    ]

    # Impute missing values with a -1 sentinel (original behavior).
    train_feats = train_feats.fillna(-1)
    test_feats = test_feats.fillna(-1)

    # Stack train on top of test so scaling / vectorizing see one population.
    # ignore_index avoids the duplicated-index ambiguity of the old concat.
    n_train = train_feats.shape[0]
    all_feats = pd.concat([train_feats, test_feats], axis=0, ignore_index=True)

    numeric_cols = [c for c in all_feats.columns if c not in categorical_columns]
    numeric = all_feats.loc[:, numeric_cols]
    categorical = all_feats.loc[
        :, [c for c in categorical_columns if c in all_feats.columns]
    ]

    # BUG FIX: fit_transform returns the scaled array; the old code threw it
    # away. Rewrap as a DataFrame to keep column names for the vectorizer.
    scaled = StandardScaler().fit_transform(numeric)
    numeric = pd.DataFrame(scaled, columns=numeric.columns, index=numeric.index)

    combined = pd.concat([numeric, categorical], axis=1)

    # BUG FIX: one DictVectorizer fit on all rows guarantees train and test
    # land in the same feature space (two separate fits did not).
    matrix = DictVectorizer().fit_transform(combined.to_dict('records')).toarray()

    train_datas = matrix[:n_train]
    test_datas = matrix[n_train:]
    return (train_datas, train_lables, test_ids, test_datas)
""" import pandas as pd import numpy as np from sklearn.ensemble import GradientBoostingClassifier from sklearn.feature_extraction import DictVectorizer # データを読み込む df=pd.read_csv("C:\\Users\\fukazu\\Documents\\IPython Notebooks\\deepanAlytics\\train.csv",header=None,nrows=10000) # データクリーニング # NaNが一つでも入っているrowを除く df = df[pd.notnull(df).all(1)] # 説明変数x、目的変数yに分ける x = df.loc[:, 2:] y = df[1] # カテゴリカル変数は文字列に直す x.loc[:, 4:9] = x.loc[:, 4:9].astype(str) # カテゴリカル変数を数量化 x = DictVectorizer(sparse=False).fit_transform(x.to_dict('records')) # SVCで最初の5000個を学習 clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=5, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False) clf.fit(x[:5000], y[:5000]) # 5000番目以降に対する学習スコア print clf.score(x[5000:], y[5000:])