"""Oversampling prep: select minority ("less") blood-glucose samples.

Loads the cleaned training set, appends the regression target as a column,
and filters the under-represented high-glucose band whose k nearest
neighbours will be used to synthesise new samples (SMOTE-style).
"""
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from random import choice
import data_helper
import time

trainPath = "../tmp/train_all.csv"
localTestPath = "../tmp/localtest.csv"
onlineTestPath = "../data/d_train_20180102.csv"

dataset = data_helper.dataset("../tmp/train_all.csv", "../tmp/localtest.csv", trainable=1)
dataset.trans_datetime2weather()
dataset.category_sex()
dataset.fillna_outliermean()

X_train, y_train = dataset.train, dataset.train_label
# attach the target so it can be filtered on directly
X_train['血糖'] = y_train

# BUG FIX: the original combined the bounds with `|`, which is true for every
# row (any value is either > 6.1 or < 22), so the "minority" set was the whole
# dataset. The intent is the band 6.1 < 血糖 < 22, i.e. `&`.
less_data = X_train[(X_train['血糖'] > 6.1) & (X_train['血糖'] < 22)]
less_data = less_data.reset_index(drop=True)
index_new = less_data.columns

# mean_fill = numerical_data[numerical_data < outlier_baseline].mean()
# less_data = np.array(
#     data[data.iloc[:, tag_index] == np.array(case_state[case_state == min(case_state)].index)[0]])
# more_data = np.array(
#     data[data.iloc[:, tag_index] == np.array(case_state[case_state == max(case_state)].index)[0]])
# Next step: find the k nearest neighbours of each minority sample.
"""Random-forest regression baseline with 5-fold cross-validation."""
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.externals import joblib  # NOTE(review): deprecated path; plain `import joblib` on sklearn >= 0.23
import data_helper

# FIX: `cross_val_score` was imported twice, and `sklearn.cross_validation`
# was removed from sklearn — other scripts in this project already use
# `sklearn.model_selection`, so migrate for consistency.

dataset = data_helper.dataset("../tmp/train.csv", "../tmp/localtest.csv")
dataset.trans_datetime2weather()
dataset.fill_nan()
dataset.generate_arithmetic()
dataset.category_sex()

model = RandomForestRegressor(
    n_estimators=1000,
    criterion='mse',
    max_depth=6,
    max_features=0.8,
    min_samples_leaf=8,  # min_samples_leaf: 5~10
    n_jobs=12,
    random_state=777)

X_train, y_train = dataset.train.values, dataset.train_label.values
X_test, y_test = dataset.test.values, dataset.test_label.values

# FIX: the 'mean_squared_error' scorer string was removed; the modern name is
# 'neg_mean_squared_error' (scores are negated MSE — higher is better).
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
# TF1 training setup: Adam on `loss`, streaming Pearson correlation as the
# evaluation metric, TensorBoard summaries, then the main training loop.
# NOTE(review): relies on `tf`, `sess`, `loss`, `y_conv`, `y_`, `x1`, `x2`,
# `keep_prob`, `s1_train`, `s2_train`, `label_train` defined earlier in the
# file (not visible in this chunk). Python 2 print statements.
with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
with tf.name_scope('pearson'):
    # streaming metric keeps local counters, hence local_variables_initializer
    _, pearson = tf.contrib.metrics.streaming_pearson_correlation(y_conv, y_)
    sess.run(tf.local_variables_initializer())
tf.summary.scalar('pearson', pearson)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter("logs/", sess.graph)
test_writer = tf.summary.FileWriter("logs/")
sess.run(tf.global_variables_initializer())
print "start import data"
STS_train = data_helper.dataset(s1=s1_train, s2=s2_train, label=label_train)
print "初始化完毕,开始训练"
for i in range(40000):
    batch_train = STS_train.next_batch(50)
    # train the model (dropout keep-probability 0.5 during training)
    train_step.run(feed_dict={x1: batch_train[0], x2: batch_train[1],
                              y_: batch_train[2], keep_prob: 0.5})
    # record metrics every 100 steps (keep_prob 1.0 when evaluating)
    if i % 100 == 0:
        train_result = sess.run(merged, feed_dict={
            x1: batch_train[0], x2: batch_train[1],
            y_: batch_train[2], keep_prob: 1.0})
        train_writer.add_summary(train_result, i)
        train_pearson = pearson.eval(feed_dict={
            x1: batch_train[0], x2: batch_train[1],
            y_: batch_train[2], keep_prob: 1.0})
        train_loss = loss.eval(feed_dict={
            x1: batch_train[0], x2: batch_train[1],
            y_: batch_train[2], keep_prob: 1.0})
##step, summaries, loss, pearson = sess.run( #[global_step, dev_summary_op, cnn.loss, cnn.pearson], #feed_dict) step, summaries, loss = sess.run( [global_step, dev_summary_op, rnn.loss], feed_dict) time_str = datetime.datetime.now().isoformat() #print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, pearson)) print("{}: step {}, dev loss {:g}".format(time_str, step, loss)) if writer: writer.add_summary(summaries, step) ## Generate batches STS_train = data_helper.dataset(s1 = s1_train, s2=s2_train, label= y_train,\ seqlen1 = seqlen1_train,seqlen2 = seqlen2_train ) # Training loop. For each batch... for i in range(40000): #print "this is the key round" print "batch =",i ## next_batch needs modify fo rnn. batch_train = STS_train.next_batch(FLAGS.batch_size) ###NOTICE. ### HERE we should run the "one step" of the train. #print "batch_train[0] = ", batch_train[0] #print "batch_train[1] = ", batch_train[1] #print "batch_train[2] = ", batch_train[2] #print "batch_train[3] = ", batch_train[3]
# XGBoost binary classifier over the blood-glucose data: the regression
# target is binarised at `value` (presumably by `translabelbelow` — confirm
# in data_helper), then trained with an imbalance-corrected logistic model.
# NOTE(review): the `params` dict continues past this chunk boundary.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import data_helper
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb

dataset = data_helper.dataset("../tmp/train.csv", "../tmp/onlinetest.csv", test=1)
dataset.trans_datetime2weather()
#dataset.del_outlier()
dataset.fill_nan()
dataset.generate_arithmetic()
dataset.category_sex()

# threshold used to turn the continuous target into a 0/1 label
value = 10
dataset.translabelbelow(value=value)

X_train, y_train = dataset.train.values, dataset.train_label.values
X_test = dataset.test.values
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    # re-weight positives by the negative/positive count ratio (class imbalance)
    'scale_pos_weight': float(len(X_train) - sum(y_train)) / sum(y_train),
    'eval_metric': 'auc',
    'max_depth': 6,
"""Per-panel model combination: split the dataset into clinical feature
groups (liver, blood fat, urea, hepatitis, blood norm), one model each."""
import xgboost as xgb
import pandas as pd
import numpy as np
import data_helper
import operator
import matplotlib.pyplot as plt

# Model file locations — one saved model per clinical panel.
modelpath = "../model/combine/"
liver_modelpath = modelpath + "liver.model"
bloodfat_modelpath = modelpath + "bloodfat.model"
urea_modelpath = modelpath + "urea.model"
hepatitis_modelpath = modelpath + "hepatitis.model"
bloodnorm_modelpath = modelpath + "bloodnorm.model"

# Load and preprocess the full training set plus the online test set.
dataset = data_helper.dataset("../tmp/train_all.csv", "../tmp/onlinetest.csv", trainable=0)
dataset.trans_datetime2weather()
dataset.del_outlier()
dataset.category_sex()

# Split out each feature panel as (train, train_label, test, test_label).
liver_train, liver_train_label, liver_test, liver_test_label = dataset.liver_columns()
bloodfat_train, bloodfat_train_label, bloodfat_test, bloodfat_test_label = dataset.bloodfat_columns()
urea_train, urea_train_label, urea_test, urea_test_label = dataset.urea_columns()
hepatitis_train, hepatitis_train_label, hepatitis_test, hepatitis_test_label = dataset.hepatitis()
bloodnorm_train, bloodnorm_train_label, bloodnorm_test, bloodnorm_test_label = dataset.bloodnorm()
"""Feature engineering for the online test set: join daily weather
(high/low temperature) by exam date and bucket age into 5 groups, then
write the augmented CSV."""
import pandas as pd
import data_helper
import matplotlib.pyplot as plt

train_data = data_helper.dataset("../tmp/onlinetest.csv")
data = train_data.data
data = data.reset_index(drop=True)
data["体检日期"] = pd.to_datetime(data["体检日期"])
# data = data.sort_values(by='体检日期', ascending=True)
num_data = len(data)

weatherpath = "../tmp/weather.csv"
weatherdata = pd.read_csv(weatherpath, index_col='日期', parse_dates=True)
# FIX: DataFrame.ix was deprecated and removed from pandas; use label-based
# .loc with the column selected in the same call (same values, one lookup).
data.insert(1, 'high_temperature', weatherdata.loc[data['体检日期'], '最高'].tolist())
data.insert(1, 'low_temperature', weatherdata.loc[data['体检日期'], '最低'].tolist())

# Age buckets: [0,7) [7,18) [18,41) [41,66) [66,100) -> labels 0..4
bins = [0, 7, 18, 41, 66, 100]
group_periods = [0, 1, 2, 3, 4]
cats = pd.cut(data.年龄, bins, right=False, labels=group_periods)
# FIX: Series.rename is not in-place — the original discarded the result.
cats = cats.rename('age_type')
data.insert(1, 'age_type', cats.tolist())
# print(data)
data.to_csv("../tmp/addFeature1_onlinetest.csv", index_label='id')
exit()

# NOTE(review): unreachable — everything below exit() is dead code kept from
# an earlier experiment.
data = data[data.columns.drop(
    ["性别", "age_type", "low_temperature", "high_temperature", '年龄', "体检日期"])]
"""Degree-2 polynomial regression baseline; reports half-MSE on train and
dev splits (error = sum of squared residuals / (2 * n))."""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import data_helper
# import dataAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

train_data = data_helper.dataset("../tmp/train.csv")
dev_data = data_helper.dataset("../tmp/dev.csv")
test_data = data_helper.dataset("../tmp/localtest.csv")

# y_train_temp = train_data.label
X_train, y_train = train_data.feature, train_data.label
X_dev, y_dev = dev_data.feature, dev_data.label
X_test, y_test = test_data.feature, test_data.label
num_data = train_data.example_nums

poly_reg = PolynomialFeatures(degree=2)  # degree of the polynomial expansion
X_poly = poly_reg.fit_transform(X_train)  # fit ONCE, on training data only
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y_train)
print("finish!")

# FIX: the original called fit_transform at prediction time, re-fitting the
# expander on dev data. Use the already-computed training matrix and
# `transform` for dev so the fitted expansion is applied consistently.
train_predict = lin_reg_2.predict(X_poly)
train_mseError = sum((train_predict - y_train)**2) / (2 * len(X_train))
dev_predict = lin_reg_2.predict(poly_reg.transform(X_dev))
dev_mseError = sum((dev_predict - y_dev)**2) / (2 * len(X_dev))