def svm_model(self, data_kind='train_data'):
    self.data_kind = data_kind
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind)
    df = pd.read_csv(data_file, encoding='utf-8', index_col=0, low_memory=False)
    print(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    print('when training, train data number:', len(y_train))
    print('when training, test data number:', len(y_test))
    print('training model:', self.model_kind)
    raw_model = SVR()
    parameters = [{
        'C': [3, 7, 9, 15, 19],
        'gamma': [0.001, 1, 10],
        'kernel': ['rbf'],
    }]
    gsc = GridSearchCV(raw_model, parameters, cv=3)
    gsc.fit(x_train, y_train)
    print('svm best parameters:', gsc.best_params_)
    print(self.model_path)
    joblib.dump(gsc, self.model_path + self.model_name)
    # Predict with the tuned search object: with refit=True (the default)
    # it has already refit the best parameters on x_train.
    pred = gsc.predict(x_test)
    self.save_result_dataframe(y_test, pred)
    self.set_var(true_v=y_test, pred_v=pred)
    self.show_save_figure(detal_idx=4)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(true_mean=t_mean, pred_mean=p_mean,
                     train_n=len(x_train), test_n=len(x_test))
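# Hedged standalone sketch (toy data, not part of this class): with
# refit=True (scikit-learn's default) GridSearchCV retrains the best
# parameter combination on the whole training split, so gsc.predict
# delegates to the tuned gsc.best_estimator_ and no second fit of the raw
# SVR is needed.
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=0.1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, train_size=0.8)
demo_gsc = GridSearchCV(SVR(), [{'C': [1, 10], 'kernel': ['rbf']}], cv=3)
demo_gsc.fit(X_tr, y_tr)
print(demo_gsc.best_params_)
demo_pred = demo_gsc.predict(X_te)  # same as demo_gsc.best_estimator_.predict(X_te)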
def get_data(self):
    train_data = get_train_data_path(fc=self.fc, fj=self.fj, model_kind=self.model_kind)
    df = pd.read_csv(train_data, encoding='utf-8', index_col=0)
    data_set = df.iloc[:, :].values
    x_dataset = data_set[:, :-1]
    y_dataset = data_set[:, -1]
    self.x_train_mean = None
    self.x_train_std = None
    normalized_data = data_normalized(x_dataset)
    x_train, x_test, y_train, y_test = train_test_split(normalized_data, y_dataset, train_size=0.6)
    print(x_train[0], len(x_train))
    print(y_train[0], len(y_train))
    if self.data_kind == 'train':
        self.train_x_batch = []
        self.train_y_batch = []
        self.batch_index = []
        for i in range(len(x_train) - self.time_step - 1):
            if i % self.batch_size == 0:
                self.batch_index.append(i)
            x = x_train[i:i + self.time_step]
            y = y_train[i:i + self.time_step]
            self.train_x_batch.append(x.tolist())
            self.train_y_batch.append(
                np.reshape(y, newshape=(self.time_step, self.output_size)))
        self.test_x_batch = []
        self.test_y_batch = []
        self.test_batch_index = []
        # Windows over the test split.
        for i in range(len(x_test) - self.time_step - 1):
            if i % self.batch_size == 0:
                self.test_batch_index.append(i)
            x = x_test[i:i + self.time_step]
            y = y_test[i:i + self.time_step]
            self.test_x_batch.append(x.tolist())
            self.test_y_batch.append(
                np.reshape(y, newshape=(self.time_step, self.output_size)))
    elif self.data_kind == 'fault_test':
        fault_test_data = get_test_data_path()
        fault_df = pd.read_csv(fault_test_data, encoding='utf-8', index_col=0)
        fault_data_set = fault_df.iloc[:, :].values
        fault_x_dataset = fault_data_set[:, :-1]
        fault_y_dataset = fault_data_set[:, -1]
        fault_normalized_data = data_normalized(fault_x_dataset)
        print(len(fault_normalized_data), len(fault_y_dataset))
        self.train_x_batch = []
        self.train_y_batch = []
        self.batch_index = []
        for i in range(len(fault_normalized_data) - self.time_step - 1):
            if i % self.batch_size == 0:
                self.batch_index.append(i)
            x = fault_normalized_data[i:i + self.time_step]
            y = fault_y_dataset[i:i + self.time_step]
            self.train_x_batch.append(x.tolist())
            self.train_y_batch.append(
                np.reshape(y, newshape=(self.time_step, self.output_size)))
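# Hedged illustration (standalone, hypothetical names): how the loops above
# carve overlapping time_step windows out of a sequence and record the start
# index of every batch_size-th window.
import numpy as np

seq = np.arange(12)
time_step, batch_size = 3, 4
windows = [seq[i:i + time_step] for i in range(len(seq) - time_step - 1)]
batch_index = [i for i in range(len(seq) - time_step - 1) if i % batch_size == 0]
print(windows[:3])  # [array([0, 1, 2]), array([1, 2, 3]), array([2, 3, 4])]
print(batch_index)  # [0, 4]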
def train_lightgbm_model(self, data_kind='train'):
    self.data_kind = data_kind
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind, self.params_kind)
    df = pd.read_csv(data_file, index_col=0, encoding='utf-8', low_memory=False)
    # print(df.columns)
    print('df shape:', df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    print(len(x_train), len(x_test))
    print('training lightgbm model')
    model = LGBMRegressor(boosting_type='gbdt',
                          num_leaves=self.num_leaves,
                          n_estimators=self.n_estimator,
                          max_depth=self.max_depth)
    # best_score_ is only populated when fit() receives an eval_set, and
    # best_iteration_ additionally requires early stopping.
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)])
    print(model.feature_importances_)
    print(model.best_score_)
    print(model.best_iteration_)
    joblib.dump(model, self.model_path + self.model_name)
    pred = model.predict(x_test)
    self.set_var(true_v=y_test, pred_v=pred)
    self.show_save_figure(detal_idx=8)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(true_mean=t_mean, pred_mean=p_mean,
                     train_n=len(x_train), test_n=len(x_test))
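# Hedged standalone sketch (toy data; parameter values are illustrative):
# pairing eval_set with the early_stopping callback is what makes
# best_score_ and best_iteration_ meaningful on an LGBMRegressor. The
# callback form assumes lightgbm >= 3.3; older versions pass
# early_stopping_rounds=10 to fit() instead.
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=500, n_features=8, noise=0.2)
X_tr, X_val, y_tr, y_val = train_test_split(X_demo, y_demo, train_size=0.8)
demo_model = LGBMRegressor(n_estimators=200)
demo_model.fit(X_tr, y_tr,
               eval_set=[(X_val, y_val)],
               callbacks=[lgb.early_stopping(stopping_rounds=10)])
print(demo_model.best_score_, demo_model.best_iteration_)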
def get_data(self):
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind)
    df = pd.read_csv(data_file, encoding='utf-8', index_col=0, low_memory=False)
    print(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
        x, y, train_size=0.6)
def xgboostmodel(self, data_kind='train_data'):
    self.data_kind = data_kind
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind, self.params_kind)
    df = pd.read_csv(data_file, encoding='utf-8', index_col=0, low_memory=False)
    logger.info(df.columns)
    logger.info(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    logger.info('when training, train data number:{}'.format(len(y_train)))
    logger.info('when training, test data number:{}'.format(len(y_test)))
    logger.info('training model:{}'.format(self.model_kind))
    # params = {'booster': 'gbtree', 'objective': 'reg:squarederror',
    #           'eval_metric': 'rmse', 'seed': 0, 'n_jobs': 10,
    #           'max_depth': self.max_depth, 'n_estimators': self.n_estimator,
    #           'min_child_weight': self.min_child_weight,
    #           'verbosity': 1, 'learning_rate': 0.05}
    raw_model = xgb.XGBRegressor(max_depth=self.max_depth,
                                 n_estimators=self.n_estimator,
                                 learning_rate=0.02,
                                 min_child_weight=self.min_child_weight,
                                 tree_method='gpu_hist')
    # raw_model = xgb.XGBRegressor(**params)
    raw_model.fit(x_train, y_train)
    logger.info(self.model_path)
    raw_model.save_model(self.model_path + self.model_file_name)
    pred = raw_model.predict(x_test)
    plot_importance(raw_model)
    if not os.path.exists(self.feature_importance_path):
        os.makedirs(self.feature_importance_path)
    plt.savefig(self.feature_importance_path + self.fj_model_kind + '_feature_importance')
    plt.show()
    plt.close()
    self.save_result_dataframe(y_test, pred)
    self.set_var(true_v=y_test, pred_v=pred)
    self.show_save_figure(detal_idx=4)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(true_mean=t_mean, pred_mean=p_mean,
                     train_n=len(x_train), test_n=len(x_test))
def get_data(self):
    train_data = get_train_data_path()
    df = pd.read_csv(train_data, encoding='utf-8', index_col=0)
    data_set = df.iloc[:, :].values
    x_train = data_set[:, :-1]
    y_train = data_set[:, -1]
    # Standardize each feature column with its own mean and std.
    normalized_data = (x_train - np.mean(x_train, axis=0)) / np.std(x_train, axis=0)
    tmp_x_batch = []
    tmp_y_batch = []
    for i in range(len(data_set) - self.time_step - 1):
        x = normalized_data[i:i + self.time_step]
        y = y_train[i + self.time_step - 1]
        tmp_x_batch.append(x)
        tmp_y_batch.append(y)
    return tmp_x_batch, tmp_y_batch
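# Hedged illustration (toy array): without axis=0, np.mean/np.std collapse
# the whole matrix to a single scalar, so columns on different scales stay
# skewed; axis=0 standardizes each feature independently.
import numpy as np

demo = np.array([[1.0, 100.0], [2.0, 200.0], [3.0, 300.0]])
global_norm = (demo - np.mean(demo)) / np.std(demo)                  # one scalar mean/std
per_feature = (demo - np.mean(demo, axis=0)) / np.std(demo, axis=0)  # column-wise
print(per_feature)  # each column now has mean 0 and std 1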
def feature_data_filter(self):
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind,
                                    params_kind=self.params_kind)
    df = pd.read_csv(data_file, encoding='utf-8', index_col=0)
    print(df.columns)
    for f in self.params:
        df = self.recursive_filter_abnormal_data(df, f)
    if os.path.exists(self.mergedData_filtered_path + self.filtered_data_file):
        print('remove file:' + self.mergedData_filtered_path + self.filtered_data_file)
        os.remove(self.mergedData_filtered_path + self.filtered_data_file)
    df.to_csv(self.mergedData_filtered_path + self.filtered_data_file, encoding='utf-8')
def __init__(self, fc=None, fj=None, model_kind=None, params_kind=None):
    # self.dataFile = data_path + os.listdir(data_path)[0]
    self.dataFile = get_train_data_path(fc, fj, model_kind, params_kind)
    self.df = pd.read_csv(self.dataFile, encoding='utf-8', index_col=0)  # use the first column as the index
    result_path = cur_path + '/result/'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    self.single_result_path = result_path + 'result_' + fc + '_' + fj + '_' + model_kind + '/'
    if not os.path.exists(self.single_result_path):
        os.makedirs(self.single_result_path)
    self.figure_path = self.single_result_path + '_' + params_kind + '_' + hour_minute + '/'
    if not os.path.exists(self.figure_path):
        os.makedirs(self.figure_path)
    self.model_number = str.split(model_kind, '_')[-1]
    self.params = ParamsDict().model_params[params_kind][model_kind]
def adaboostmodel(self, data_kind='train'):
    self.data_kind = data_kind
    data_file = get_train_data_path(self.fc, self.fj, self.model_kind)
    df = pd.read_csv(data_file, encoding='utf-8', index_col=0, low_memory=False)
    print(df.shape)
    traindata = df.iloc[:, :].values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    print('when training, train data number:', len(y_train))
    print('when training, test data number:', len(y_test))
    print('training model:', self.model_kind)
    raw_model = AdaBoostRegressor(
        base_estimator=DecisionTreeRegressor(max_features=None,
                                             max_depth=self.max_depth,
                                             min_samples_split=20,
                                             min_samples_leaf=10,
                                             min_weight_fraction_leaf=0,
                                             max_leaf_nodes=None),
        learning_rate=0.01,
        loss='square',
        n_estimators=self.n_estimator)
    raw_model.fit(x_train, y_train)
    print(self.model_path)
    joblib.dump(raw_model, self.model_path + self.model_file_name)
    pred = raw_model.predict(x_test)
    self.save_result_dataframe(y_test, pred)
    self.set_var(true_v=y_test, pred_v=pred)
    self.show_save_figure(detal_idx=4)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(true_mean=t_mean, pred_mean=p_mean,
                     train_n=len(x_train), test_n=len(x_test))
def __init__(self, job_name=None, fc=None, fj=None, model_kind=None):
    super().__init__()
    self.columns = ['pitch_Atech_hub_temp_1', 'pitch_Atech_cabinet_temp_1',
                    'pitch_position_1', 'wind_speed', 'rotor_speed',
                    'pitch_Atech_capacitor_temp_1']
    self.job_name = job_name + '_' + fc + '_' + fj
    self.model_folder_name = fj + '_' + model_kind
    self.model_name = fj
    self.fc = fc
    self.fj = fj
    self.model_kind = model_kind
    self.cur_path = cur_path
    self.init_param()
    self.fj_model_kind = fj + '_' + model_kind
    self.field_default = [0, 0, 0, 0, 0, 0]
    self.train_file_path = get_train_data_path(fc=fc, fj=fj, model_kind=model_kind)
    print(self.train_file_path)
    self.batch_size = 100
import os

import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from model.get_data_path import get_train_data_path, get_test_data_path
from util.show_save_result import ShowAndSave

cur_path = os.path.abspath(os.path.dirname(__file__))
datafile = get_train_data_path()


class AdaboostModel(ShowAndSave):
    def __init__(self, params=None, jobname='adbmodel'):
        super().__init__()
        self.job_name = jobname
        self.cur_path = cur_path
        self.init_param()
        self.params = params

    def adaboostmodel(self):
        df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
        traindata = df.iloc[:, :].values
        x = traindata[:, :-1]
        y = traindata[:, -1]
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)
        raw_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
            max_features=None,
            max_depth=None,
            min_samples_split=20,
            # The snippet breaks off here; the remaining arguments are an
            # assumed completion mirroring the fuller adaboostmodel variant above.
            min_samples_leaf=10))
        raw_model.fit(x_train, y_train)
from model.get_data_path import get_train_data_path
import numpy as np
import tensorflow as tf
import pandas as pd
from util.data_normalized import data_normalized

HIDDEN_SIZE = 500
NUM_LAYERS = 2
TIMESTEPS = 20
TRAINING_STEPS = 10000
BATCH_SIZE = 1000
INPUT_SIZE = 9
LEARNING_RATE_BASE = 0.01
LEARNING_RATE_DECAY = 0.9
NUM_EXAMPLES = 50000

datafile = get_train_data_path('wfzc', 'A2', 'cap_temp_1', 'model_params_v3')
df = pd.read_csv(datafile, index_col=0)
data = df.iloc[:, :].values

MODEL_SAVE_PATH = "model_saved/"
MODEL_NAME = "model.ckpt"
LSTM_KEEP_PROB = 0.9


def lstm_model(x, dropout_keep_prob):
    lstm_cells = [
        tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE),
            output_keep_prob=dropout_keep_prob)
        for _ in range(NUM_LAYERS)
    ]
    # Assumed continuation (the snippet is cut off after the cell list):
    # stack the layers, run the RNN, and regress the last output to one value.
    cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)
    outputs, _ = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
    last_output = outputs[:, -1, :]
    predictions = tf.layers.dense(last_output, 1, activation=None)
    return predictions
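# Hedged usage sketch (TF1-style placeholders; shapes taken from the
# constants above, and it assumes the reconstructed lstm_model tail):
x_ph = tf.placeholder(tf.float32, [None, TIMESTEPS, INPUT_SIZE])
keep_prob_ph = tf.placeholder(tf.float32)
pred_op = lstm_model(x_ph, keep_prob_ph)  # shape [None, 1]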
import pandas as pd
import numpy as np
from model.get_data_path import get_train_data_path


def get_new_mergeddata(filename):
    df = pd.read_csv(filename, encoding='utf-8', index_col=0)
    data_set = df.iloc[:, :].values
    x_train = data_set[:, :-1]
    y_train = data_set[:, -1]
    # tmp_data_set = []
    # for i in range(len(data_set)):
    #     if i % 2 == 0:
    #         tmp_data_set.append(data_set[i])
    # print(np.shape(np.array(tmp_data_set)))
    tmp_x_batch = []
    tmp_y_batch = []
    print(x_train[:3])
    print(y_train[:3])


filename = get_train_data_path()
get_new_mergeddata(filename)