save_loss = args.save_loss
save_prediction = args.save_prediction

# prepare the data
time_horizon = args.horizon
if args.action == 'train':
    comparison = None
    n = 0
    # read the data from the 4E or NExT database
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        args.source, args.data_configure_file, "2003-11-12")

    # generate the list of date ranges used to roll over the half-year windows
    today = args.date
    length = 5
    if gn.even_version(args.version) and time_horizon > 5:
        length = 4
    start_time, end_time = gn.get_relevant_dates(today, length, "tune")
    split_dates = gn.rolling_half_year(start_time, end_time, length)
    importance_list = []

    # generate the version
    version_params = generate_version_params(args.version)

    for s, split_date in enumerate(split_dates):
        # fix all random seeds for reproducibility
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)
        lag = args.lag
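# Note on the split format (an assumption inferred from how split_date is indexed
# in the tune() methods below, not guaranteed by the source): each element of
# gn.rolling_half_year(start_time, end_time, length) is treated as a list of dates
# where split_date[0] opens the training window, split_date[1] and split_date[2]
# bound validation/testing, and split_date[-1] closes the window.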
def train(self, split=0.9, num_epochs=50, drop_out=0.0, drop_out_mc=0.0, repeat_mc=10,
          embedding_size=5, batch_size=512, hidden_state=50, lrate=0.001,
          attention_size=2, interval=1, lambd=0, save_loss=0, save_prediction=0,
          method=""):
    """
    split: fraction of the training window used for fitting (the rest is validation)
    drop_out: dropout rate of the LSTM network
    drop_out_mc: dropout rate used for Monte Carlo dropout at inference
    repeat_mc: number of stochastic forward passes for Monte Carlo dropout
    embedding_size: size of the metal-id embedding layer
    batch_size: mini-batch size
    hidden_state: number of hidden states of the encoder/decoder
    lrate: learning rate
    attention_size: number of heads in the multi-head attention mechanism
    interval: save the model every `interval` epochs
    lambd: weight of the classification loss
    save_loss: whether to save loss results
    save_prediction: whether to save prediction results
    """
    sys.path[0] = os.curdir
    print("begin to train")

    # assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)

    # retrieve the column list based on the configuration path
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    # begin to split the training data
    for date in self.date.split(","):
        # fix all random seeds for reproducibility
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)

        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(today, length, "train")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)

        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))

        # normalization and technical-indicator parameters
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        norm_params = {'vol_norm': norm_volume, 'ex_spread_norm': norm_ex,
                       'spot_spread_norm': norm_3m_spread, 'len_ma': len_ma,
                       'len_update': len_update, 'both': 3, 'strength': 0.01,
                       'xgboost': False}
        tech_params = {'strength': 0.01, 'both': 3, 'Win_VSD': [10, 20, 30, 40, 50, 60],
                       'Win_EMA': 12, 'Win_Bollinger': 22, 'Fast': 12, 'Slow': 26,
                       'Win_NATR': 10, 'Win_VBM': 22, 'acc_initial': 0.02,
                       'acc_maximum': 0.2, "live": None}

        # holders for versions that tune over all 6 metals
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []

        i = 0
        # toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                              "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            print(ground_truth)
            new_time_series = copy(time_series)
            ts = new_time_series.loc[start_time:split_dates[2]]

            # load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(
                ts, LME_dates, self.horizon, [ground_truth], self.lag,
                copy(split_dates), version_params, metal_id_bool=metal_id,
                reshape=False)

            # chronological split into training and validation
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split), 0]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):, 0]
            X_te = X_va
            y_te = y_va[:, 0]

            # generate metal id for embedding lookup
            train_X_id_embedding = [i] * len(X_ta)
            val_X_id_embedding = [i] * len(X_val)
            test_X_id_embedding = [i] * len(X_te)

            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)

            # rank test labels to extract the top and bottom thirds
            y_te_rank = np.argsort(y_te)
            y_te_class = list(y_te)
            final_y_te_class_list.append(y_te_class)
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append(y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append(y_te_class[y_te_rank[-split_position:]])

            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)

            final_train_X_embedding += train_X_id_embedding
            final_test_X_embedding += test_X_id_embedding
            final_val_X_embedding += val_X_id_embedding

            # update metal index
            i += 1

        print('Dataset statistic: #examples')
        print('Train:', len(final_X_tr), len(final_y_tr), len(final_train_X_embedding))
        print(np.max(final_X_tr), np.min(final_X_tr), np.max(final_y_tr), np.min(final_y_tr))
        print('Validation:', len(final_X_val), len(final_y_val), len(final_val_X_embedding))
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))

        # begin to train the model
        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)

        start = time.time()
        trainer = Trainer(input_dim, hidden_state, window_size, lrate, drop_out,
                          case_number, attention_size, embedding_size, drop_out_mc,
                          repeat_mc, final_X_tr, final_y_tr, final_X_te, final_y_te,
                          final_X_val, final_y_val, final_train_X_embedding,
                          final_test_X_embedding, final_val_X_embedding,
                          final_y_te_class_list, final_y_te_class_top_list,
                          final_y_te_class_bot_list, final_y_te_top_ind_list,
                          final_y_te_bot_ind_list, self.mc)
        end = time.time()
        print("pre-processing time: {}".format(end - start))
        print("the split date is {}".format(split_dates[1]))
        net = trainer.train_minibatch(num_epochs, batch_size, interval, self.lag,
                                      self.version, self.horizon, split_dates, method)
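# A minimal, self-contained sketch of the chronological split used in train()
# above (illustrative only; names are hypothetical). The key point is that the
# time-ordered window is cut at a fixed fraction, never shuffled, so the
# validation slice always post-dates the fitting slice and no look-ahead leaks in:
#
#     def chrono_split(X, y, split=0.9):
#         cut = int(len(X) * split)
#         return X[:cut], y[:cut], X[cut:], y[cut:]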
def test(self, split=0.9, num_epochs=50, drop_out=0.0, drop_out_mc=0.0, repeat_mc=10,
         embedding_size=5, batch_size=512, hidden_state=50, lrate=0.001,
         attention_size=2, interval=1, lambd=0, save_loss=0, save_prediction=0,
         method=""):
    sys.path[0] = os.curdir
    print(sys.path)
    print("begin to test")

    # assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)

    # retrieve the column list based on the configuration path
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # fix all random seeds for reproducibility
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)

        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)
        print("the test date is {}".format(split_dates[1]))

        # normalization and technical-indicator parameters
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        norm_params = {'vol_norm': norm_volume, 'ex_spread_norm': norm_ex,
                       'spot_spread_norm': norm_3m_spread, 'len_ma': len_ma,
                       'len_update': len_update, 'both': 3, 'strength': 0.01,
                       'xgboost': False}
        tech_params = {'strength': 0.01, 'both': 3, 'Win_VSD': [10, 20, 30, 40, 50, 60],
                       'Win_EMA': 12, 'Win_Bollinger': 22, 'Fast': 12, 'Slow': 26,
                       'Win_NATR': 10, 'Win_VBM': 22, 'acc_initial': 0.02,
                       'acc_maximum': 0.2, "live": None}

        # holders for versions that tune over all 6 metals
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []
        spot_list = []

        i = 0
        # toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                              "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            print(ground_truth)
            new_time_series = copy(time_series)
            ts = new_time_series.loc[start_time:split_dates[2]]

            # load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(
                ts, LME_dates, self.horizon, [ground_truth], self.lag,
                copy(split_dates), version_params, metal_id_bool=metal_id,
                reshape=False, live=True)

            # chronological split into training and validation
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split), 0]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):, 0]
            X_te = X_va
            y_te = y_va[:, 0]

            # accumulate the spot prices used later to turn returns into price levels
            spot_list = np.concatenate([spot_list, y_va[:, 1]], axis=0) if len(spot_list) > 0 else y_va[:, 1]

            # generate metal id for embedding lookup
            train_X_id_embedding = [i] * len(X_ta)
            val_X_id_embedding = [i] * len(X_val)
            test_X_id_embedding = [i] * len(X_te)

            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)

            # rank test labels to extract the top and bottom thirds
            y_te_rank = np.argsort(y_te)
            y_te_class = list(y_te)
            final_y_te_class_list.append(y_te_class)
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append(y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append(y_te_class[y_te_rank[-split_position:]])

            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)

            final_train_X_embedding += train_X_id_embedding
            final_test_X_embedding += test_X_id_embedding
            final_val_X_embedding += val_X_id_embedding

            # update metal index
            i += 1

        print('Dataset statistic: #examples')
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))

        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)

        # begin to predict
        start = time.time()
        test_loss_list = []
        test_X = torch.from_numpy(final_X_te).float()
        test_Y = torch.from_numpy(final_y_te).float()
        var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))

        if self.mc:
            # Monte Carlo dropout: run `repeat_mc` stochastic forward passes and
            # report the per-sample standard deviation as the uncertainty
            net = torch.load(os.path.join(
                'result', 'model', 'alstm', self.version + "_" + method,
                split_dates[1] + "_" + str(self.horizon) + "_" + str(drop_out) + "_" +
                str(hidden_state) + "_" + str(embedding_size) + "_" + str(self.lag) + "_" +
                str(drop_out_mc) + "_" + str(repeat_mc) + "_" + self.version + "_" + 'alstm.pkl'))
            final_test_output = [[], [], [], [], [], []]
            for i in range(len(test_X) // 6):
                # slice out one date across all 6 metals
                clone_test_X = test_X.clone()[i::(len(test_X) // 6)]
                clone_var_x_test_id = var_x_test_id.clone()[i::(len(test_X) // 6)]
                for rep in range(repeat_mc):
                    if rep == 0:
                        test_output = net(clone_test_X, clone_var_x_test_id).detach().numpy()
                    else:
                        test_output = np.append(
                            test_output,
                            net(clone_test_X, clone_var_x_test_id).detach().numpy(), axis=1)
                for metal in range(6):
                    final_test_output[metal].append(test_output[metal].tolist())
            final_test_output = np.array(final_test_output[0] + final_test_output[1] +
                                         final_test_output[2] + final_test_output[3] +
                                         final_test_output[4] + final_test_output[5])
            standard_dev = final_test_output.std(axis=1)
            test_output = final_test_output.sum(axis=1) / repeat_mc
            print(len(standard_dev), len(test_output))
        else:
            net = torch.load(os.path.join(
                'result', 'model', 'alstm', self.version + "_" + method,
                split_dates[1] + "_" + str(self.horizon) + "_" + str(drop_out) + "_" +
                str(hidden_state) + "_" + str(embedding_size) + "_" + str(self.lag) + "_" +
                self.version + "_" + 'alstm.pkl'))
            net.eval()
            # convert to numpy so it can be multiplied with the spot price array below
            test_output = net(test_X, var_x_test_id).detach().view(-1,).numpy()

        # convert predicted returns back into price levels
        current_test_pred = list((1 + test_output) * spot_list)
        pred_length = int(len(current_test_pred) / 6)
        for num, gt in enumerate(ground_truths_list):
            final_list = pd.DataFrame(
                current_test_pred[num * pred_length:(num + 1) * pred_length],
                index=val_dates, columns=["Prediction"])
            if self.mc:
                # standard_dev only exists on the Monte Carlo path
                sd_list = pd.DataFrame(
                    standard_dev[num * pred_length:(num + 1) * pred_length],
                    index=val_dates, columns=["uncertainty"])
                pred_path = os.path.join(
                    os.getcwd(), "result", "prediction", "alstm", self.version + "_" + method,
                    "_".join([gt, date, str(self.horizon), self.version, str(self.mc)]) + ".csv")
                sd_path = os.path.join(
                    os.getcwd(), "result", "uncertainty", "alstm", self.version + "_" + method,
                    "_".join([gt, date, str(self.horizon), self.version, str(self.mc)]) + ".csv")
                sd_list.to_csv(sd_path)
            else:
                pred_path = os.path.join(
                    os.getcwd(), "result", "prediction", "alstm", self.version + "_" + method,
                    "_".join([gt, date, str(self.horizon), self.version]) + ".csv")
            final_list.to_csv(pred_path)
        end = time.time()
        print("predict time: {}".format(end - start))
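# Monte Carlo dropout in a nutshell (illustrative sketch, hypothetical names):
# dropout is left active at inference, the network is run repeat_mc times on the
# same input, and the spread of the stochastic outputs serves as an uncertainty
# estimate. The branch above reduces to:
#
#     preds = np.stack([net(x, ids).detach().numpy() for _ in range(repeat_mc)], axis=1)
#     mean_prediction = preds.mean(axis=1)   # equals preds.sum(axis=1) / repeat_mc above
#     uncertainty = preds.std(axis=1)        # saved as the "uncertainty" column above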
def test(self):
    print("begin to test")
    pure_LogReg = LogReg(parameters={})

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]

        # load the model trained across all metals (even versions) or per metal
        if gn.even_version(self.version):
            model = pure_LogReg.load(self.version, "LME_All_Spot", self.horizon,
                                     self.lag, evalidate_date)
        else:
            model = pure_LogReg.load(self.version, self.gt, self.horizon,
                                     self.lag, evalidate_date)

        # generate the version
        version_params = generate_version_params(self.version)

        metal_id = False
        if gn.even_version(self.version):
            metal_id = True

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, [self.gt], self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id, live=True)

        # save both the hard predictions and the class probabilities
        prob = model.predict(final_X_va)
        probability = model.predict_proba(final_X_va)
        np.savetxt(
            os.path.join(
                "result", "probability", "logistic",
                "_".join([self.gt + str(self.horizon), date, "lr", self.version,
                          "probability.txt"])),
            probability)
        final_dataframe = pd.DataFrame(prob, columns=['prediction'], index=val_dates)
        final_dataframe.to_csv(
            os.path.join(
                "result", "prediction", "logistic",
                "_".join([self.gt, date, str(self.horizon), self.version]) + ".csv"))
def tune(self, max_iter):
    print("begin to tune")

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    # generate the list of date ranges for the rolling window
    today = self.date
    length = 5
    if gn.even_version(self.version) and self.horizon > 5:
        length = 4
    start_time, end_time = gn.get_relevant_dates(today, length, "tune")
    split_dates = gn.rolling_half_year(start_time, end_time, length)

    # generate the version parameters (parameters that control the preprocessing)
    version_params = generate_version_params(self.version)

    # prepare holder for results
    ans = {"C": []}

    # loop over each half year
    for s, split_date in enumerate(split_dates):
        print("the train date is {}".format(split_date[1]))
        print("the test date is {}".format(split_date[2]))

        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                                 "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]

        # extract a copy of the data to process
        ts = copy(time_series.loc[split_date[0]:split_date[-1]])
        tvt_date = split_date[1:-1]

        # prepare data according to model type and version parameters
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(tvt_date), version_params, metal_id_bool=metal_id)

        # generate hyperparameter instances (the regularization grid depends on horizon)
        if self.horizon == 1:
            C_list = [0.01, 0.1, 1.0, 10.0, 100.0]
        elif self.horizon == 3:
            C_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
        elif self.horizon == 5:
            C_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
        elif self.horizon == 10:
            C_list = [0.001, 0.01, 0.1, 1.0, 10.0]
        elif self.horizon == 20:
            C_list = [0.001, 0.01, 0.1, 1.0, 10.0]
        elif self.horizon == 60:
            C_list = [1.0, 10.0, 100.0, 1000.0]

        # generate model results for each hyperparameter instance for each half year
        for C in C_list:
            if C not in ans['C']:
                ans["C"].append(C)
            if split_date[2] + "_acc" not in ans.keys():
                ans[split_date[2] + "_acc"] = []
                ans[split_date[2] + "_pos_f1_score"] = []
                ans[split_date[2] + "_neg_f1_score"] = []
                ans[split_date[2] + "_f1_score"] = []
                ans[split_date[2] + "_length"] = []
            pure_LogReg = LogReg(parameters={})
            parameters = {"penalty": "l2", "C": C, "solver": "lbfgs", "tol": 1e-7,
                          "max_iter": 6 * 4 * config_length * max_iter,
                          "verbose": 0, "warm_start": False, "n_jobs": -1}
            pure_LogReg.train(final_X_tr, final_y_tr.flatten(), parameters)
            pred = pure_LogReg.predict(final_X_va)
            y_label = pd.DataFrame(final_y_va.flatten(), columns=["prediction"], index=val_dates)
            y_pred = pd.DataFrame(pred, columns=["prediction"], index=val_dates)
            acc = accuracy_score(y_label, y_pred)
            pos_f1 = f1_score(y_label, y_pred)
            # relabel so the negative class is scored as the positive one for its f1
            y_label = 1 * (y_label == 0.0)
            y_pred = 1 * (y_pred == 0.0)
            neg_f1 = f1_score(y_label, y_pred)
            f1 = (pos_f1 + neg_f1) / 2
            ans[split_date[2] + "_acc"].append(acc)
            ans[split_date[2] + "_pos_f1_score"].append(pos_f1)
            ans[split_date[2] + "_neg_f1_score"].append(neg_f1)
            ans[split_date[2] + "_f1_score"].append(f1)
            ans[split_date[2] + "_length"].append(len(final_y_va.flatten()))

    ans = pd.DataFrame(ans)
    ave_acc = None
    ave_f1 = None
    length = None

    # generate the length-weighted total average across all half years
    for col in ans.columns.values.tolist():
        if "_acc" in col:
            if ave_acc is None:
                ave_acc = ans.loc[:, col] * ans.loc[:, col[:-3] + "length"]
                ave_f1 = ans.loc[:, col[:-3] + "f1_score"] * ans.loc[:, col[:-3] + "length"]
                length = ans.loc[:, col[:-3] + "length"]
            else:
                ave_acc = ave_acc + ans.loc[:, col] * ans.loc[:, col[:-3] + "length"]
                ave_f1 = ave_f1 + ans.loc[:, col[:-3] + "f1_score"] * ans.loc[:, col[:-3] + "length"]
                length = length + ans.loc[:, col[:-3] + "length"]
    ave_acc = ave_acc / length
    ave_f1 = ave_f1 / length
    ans = pd.concat([ans, pd.DataFrame({"average accuracy": ave_acc,
                                        "average_f1": ave_f1})], axis=1)

    # store results in csv
    ans.to_csv(os.path.join(os.getcwd(), 'result', 'validation', 'logistic',
                            "_".join(["log_reg", self.gt, self.version,
                                      str(self.lag), str(self.horizon) + ".csv"])))
def test(self):
    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)

        metal_id = False
        if gn.even_version(self.version):
            metal_id = True

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, [self.gt], self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id, live=True)

        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        train_y = pd.DataFrame(final_y_tr, columns=['prediction'])
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]

        n_splits = 10
        folds = KFold(n_splits=n_splits)
        fold_predictions = []

        # load the model saved for each fold and collect its test-set probabilities
        for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X)):
            if not gn.even_version(self.version):
                model_name = (split_dates[1] + "_" + self.gt + "_" + str(self.horizon) + "_" +
                              str(self.lag) + "_" + str(fold_n) + "_" + self.version + "_" + 'xgb.model')
            else:
                model_name = (split_dates[1] + "_LME_All_Spot_" + str(self.horizon) + "_" +
                              str(self.lag) + "_" + str(fold_n) + "_" + self.version + "_" + 'xgb.model')
            model = pickle.load(open(os.path.join("result", "model", "xgboost", model_name), "rb"))
            y_pred = model.predict_proba(test_X, ntree_limit=model.best_ntree_limit)[:, 1]
            fold_predictions.append(y_pred.reshape(-1, 1))

        # stack the per-fold probabilities: one column per fold
        result = np.concatenate(fold_predictions, axis=1)
        np.savetxt(
            os.path.join("result", "probability", "xgboost",
                         self.gt + "_h" + str(self.horizon) + "_" + date +
                         "_xgboost" + self.version + ".txt"),
            result)

        # majority vote across all folds
        final_list = []
        for j in range(len(result)):
            count_1 = sum(1 for item in result[j] if item > 0.5)
            count_0 = len(result[j]) - count_1
            final_list.append(1 if count_1 > count_0 else 0)
        print("the all-fold voting precision is {}".format(
            metrics.accuracy_score(final_y_va, final_list)))

        final_list = pd.DataFrame(final_list, index=val_dates, columns=["prediction"])
        final_list.to_csv(
            os.path.join(os.getcwd(), "result", "prediction", "xgboost",
                         "_".join([self.gt, date, str(self.horizon), self.version]) + ".csv"))
def train(self, C=0.01, tol=1e-7, max_iter=100):
    print("begin to train")

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "train")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)

        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))

        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                                 "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)

        pure_LogReg = LogReg(parameters={})
        parameters = {"penalty": "l2", "C": C, "solver": "lbfgs", "tol": tol,
                      "max_iter": 6 * 4 * config_length * max_iter,
                      "verbose": 0, "warm_start": False, "n_jobs": -1}
        pure_LogReg.train(final_X_tr, final_y_tr.flatten(), parameters)
        pure_LogReg.save(self.version, self.gt, self.horizon, self.lag, evalidate_date)
def train(self, max_depth, learning_rate, gamma, min_child_weight, subsample):
    print("begin to train")

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "train")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)

        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))

        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                                 "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)

        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        train_y = pd.DataFrame(final_y_tr, columns=['result'])
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]

        n_splits = 10
        # weight the positive class by its inverse frequency
        pos = sum(train_y.values)[0]
        model = xgb.XGBClassifier(
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=500,
            silent=True,
            nthread=10,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=0.7,
            colsample_bylevel=1,
            reg_alpha=0.0001,
            reg_lambda=1,
            scale_pos_weight=(len(train_y.values) - pos) / pos,
            seed=1440,
            missing=None)
        folds = KFold(n_splits=n_splits)
        fold_predictions = []

        # train one model per fold, save it, and collect its test-set probabilities
        for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X)):
            X_train, X_valid = (train_X[column_lag_list].iloc[train_index],
                                train_X[column_lag_list].iloc[valid_index])
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
            model.fit(X_train, y_train,
                      eval_metric='error',
                      verbose=True,
                      eval_set=[(X_valid, y_valid)],
                      early_stopping_rounds=5)
            pickle.dump(model, open(os.path.join(
                'result', 'model', 'xgboost',
                split_dates[1] + "_" + self.gt + "_" + str(self.horizon) + "_" +
                str(self.lag) + "_" + str(fold_n) + "_" + self.version + "_" + 'xgb.model'), "wb"))
            y_pred = model.predict_proba(test_X, ntree_limit=model.best_ntree_limit)[:, 1]
            fold_predictions.append(y_pred.reshape(-1, 1))

        # stack the per-fold probabilities for the all-fold vote: one column per fold
        result = np.concatenate(fold_predictions, axis=1)
def tune(self):
    print("begin to choose the parameter")

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    # generate the list of date ranges for the rolling window
    today = self.date
    length = 5
    if gn.even_version(self.version) and self.horizon > 5:
        length = 4
    start_time, end_time = gn.get_relevant_dates(today, length, "tune")
    split_dates = gn.rolling_half_year(start_time, end_time, length)

    # generate the version parameters (parameters that control the preprocessing)
    version_params = generate_version_params(self.version)

    # loop over each half year
    for s, split_date in enumerate(split_dates):
        print("the train date is {}".format(split_date[1]))
        print("the test date is {}".format(split_date[2]))

        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                                 "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]

        # extract a copy of the data to process
        ts = copy(time_series.loc[split_date[0]:split_date[-1]])
        tvt_date = split_date[1:-1]

        # prepare data according to model type and version parameters
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(tvt_date), version_params, metal_id_bool=metal_id)

        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        train_y = pd.DataFrame(final_y_tr, columns=['result'])
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]
        n_splits = 10

        def vote(probs):
            # majority vote over the fold probability columns in `probs`
            votes = []
            for row in probs:
                count_1 = sum(1 for item in row if item > 0.5)
                votes.append(1 if count_1 > len(row) - count_1 else 0)
            return votes

        # tune the xgboost hyperparameters with grid search
        for max_depth in [3, 4, 5]:
            for learning_rate in [0.6, 0.7, 0.8, 0.9]:
                for gamma in [0.6, 0.7, 0.8, 0.9]:
                    for min_child_weight in [3, 4, 5, 6]:
                        for subsample in [0.6, 0.7, 0.85, 0.9]:
                            model = xgb.XGBClassifier(
                                max_depth=max_depth,
                                learning_rate=learning_rate,
                                n_estimators=500,
                                silent=True,
                                nthread=10,
                                gamma=gamma,
                                min_child_weight=min_child_weight,
                                subsample=subsample,
                                colsample_bytree=0.7,
                                colsample_bylevel=1,
                                reg_alpha=0.0001,
                                reg_lambda=1,
                                scale_pos_weight=1,
                                seed=1440,
                                missing=None)
                            folds = KFold(n_splits=n_splits)
                            fold_predictions = []

                            # generate the k folds and train one xgboost model on each
                            for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X)):
                                X_train, X_valid = (train_X[column_lag_list].iloc[train_index],
                                                    train_X[column_lag_list].iloc[valid_index])
                                y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
                                model.fit(X_train, y_train,
                                          eval_metric='error',
                                          verbose=True,
                                          eval_set=[(X_valid, y_valid)],
                                          early_stopping_rounds=5)
                                y_pred = model.predict_proba(
                                    test_X, ntree_limit=model.best_ntree_limit)[:, 1]
                                fold_predictions.append(y_pred.reshape(-1, 1))

                            # voting over all 10 folds
                            result = np.concatenate(fold_predictions, axis=1)
                            print("the all-fold voting precision is {}".format(
                                metrics.accuracy_score(final_y_va, vote(result))))

                            # voting over the 5 most recent (near) folds
                            near = np.concatenate(fold_predictions[5:], axis=1)
                            print("the near precision is {}".format(
                                metrics.accuracy_score(final_y_va, vote(near))))

                            # voting over the 5 oldest (far) folds
                            far = np.concatenate(fold_predictions[:5], axis=1)
                            print("the far precision is {}".format(
                                metrics.accuracy_score(final_y_va, vote(far))))

                            # voting over folds of the same / reverse half-year parity;
                            # which parity counts as "same" depends on whether the
                            # validation window starts in January
                            if split_date[1].split("-")[1] == '01':
                                same = np.concatenate(fold_predictions[0::2], axis=1)
                                reverse = np.concatenate(fold_predictions[1::2], axis=1)
                            else:
                                same = np.concatenate(fold_predictions[1::2], axis=1)
                                reverse = np.concatenate(fold_predictions[0::2], axis=1)
                            print("the same precision is {}".format(
                                metrics.accuracy_score(final_y_va, vote(same))))
                            print("the reverse precision is {}".format(
                                metrics.accuracy_score(final_y_va, vote(reverse))))

                            print("the max_depth is {}".format(max_depth))
                            print("the learning_rate is {}".format(learning_rate))
                            print("the gamma is {}".format(gamma))
                            print("the min_child_weight is {}".format(min_child_weight))
                            print("the subsample is {}".format(subsample))
                            print("the lag is {}".format(self.lag))
                            print("the train date is {}".format(split_date[0]))
                            print("the test date is {}".format(split_date[1]))
                            print("the length is {}".format(len(test_X)))
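# The five nested loops above are a plain exhaustive grid search. An equivalent,
# flatter formulation (illustrative sketch, not part of the original code):
#
#     from itertools import product
#     grid = {"max_depth": [3, 4, 5],
#             "learning_rate": [0.6, 0.7, 0.8, 0.9],
#             "gamma": [0.6, 0.7, 0.8, 0.9],
#             "min_child_weight": [3, 4, 5, 6],
#             "subsample": [0.6, 0.7, 0.85, 0.9]}
#     for values in product(*grid.values()):
#         params = dict(zip(grid, values))
#         # fit the 10-fold ensemble with `params` and log the voting precisions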
def test(self, split=0.9, num_epochs=50, drop_out=0.0, embedding_size=5,
         batch_size=512, hidden_state=50, lrate=0.001, attention_size=2,
         interval=1, lambd=0, save_loss=0, save_prediction=0, method=""):
    sys.path[0] = os.curdir
    print(sys.path)
    print("begin to test")

    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database, with the configuration file
    # determining which columns are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    # classification threshold on the predicted return (assumed to be 0.0, i.e.
    # "will the price move up?"; the original code used `thresh` without defining it)
    thresh = 0.0

    for date in self.date.split(","):
        # fix all random seeds for reproducibility
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)

        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version parameters
        version_params = generate_version_params(self.version)
        print("the test date is {}".format(split_dates[1]))

        # normalization and technical-indicator parameters
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        norm_params = {'vol_norm': norm_volume, 'ex_spread_norm': norm_ex,
                       'spot_spread_norm': norm_3m_spread, 'len_ma': len_ma,
                       'len_update': len_update, 'both': 3, 'strength': 0.01,
                       'xgboost': False}
        tech_params = {'strength': 0.01, 'both': 3, 'Win_VSD': [10, 20, 30, 40, 50, 60],
                       'Win_EMA': 12, 'Win_Bollinger': 22, 'Fast': 12, 'Slow': 26,
                       'Win_NATR': 10, 'Win_VBM': 22, 'acc_initial': 0.02,
                       'acc_maximum': 0.2, "live": None}

        # holders for versions that tune over all 6 metals
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []

        i = 0
        # toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                              "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            new_time_series = copy(time_series)
            spot_list = np.array(new_time_series[ground_truth])
            new_time_series['spot_price'] = spot_list
            ts = new_time_series.loc[start_time:split_dates[2]]

            # load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(
                ts, LME_dates, self.horizon, [ground_truth], self.lag,
                copy(split_dates), version_params, metal_id_bool=metal_id,
                reshape=False, live=True)

            # chronological split into training and validation
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split)]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):]
            X_te = X_va
            y_te = y_va

            # generate metal id for embedding lookup
            train_X_id_embedding = [i] * len(X_ta)
            val_X_id_embedding = [i] * len(X_val)
            test_X_id_embedding = [i] * len(X_te)

            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)

            # rank test labels and threshold them into up/down classes
            y_te_rank = np.argsort(y_te[:, 0])
            y_te_class = [1 if item[0] >= thresh else 0 for item in y_te]
            final_y_te_class_list.append(y_te_class)
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append(y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append(y_te_class[y_te_rank[-split_position:]])

            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)

            final_train_X_embedding += train_X_id_embedding
            final_test_X_embedding += test_X_id_embedding
            final_val_X_embedding += val_X_id_embedding

            # update metal index
            i += 1

        print('Dataset statistic: #examples')
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))

        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)

        # begin to predict
        start = time.time()
        test_loss_list = []
        test_X = torch.from_numpy(final_X_te).float()
        test_Y = torch.from_numpy(final_y_te).float()
        var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))
        net = torch.load(os.path.join(
            'result', 'model', 'alstm', self.version + "_" + method,
            split_dates[1] + "_" + str(self.horizon) + "_" + str(drop_out) + "_" +
            str(hidden_state) + "_" + str(embedding_size) + "_" + str(self.lag) + "_" +
            self.version + "_" + 'alstm.pkl'))
        net.eval()
        test_output = net(test_X, var_x_test_id)

        # threshold the predicted returns into up/down classes
        current_test_pred = list(test_output.detach().view(-1,))
        current_test_class = [1 if ele > thresh else 0 for ele in current_test_pred]
        np.savetxt(os.path.join(
            'result', 'probability', 'alstm', self.version + "_" + method,
            split_dates[1] + "_" + str(self.horizon) + "_" + self.version + ".txt"),
            current_test_class)
        pred_length = int(len(current_test_class) / 6)
        for num, gt in enumerate(ground_truths_list):
            final_list = pd.DataFrame(
                current_test_class[num * pred_length:(num + 1) * pred_length],
                index=val_dates, columns=["Prediction"])
            final_list.to_csv(os.path.join(
                os.getcwd(), "result", "prediction", "alstm", self.version + "_" + method,
                "_".join([gt, date, str(self.horizon), self.version]) + ".csv"))
        end = time.time()
        print("predict time: {}".format(end - start))
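# Note: the file written under result/probability above actually holds the hard
# 0/1 classes produced by thresholding at `thresh`, not raw probabilities; the
# regression output is interpreted as a predicted return and mapped to an
# up/down class via `1 if ele > thresh else 0`.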
def train(self):
    print("begin to train")

    # assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "train")
        split_dates = [train_time, evalidate_date, str(today)]

        # generate the version
        version_params = generate_version_params(self.version)

        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))

        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                                 "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)

        # fit an ordinary least squares model on the return target
        LR = LinearRegression(n_jobs=-1)
        LR.fit(final_X_tr, final_y_tr[:, 0])

        if gn.even_version(self.version):
            joblib.dump(LR, os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_ALL_" + str(self.horizon) + "_" + str(self.lag) + "_" +
                evalidate_date + '.pkl'))
        else:
            joblib.dump(LR, os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_" + self.gt + "_" + str(self.horizon) + "_" + str(self.lag) + "_" +
                evalidate_date + '.pkl'))
def test(self):
    print("begin to test")

    # assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)

    # read the data from the 4E or NExT database
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")

    for date in self.date.split(","):
        # generate the list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]

        # load the model trained across all metals (even versions) or per metal
        if gn.even_version(self.version):
            model = joblib.load(os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_ALL_" + str(self.horizon) + "_" + str(self.lag) + "_" +
                evalidate_date + '.pkl'))
        else:
            model = joblib.load(os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_" + self.gt + "_" + str(self.horizon) + "_" + str(self.lag) + "_" +
                evalidate_date + '.pkl'))

        # generate the version
        version_params = generate_version_params(self.version)

        metal_id = False
        if gn.even_version(self.version):
            metal_id = True

        # extract a copy of the data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])

        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, [self.gt], self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id, live=True)

        # predicted return -> price level, using the spot price in final_y_va[:, 1]
        prob = (1 + model.predict(final_X_va)) * final_y_va[:, 1]
        final_dataframe = pd.DataFrame(prob, columns=['prediction'], index=val_dates)
        final_dataframe.to_csv(os.path.join(
            "result", "prediction", "linear", self.version,
            "_".join([self.gt, date, str(self.horizon), self.version]) + ".csv"))
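# The reconstruction above mirrors the ALSTM test path: the model predicts an
# h-day return r, and final_y_va[:, 1] is taken to hold the current spot price
# (an assumption consistent with the spot_list handling earlier in this section):
#
#     predicted_price = (1 + predicted_return) * spot_price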