def test(self, inputs):
    """Generate post-processed predictions for each date in ``self.dates``.

    inputs : dict — expects 'source' (data source id); for the Substitution
    model also 'substitution'; for the Filter model 'class_threshold',
    'reg_threshold' and 'reg_window'.
    Writes prediction/uncertainty CSVs under result/ ; returns nothing.
    NOTE(review): original formatting was lost (file was collapsed to one
    line); indentation below is reconstructed — confirm branch placement.
    """
    # Spot-price series for the target metal (used by the Filter model).
    spot_price = read_data_with_specified_columns(inputs['source'],'exp/3d/Co/logistic_regression/v3/LMCADY_v3.conf','2003-11-12')[0].loc[:,self.ground_truth].to_frame()
    for date in self.dates.split(','):
        #generate model specific arguments
        if self.model is None:
            model = Post_process()
        #case if we are running a substitution post process
        elif self.model == "Substitution":
            X = { 'Prediction' : read_classification(self.ground_truth,self.horizon,date,self.version[0],"ensemble") }
            #case if we are substituting with analyst report
            if inputs['substitution'] == "analyst":
                # First- vs second-half-year boundary for the validation period.
                validation_date = date.split("-")[0]+"-01-01" if date[5:7] <= "06" else date.split("-")[0]+"-07-01"
                #read substitution configuration which details the metal and horizon combinations that are eligible for substitution
                with open("exp/substitution.conf","r") as f:
                    config = json.load(f)
                #case if they are not to be substituted
                if self.ground_truth not in config.keys() or self.horizon not in config[self.ground_truth]:
                    model = Post_process()
                    X["Uncertainty"] = read_uncertainty(self.ground_truth,self.horizon,date,"ensemble","classification")
                    prediction = model.predict(X)
                    # Ineligible combination: pass the ensemble uncertainty through unchanged.
                    X["Uncertainty"].to_csv(os.path.join("result","uncertainty","classification",'_'.join([self.ground_truth,validation_date,str(self.horizon),"substitution.csv"])))
                #case if they are to be substituted
                else:
                    model = Post_process_substitution()
                    validation_date = date.split("-")[0]+"-01-01" if date[5:7] <= "06" else date.split("-")[0]+"-07-01"
                    X["Substitute"] = read_substitution_analyst(self.ground_truth, self.horizon, date)
                    X["Uncertainty"] = read_uncertainty(self.ground_truth,self.horizon,date,"ensemble","classification")
                    prediction, uncertainty = model.predict(X)
                    uncertainty.to_csv(os.path.join("result","uncertainty","classification",'_'.join([self.ground_truth,validation_date,str(self.horizon),"substitution.csv"])))
        #case if we are running a filter post process
        elif self.model == "Filter":
            X = { 'Prediction' : read_classification(self.ground_truth,self.horizon,date,self.version[0],"Substitution") }
            model = Post_process_filter()
            # NOTE(review): the classification prediction above is immediately
            # overwritten with the regression prediction — confirm intent.
            X["Prediction"] = read_regression(spot_price, self.ground_truth, self.horizon, date, self.version[1])
            X["Filter"] = generate_final_signal(spot_price,self.ground_truth, self.horizon, date, self.version[0], self.version[1], inputs["class_threshold"], inputs["reg_threshold"], inputs["reg_window"])
            print(X["Filter"])
            prediction = model.predict(X)
            # NOTE(review): this save may have originally sat at loop level
            # (covering the Substitution branch too) — confirm against VCS history.
            prediction.to_csv(os.path.join('result','prediction','post_process',self.model,'_'.join([self.ground_truth,date,str(self.horizon),self.model+".csv"])))
# NOTE(review): fragment of a larger CLI script — `args` and `gn` are defined
# elsewhere in the original file; reconstructed formatting, confirm indentation.
# Unpack hyperparameters from the parsed command line.
hidden_state = args.hidden_state
dropout = args.drop_out
attention_size = args.attention_size
embedding_size = args.embedding_size
lambd = args.lambd
save_loss = args.save_loss
save_prediction = args.save_prediction
# prepare for the data
time_horizon = args.horizon
if args.action == 'train':
    comparison = None
    n = 0
    #read the data from the 4E or NExT database
    time_series,LME_dates,config_length = gn.read_data_with_specified_columns(args.source,args.data_configure_file,"2003-11-12")
    #generate list of list of dates to be used to roll over 5 half years
    today = args.date
    length = 5
    # Even versions with long horizons use a shorter rolling window.
    if gn.even_version(args.version) and time_horizon > 5:
        length = 4
    start_time,end_time = gn.get_relevant_dates(today,length,"tune")
    split_dates = gn.rolling_half_year(start_time,end_time,length)
    split_dates = split_dates[:]
    importance_list = []
    #generate the version
    version_params=generate_version_params(args.version)
def tune(self, inputs):
    """Grid-search classification/regression filter thresholds per date.

    inputs : dict — expects 'source' (data source id).
    Returns (class_df, reg_df): per-threshold accuracy/coverage ranking for
    the classification filter and MAE/coverage ranking for the regression
    filter. NOTE(review): formatting reconstructed from a collapsed line.
    """
    #initialize parameters
    class_dict = {'threshold':[],'acc':[],'coverage':[], 'total_len' :[]}
    reg_dict = {'threshold':[],'mae':[], 'coverage':[], 'total_len' :[]}
    spot_price = read_data_with_specified_columns(inputs['source'],'exp/3d/Co/logistic_regression/v3/LMCADY_v3.conf','2003-11-12')[0].loc[:,self.ground_truth].to_frame()
    # Pre-create per-date accumulator columns so all dates share one table.
    for date in self.dates.split(","):
        class_dict[date+"_acc"] = []
        class_dict[date+"_coverage"] = []
        class_dict[date+"_total_len"] = []
        # reg_dict[date+"_acc"] = []
        reg_dict[date+"_mae"] = []
        reg_dict[date+"_coverage"] = []
        reg_dict[date+"_total_len"] = []
    #begin tuning with looping of date
    for date in self.dates.split(','):
        validation_date = date.split("-")[0]+"-01-01" if date[5:7] <= "06" else date.split("-")[0]+"-07-01"
        if self.model == "Filter":
            #classification tune
            class_thresh = [i + 0.01 for i in np.arange(0.51,step = float(0.05))]
            class_combination = product([self.ground_truth], [self.horizon], [date], [self.version[0]], class_thresh)
            #generate classification signals
            p = pl(multiprocessing.cpu_count())
            class_signal = p.starmap(generate_class_signal, class_combination)
            p.close()
            class_signal = pd.concat(class_signal, axis = 1)
            class_signal.columns = [class_thresh]
            #begin analysis of classification results
            for c, col in enumerate(class_signal.columns):
                # Map {0,1} predictions to {-1,+1} and zero out filtered rows.
                class_pred = (read_classification(self.ground_truth, self.horizon, date, self.version[0],"ensemble")*2 - 1).multiply(class_signal[col]*1, axis = 0)
                class_pred = class_pred.loc[class_pred['result'] != 0]
                class_label = pd.read_csv(os.path.join("data","Label",'_'.join([self.ground_truth,"h"+str(self.horizon),validation_date,"label.csv"])),index_col = 0)*2 - 1
                class_label = class_label.loc[class_pred.index,:]
                if col not in class_dict['threshold']:
                    class_dict['threshold'].append(col)
                    class_dict['acc'].append(0)
                    class_dict['coverage'].append(0)
                    class_dict['total_len'].append(0)
                if len(class_pred.index) > 0:
                    class_dict[date + "_acc"].append(metrics.accuracy_score(class_pred,class_label))
                    # Weighted by covered length for the cross-date average.
                    class_dict['acc'][c] += metrics.accuracy_score(class_pred, class_label)*len(class_label.index)
                else:
                    class_dict[date + "_acc"].append(0)
                    class_dict['acc'][c] += 0
                class_dict[date +"_coverage"].append(len(class_label.index)/len(class_signal.index))
                class_dict[date +"_total_len"].append(len(class_signal.index))
                class_dict['coverage'][c] += len(class_label.index)
                class_dict['total_len'][c] += len(class_signal.index)
            #regression tuning
            if self.horizon <= 5:
                reg_thresh = np.arange(1.01, step = 0.1)
            else:
                reg_thresh = np.arange(0.51, step = 0.05)
            # NOTE(review): the line below unconditionally overwrites both
            # horizon-dependent grids above (dead code) — confirm intent.
            reg_thresh = np.arange(0.05,0.31, step = 0.025)
            reg_window = [60]
            reg_combination = product([spot_price], [self.ground_truth], [self.horizon], [date], [self.version[1]], reg_thresh, reg_window)
            p = pl(multiprocessing.cpu_count())
            #generate regression signals
            reg_signal = p.starmap(generate_reg_signal, reg_combination)
            p.close()
            reg_signal = pd.concat(reg_signal, axis = 1)
            reg_signal.columns = [reg_thresh]
            #begin analysis of regression results
            for c,col in enumerate(reg_signal.columns):
                reg_pred = (read_regression(spot_price, self.ground_truth, self.horizon, date, self.version[1])).multiply(reg_signal[col]*1, axis = 0)
                reg_pred = reg_pred.loc[reg_pred['Prediction'] != 0]
                class_pred = np.sign(reg_pred)
                # Realized spot shifted back by the horizon = regression target.
                reg_label = spot_price.shift(-self.horizon).loc[reg_pred.index,:]
                class_label = pd.read_csv(os.path.join("data","Label",'_'.join([self.ground_truth,"h"+str(self.horizon),validation_date,"label.csv"])),index_col = 0)*2 - 1
                class_label = class_label.loc[reg_pred.index,:]
                spot = spot_price.loc[reg_pred.index,:]
                if col not in reg_dict['threshold']:
                    reg_dict['threshold'].append(col)
                    # reg_dict['acc'].append(0)
                    reg_dict['mae'].append(0)
                    reg_dict['coverage'].append(0)
                    reg_dict['total_len'].append(0)
                if len(reg_pred.index) > 0:
                    # reg_dict[date+"_acc"].append(metrics.accuracy_score(class_pred,class_label))
                    # MAE computed on returns (normalized by the spot price).
                    reg_dict[date + "_mae"].append(metrics.mean_absolute_error(reg_pred/np.array(spot),reg_label/np.array(spot)))
                    reg_dict['mae'][c] += metrics.mean_absolute_error(reg_pred/np.array(spot),reg_label/np.array(spot))*len(reg_label.index)
                    # reg_dict["acc"][c] += metrics.accuracy_score(class_pred,class_label)*len(reg_label.index)
                else:
                    # reg_dict[date + "_acc"].append(0)
                    reg_dict[date + "_mae"].append(0)
                    reg_dict['mae'][c] += 0
                    # reg_dict['acc'][c] += 0
                reg_dict[date +"_coverage"].append(len(reg_label.index)/len(reg_signal.index))
                reg_dict[date+"_total_len"].append(len(reg_signal.index))
                reg_dict['coverage'][c] += len(reg_label.index)
                reg_dict['total_len'][c] += len(reg_signal.index)
    print(reg_dict)
    #compute average
    for i in range(len(class_dict['threshold'])):
        class_dict['acc'][i] = class_dict['acc'][i]/class_dict['coverage'][i] if class_dict['coverage'][i] > 0 else 0
        class_dict['coverage'][i] = class_dict['coverage'][i]/class_dict['total_len'][i]
    for i in range(len(reg_dict['threshold'])):
        # reg_dict['acc'][i] = reg_dict['acc'][i]/reg_dict['coverage'][i] if reg_dict['coverage'][i] > 0 else 0
        reg_dict['mae'][i] = reg_dict['mae'][i]/reg_dict['coverage'][i] if reg_dict['coverage'][i] > 0 else 0
        reg_dict['coverage'][i] = reg_dict['coverage'][i]/reg_dict['total_len'][i]
    class_df = pd.DataFrame(class_dict)
    reg_df = pd.DataFrame(reg_dict)
    reg_df = reg_df.loc[reg_df["coverage"] != 0].reset_index(drop = True)
    #generate ranking
    class_df['acc_rank'] = class_df['acc'].rank(method = 'min', ascending = False)
    class_df['coverage_rank'] = class_df['coverage'].rank(method = 'min', ascending = False)
    class_df['rank'] = (class_df['acc_rank'] + class_df['coverage_rank'])/2
    # reg_df['acc_rank'] = reg_df['acc'].rank(method = 'min', ascending = False)
    # Lower MAE is better (ascending), higher coverage is better (descending).
    reg_df['mae_rank'] = reg_df['mae'].rank(method = 'min', ascending = True)
    reg_df['coverage_rank'] = reg_df['coverage'].rank(method = 'min', ascending = False)
    reg_df['rank'] = (reg_df['mae_rank'] + reg_df['coverage_rank'])/2
    return class_df,reg_df
parser.add_argument('-sou', '--source', help='source of data', type=str, default="NExT") args = parser.parse_args() if args.ground_truth == 'None': args.ground_truth = None os.chdir(os.path.abspath(sys.path[0])) args.ground_truth = args.ground_truth.split(",") args.horizon = [int(i) for i in args.horizon.split(",")] #read data from specified source if args.source == "NExT": ts, dates, length = read_data_with_specified_columns( "NExT", "exp/LMCADY_v3.conf", "2003-11-12") else: start_date = "2003-11-12" import rpy2.robjects as robjects robjects.r('.sourceQlib()') ts = robjects.r( '''merge(getSecurity(c("LMCADY Comdty","LMAHDY Comdty","LMPBDY Comdty","LMZSDY Comdty","LMNIDY Comdty","LMSNDY Comdty"), start = "''' + start_date + '''"), getSecurityOHLCV(c("LMCADS03 Comdty","LMPBDS03 Comdty","LMNIDS03 Comdty","LMSNDS03 Comdty","LMZSDS03 Comdty","LMAHDS03 Comdty"), start = "''' + start_date + '''") ) ''') ts.colnames = robjects.vectors.StrVector([ "LME_Cu_Spot", "LME_Al_Spot", "LME_Pb_Spot", "LME_Zn_Spot", "LME_Ni_Spot", "LME_Xi_Spot", "LME_Cu_Open", "LME_Cu_High", "LME_Cu_Low", "LME_Cu_Close", "LME_Cu_Volume", "LME_Cu_OI",
# NOTE(review): fragment of a larger per-file metrics loop — `filepath`, `f`,
# `i`, `validation_dates`, `label`, `date`, `gt` and `args` are bound earlier
# in the original file; formatting reconstructed from a collapsed line.
if not os.path.exists(os.path.join(filepath, f)):
    # Missing prediction file: record zero metrics for this validation period.
    ans[validation_dates[i] + "_mae"].append(0)
    ans[validation_dates[i] + "_mse"].append(0)
    ans[validation_dates[i] + "_acc"].append(0)
    ans[validation_dates[i] + "_coverage"].append(0)
    ans[validation_dates[i] + "_length"].append( len(label.index))
    continue
#generate labels
temp = pd.read_csv(os.path.join(filepath, f), index_col=0)
# Drop the trailing label row when it falls beyond the evaluation date.
if label.index[-1] > date:
    label = label.iloc[:-1, :]
data, LME_dates, length = read_data_with_specified_columns( args.source, 'exp/LMCADY_v3.conf', '2003-11-12')
spot = data.loc[label.index[0]:label.index[-1], gt].to_frame()
if args.regression == "ret":
    # Convert absolute prices to returns relative to the spot price.
    temp = (temp - np.array(spot.loc[temp.index, :]) ) / np.array(spot.loc[temp.index, :])
    label = (label - np.array(spot)) / np.array(spot)
#generate metrics
if len(temp.index) == 0:
    mae = 0
    mse = 0
    acc = 0
else:
    mae = mean_absolute_error(label.loc[temp.index, :], temp)
def train( self,split = 0.9, num_epochs=50, drop_out=0.0, drop_out_mc = 0.0, repeat_mc = 10, embedding_size=5, batch_size=512, hidden_state=50, lrate=0.001, attention_size=2, interval=1, lambd=0, save_loss=0, save_prediction=0, method =""):
    """Train the ALSTM model jointly across the six LME metals.

    split: fraction of the training window kept for training (rest is validation)
    drop_out: the dropout rate of LSTM network
    drop_out_mc / repeat_mc: Monte-Carlo dropout rate and repetition count
    embedding_size: the size of embedding layer
    batch_size: the mini-batch size
    hidden_state: number of hidden_state of encoder/decoder
    lrate: learning rate
    attention_size: the head number in MultiheadAttention Mechanism
    interval: save models every interval epoch
    lambd: the weight of classification loss
    save_loss: whether to save loss results
    save_prediction: whether to save prediction results
    NOTE(review): formatting reconstructed from a collapsed line.
    """
    sys.path[0] = os.curdir
    print("begin to train")
    #assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)
    #retrieve column list based on configuration path
    time_series,LME_dates,config_length = gn.read_data_with_specified_columns(self.source,self.path,"2003-11-12")
    #begin to split the train data
    for date in self.date.split(","):
        # Fix all RNG seeds so each date's run is reproducible.
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time,train_time,evalidate_date = gn.get_relevant_dates(today,length,"train")
        split_dates = [train_time,evalidate_date,str(today)]
        #generate the version
        version_params = generate_version_params(self.version)
        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))
        # Normalization / technical-indicator parameters (fixed defaults).
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        norm_params = {'vol_norm':norm_volume,'ex_spread_norm':norm_ex,'spot_spread_norm':norm_3m_spread, 'len_ma':len_ma,'len_update':len_update,'both':3,'strength':0.01,'xgboost':False}
        tech_params = {'strength':0.01,'both':3,'Win_VSD':[10,20,30,40,50,60],'Win_EMA':12,'Win_Bollinger':22, 'Fast':12,'Slow':26,'Win_NATR':10,'Win_VBM':22,'acc_initial':0.02,'acc_maximum':0.2,"live":None}
        #for versions that tune over 6 metals
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []
        i = 0
        #toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            print(ground_truth)
            new_time_series = copy(time_series)
            ts = new_time_series.loc[start_time:split_dates[2]]
            #load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(ts,LME_dates,self.horizon,[ground_truth],self.lag,copy(split_dates),version_params,metal_id_bool = metal_id,reshape = False)
            # split validation
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split),0]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):,0]
            X_te = X_va
            y_te = y_va[:,0]
            # generate metal id for embedding lookup
            train_X_id_embedding = [i]*len(X_ta)
            val_X_id_embedding = [i]*len(X_val)
            test_X_id_embedding = [i]*len(X_te)
            # Accumulate per-metal arrays into the joint training tensors.
            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)
            # Rank test targets to extract the top/bottom thirds per metal.
            y_te_rank = np.argsort(y_te)
            y_te_class = []
            for item in y_te:
                y_te_class.append(item)
            final_y_te_class_list.append(y_te_class)
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append( y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append( y_te_class[y_te_rank[-split_position:]])
            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)
            final_train_X_embedding+=train_X_id_embedding
            final_test_X_embedding+=test_X_id_embedding
            final_val_X_embedding+=val_X_id_embedding
            # update metal index
            i+=1
        print('Dataset statistic: #examples')
        print('Train:', len(final_X_tr), len(final_y_tr), len(final_train_X_embedding))
        print(np.max(final_X_tr), np.min(final_X_tr), np.max(final_y_tr), np.min(final_y_tr))
        print('Validation:', len(final_X_val), len(final_y_val), len(final_val_X_embedding))
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
        # begin to train the model
        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)
        start = time.time()
        trainer = Trainer(input_dim, hidden_state, window_size, lrate, drop_out, case_number, attention_size, embedding_size, drop_out_mc,repeat_mc, final_X_tr, final_y_tr, final_X_te, final_y_te, final_X_val, final_y_val, final_train_X_embedding, final_test_X_embedding, final_val_X_embedding, final_y_te_class_list, final_y_te_class_top_list, final_y_te_class_bot_list, final_y_te_top_ind_list, final_y_te_bot_ind_list, self.mc )
        end = time.time()
        print("pre-processing time: {}".format(end-start))
        print("the split date is {}".format(split_dates[1]))
        save = 1
        net=trainer.train_minibatch(num_epochs, batch_size, interval, self.lag, self.version, self.horizon, split_dates, method)
def test(self,split = 0.9, num_epochs=50, drop_out=0.0, drop_out_mc = 0.0, repeat_mc = 10, embedding_size=5, batch_size=512, hidden_state=50, lrate=0.001, attention_size=2, interval=1, lambd=0, save_loss=0, save_prediction=0, method = ""):
    """Run ALSTM inference for each test date across the six LME metals.

    Loads the saved model matching the hyperparameters, predicts returns,
    converts them to price predictions via the spot series, and writes
    per-metal prediction (and, with MC dropout, uncertainty) CSVs.
    NOTE(review): formatting reconstructed from a collapsed line.
    """
    sys.path[0] = os.curdir
    print(sys.path)
    print("begin to test")
    #assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)
    #retrieve column list based on configuration path
    time_series,LME_dates,config_length = gn.read_data_with_specified_columns(self.source,self.path,"2003-11-12")
    #begin to split the train data
    for date in self.date.split(","):
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time,train_time,evalidate_date = gn.get_relevant_dates(today,length,"test")
        split_dates = [train_time,evalidate_date,str(today)]
        #generate the version
        version_params=generate_version_params(self.version)
        print("the test date is {}".format(split_dates[1]))
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        norm_params = {'vol_norm':norm_volume,'ex_spread_norm':norm_ex,'spot_spread_norm':norm_3m_spread, 'len_ma':len_ma,'len_update':len_update,'both':3,'strength':0.01,'xgboost':False}
        tech_params = {'strength':0.01,'both':3,'Win_VSD':[10,20,30,40,50,60],'Win_EMA':12,'Win_Bollinger':22, 'Fast':12,'Slow':26,'Win_NATR':10,'Win_VBM':22,'acc_initial':0.02,'acc_maximum':0.2,"live":None}
        #for versions that tune over 6 metals
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []
        spot_list = []
        i = 0
        #toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            print(ground_truth)
            new_time_series = copy(time_series)
            ts = new_time_series.loc[start_time:split_dates[2]]
            #load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(ts,LME_dates,self.horizon,[ground_truth],self.lag,copy(split_dates),version_params,metal_id_bool = metal_id,reshape = False,live = True)
            # split validation
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split),0]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):,0]
            X_te = X_va
            y_te = y_va[:,0]
            # Second target column carries the spot price used to de-normalize
            # predicted returns back to prices.
            spot_list = np.concatenate([spot_list,y_va[:,1]],axis = 0) if len(spot_list) > 0 else y_va[:,1]
            # generate metal id for embedding lookup
            train_X_id_embedding = [i]*len(X_ta)
            val_X_id_embedding = [i]*len(X_val)
            test_X_id_embedding = [i]*len(X_te)
            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)
            y_te_rank = np.argsort(y_te)
            y_te_class = []
            for item in y_te:
                y_te_class.append(item)
            final_y_te_class_list.append(y_te_class)
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append( y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append( y_te_class[y_te_rank[-split_position:]])
            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)
            final_train_X_embedding+=train_X_id_embedding
            final_test_X_embedding+=test_X_id_embedding
            final_val_X_embedding+=val_X_id_embedding
            i+=1
        print('Dataset statistic: #examples')
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
        # begin to train the model
        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)
        # begin to predict
        start = time.time()
        test_loss_list = []
        test_X = torch.from_numpy(final_X_te).float()
        test_Y = torch.from_numpy(final_y_te).float()
        var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))
        if self.mc:
            # Monte-Carlo dropout: repeat each forward pass repeat_mc times and
            # derive the mean prediction and per-sample standard deviation.
            net = torch.load(os.path.join('result','model','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+str(drop_out)+"_"+str(hidden_state)+"_"+str(embedding_size)+"_"+str(self.lag)+"_"+str(drop_out_mc)+"_"+str(repeat_mc)+"_"+self.version+"_"+'alstm.pkl'))
            final_test_output = [[],[],[],[],[],[]]
            for i in range(len(test_X)//6):
                # Stride slicing interleaves the six metals' samples.
                clone_test_X = test_X.clone()[i::(len(test_X)//6)]
                clone_var_x_test_id = var_x_test_id.clone()[i::(len(test_X)//6)]
                for rep in range(repeat_mc):
                    if rep == 0:
                        test_output = net(clone_test_X, clone_var_x_test_id).detach().numpy()
                    else:
                        test_output = np.append(test_output,net(clone_test_X, clone_var_x_test_id).detach().numpy(),axis = 1)
                final_test_output[0].append(test_output[0].tolist())
                final_test_output[1].append(test_output[1].tolist())
                final_test_output[2].append(test_output[2].tolist())
                final_test_output[3].append(test_output[3].tolist())
                final_test_output[4].append(test_output[4].tolist())
                final_test_output[5].append(test_output[5].tolist())
            final_test_output = np.array(final_test_output[0] + final_test_output[1] + final_test_output[2] + final_test_output[3] + final_test_output[4] + final_test_output[5])
            standard_dev = final_test_output.std(axis = 1)
            test_output = final_test_output.sum(axis = 1)/repeat_mc
            print(len(standard_dev),len(test_output))
        else:
            net = torch.load(os.path.join('result','model','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+str(drop_out)+"_"+str(hidden_state)+"_"+str(embedding_size)+"_"+str(self.lag)+"_"+self.version+"_"+'alstm.pkl'))
            net.eval()
            test_output = net(test_X, var_x_test_id).detach().view(-1,)
        # Predicted return -> predicted price via the aligned spot list.
        current_test_pred = list((1+test_output) * spot_list)
        pred_length = int(len(current_test_pred)/6)
        for num,gt in enumerate(ground_truths_list):
            final_list = pd.DataFrame(current_test_pred[num*pred_length:(num+1)*pred_length],index = val_dates, columns = ["Prediction"])
            # NOTE(review): standard_dev is only defined in the self.mc branch;
            # this line raises NameError when self.mc is falsy — confirm/fix.
            sd_list = pd.DataFrame(standard_dev[num*pred_length:(num+1)*pred_length],index = val_dates, columns = ["uncertainty"])
            if self.mc:
                pred_path = os.path.join(os.getcwd(),"result","prediction","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version,str(self.mc)])+".csv")
                sd_path = os.path.join(os.getcwd(),"result","uncertainty","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version,str(self.mc)])+".csv")
                sd_list.to_csv(sd_path)
            else:
                pred_path = os.path.join(os.getcwd(),"result","prediction","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version])+".csv")
            final_list.to_csv(pred_path)
        end = time.time()
        print("predict time: {}".format(end-start))
def test(self):
    """Run logistic-regression inference for each date in ``self.date``.

    Loads the saved model for the period, predicts on live-prepared
    validation data, and writes a probability text file plus a prediction
    CSV under result/. NOTE(review): formatting reconstructed.
    """
    print("begin to test")
    pure_LogReg = LogReg(parameters={})
    #identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    #read the data from the 4E or NExT database with configuration file to determine columns to that are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns( self.source, self.path, "2003-11-12")
    for date in self.date.split(","):
        #generate list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates( today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]
        # Even versions share one model across all metals ("LME_All_Spot").
        if gn.even_version(self.version):
            model = pure_LogReg.load(self.version, "LME_All_Spot", self.horizon, self.lag, evalidate_date)
        else:
            model = pure_LogReg.load(self.version, self.gt, self.horizon, self.lag, evalidate_date)
        #generate the version
        version_params = generate_version_params(self.version)
        metal_id = False
        if gn.even_version(self.version):
            metal_id = True
        #extract copy of data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])
        #load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data( ts, LME_dates, self.horizon, [self.gt], self.lag, copy(split_dates), version_params, metal_id_bool=metal_id, live=True)
        prob = model.predict(final_X_va)
        probability = model.predict_proba(final_X_va)
        np.savetxt( os.path.join( "result", "probability", "logistic", "_".join([ self.gt + str(self.horizon), date, "lr", self.version, "probability.txt" ])), probability)
        # NOTE(review): final_list is built but never used in the visible code.
        final_list = []
        piece_list = []
        for i, val_date in enumerate(val_dates):
            piece_list.append(val_date)
            piece_list.append(prob[i])
            final_list.append(piece_list)
            piece_list = []
        final_dataframe = pd.DataFrame(prob, columns=['prediction'], index=val_dates)
        final_dataframe.to_csv( os.path.join( "result", "prediction", "logistic", "_".join([self.gt, date, str(self.horizon), self.version]) + ".csv"))
def tune(self, max_iter):
    """Grid-search the regularization strength C over rolling half-years.

    max_iter: multiplier for the solver iteration budget
              (actual budget is 6 * 4 * config_length * max_iter).
    Trains one logistic regression per (half-year, C), records accuracy and
    positive/negative/averaged F1 per period, appends length-weighted
    cross-period averages, and writes the table to
    result/validation/logistic/. Returns nothing.
    Raises KeyError if self.horizon has no configured C grid.

    Bug fixed vs. the original: the averaging loop indexed
    ``ans.loc[:, col[:-3 + "f1_score"]]`` — ``-3 + "f1_score"`` is an
    int+str TypeError at runtime; the intended key is
    ``col[:-3] + "f1_score"`` (strip the "acc" suffix, keep the trailing
    underscore). Also removed the no-op ``max_iter = max_iter`` and the
    commented-out alternative C grids.
    """
    print("begin to tune")
    #identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    #read the data from the 4E or NExT database with configuration file to determine columns that are required
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    #generate list of list of dates for rolling window
    today = self.date
    length = 5
    if gn.even_version(self.version) and self.horizon > 5:
        length = 4
    start_time, end_time = gn.get_relevant_dates(today, length, "tune")
    split_dates = gn.rolling_half_year(start_time, end_time, length)
    #generate the version parameters (parameters that control the preprocess)
    version_params = generate_version_params(self.version)
    # C grid per horizon; raises KeyError for unconfigured horizons
    # (the original raised NameError in that case).
    c_grid_by_horizon = {
        1: [0.01, 0.1, 1.0, 10.0, 100.0],
        3: [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        5: [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        10: [0.001, 0.01, 0.1, 1.0, 10.0],
        20: [0.001, 0.01, 0.1, 1.0, 10.0],
        60: [1.0, 10.0, 100.0, 1000.0],
    }
    #prepare holder for results
    ans = {"C": []}
    #loop over each half year
    for s, split_date in enumerate(split_dates):
        print("the train date is {}".format(split_date[1]))
        print("the test date is {}".format(split_date[2]))
        #toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = [
                "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"
            ]
        #extract copy of data to process
        ts = copy(time_series.loc[split_date[0]:split_date[-1]])
        tvt_date = split_date[1:-1]
        #prepare data according to model type and version parameters
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(tvt_date), version_params, metal_id_bool=metal_id)
        C_list = c_grid_by_horizon[self.horizon]
        #generate model results for each hyperparameter instance for each half year
        for C in C_list:
            if C not in ans['C']:
                ans["C"].append(C)
            if split_date[2] + "_acc" not in ans.keys():
                ans[split_date[2] + "_acc"] = []
                ans[split_date[2] + "_pos_f1_score"] = []
                ans[split_date[2] + "_neg_f1_score"] = []
                ans[split_date[2] + "_f1_score"] = []
                ans[split_date[2] + "_length"] = []
            pure_LogReg = LogReg(parameters={})
            parameters = {
                "penalty": "l2",
                "C": C,
                "solver": "lbfgs",
                "tol": 1e-7,
                # iteration budget scales with the feature-config length
                "max_iter": 6 * 4 * config_length * max_iter,
                "verbose": 0,
                "warm_start": False,
                "n_jobs": -1
            }
            pure_LogReg.train(final_X_tr, final_y_tr.flatten(), parameters)
            pred = pure_LogReg.predict(final_X_va)
            y_label = pd.DataFrame(final_y_va.flatten(),
                                   columns=["prediction"], index=val_dates)
            y_pred = pd.DataFrame(pred, columns=["prediction"], index=val_dates)
            acc = accuracy_score(y_label, y_pred)
            pos_f1 = f1_score(y_label, y_pred)
            # Invert labels so f1_score scores the negative class.
            y_label = 1 * (y_label == 0.0)
            y_pred = 1 * (y_pred == 0.0)
            neg_f1 = f1_score(y_label, y_pred)
            f1 = (pos_f1 + neg_f1) / 2
            ans[split_date[2] + "_acc"].append(acc)
            ans[split_date[2] + "_pos_f1_score"].append(pos_f1)
            ans[split_date[2] + "_neg_f1_score"].append(neg_f1)
            ans[split_date[2] + "_f1_score"].append(f1)
            ans[split_date[2] + "_length"].append(len(final_y_va.flatten()))
    ans = pd.DataFrame(ans)
    ave_acc = None
    ave_f1 = None
    length = None
    #generate total average across all half years, weighted by period length
    for col in ans.columns.values.tolist():
        if "_acc" in col:
            # col looks like "<date>_acc"; col[:-3] keeps "<date>_" so we can
            # address the sibling "<date>_f1_score" / "<date>_length" columns.
            period = col[:-3]
            if ave_acc is None:
                ave_acc = ans.loc[:, col] * ans.loc[:, period + "length"]
                ave_f1 = ans.loc[:, period + "f1_score"] * ans.loc[:, period + "length"]
                length = ans.loc[:, period + "length"]
            else:
                ave_acc = ave_acc + ans.loc[:, col] * ans.loc[:, period + "length"]
                ave_f1 = ave_f1 + ans.loc[:, period + "f1_score"] * ans.loc[:, period + "length"]
                length = length + ans.loc[:, period + "length"]
    ave_acc = ave_acc / length
    ave_f1 = ave_f1 / length
    ans = pd.concat([
        ans,
        pd.DataFrame({
            "average accuracy": ave_acc,
            "average_f1": ave_f1
        })
    ], axis=1)
    #store results in csv
    pd.DataFrame(ans).to_csv(os.path.join(os.getcwd(),'result','validation','logistic',\
        "_".join(["log_reg",self.gt,self.version,str(self.lag),str(self.horizon)+".csv"])))
def test(self):
    """Generate test-set predictions from the previously trained XGBoost fold models.

    For every date in ``self.date`` (comma-separated), the ten pickled fold
    models are loaded from result/model/xgboost, each predicts the positive
    class probability over the validation window, and a majority vote across
    the ten folds yields the final 0/1 prediction, written to
    result/prediction/xgboost.  Raw per-fold probabilities are saved to
    result/probability/xgboost.
    """
    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    # read the data from the 4E or NExT database with the configuration file
    # determining the required columns
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    for date in self.date.split(","):
        # generate list of dates for today's model testing period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]
        # generate the version parameters (control preprocessing)
        version_params = generate_version_params(self.version)
        # toggle metal id (even versions use one joint model across metals)
        metal_id = False
        if gn.even_version(self.version):
            metal_id = True
        # extract copy of data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])
        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, [self.gt], self.lag,
            copy(split_dates), version_params,
            metal_id_bool=metal_id, live=True)
        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]
        n_splits = 10
        # KFold only reproduces the fold enumeration used at training time;
        # the actual models are loaded from disk below.  (The original also
        # constructed an unused XGBClassifier here — removed as dead code.)
        folds = KFold(n_splits=n_splits)
        fold_preds = []
        for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X)):
            # NOTE(review): even versions load the jointly trained
            # "LME_All_Spot" model — assumes those files exist on disk.
            if not gn.even_version(self.version):
                model_path = os.path.join(
                    "result", "model", "xgboost",
                    split_dates[1] + "_" + self.gt + "_" + str(self.horizon) +
                    "_" + str(self.lag) + "_" + str(fold_n) + "_" +
                    self.version + "_" + 'xgb.model')
            else:
                model_path = os.path.join(
                    "result", "model", "xgboost",
                    split_dates[1] + "_LME_All_Spot_" + str(self.horizon) +
                    "_" + str(self.lag) + "_" + str(fold_n) + "_" +
                    self.version + "_" + 'xgb.model')
            # fix: close the model file after unpickling (original leaked the handle)
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            y_pred = model.predict_proba(
                test_X, ntree_limit=model.best_ntree_limit)[:, 1]
            fold_preds.append(y_pred.reshape(-1, 1))
        # calculate the all-fold voting
        result = np.concatenate(fold_preds, axis=1)
        np.savetxt(
            os.path.join(
                "result", "probability", "xgboost",
                self.gt + "_h" + str(self.horizon) + "_" + date + "_xgboost" +
                self.version + ".txt"), result)
        # majority vote per sample: probability > 0.5 counts as an "up" vote;
        # ties resolve to 0 (same as the original count_1 > count_0 logic)
        final_list = [
            1 if sum(1 for p in row if p > 0.5) > len(row) // 2 else 0
            for row in result
        ]
        print("the all folder voting precision is {}".format(
            metrics.accuracy_score(final_y_va, final_list)))
        final_list = pd.DataFrame(final_list, index=val_dates,
                                  columns=["prediction"])
        final_list.to_csv(
            os.path.join(
                os.getcwd(), "result", "prediction", "xgboost",
                "_".join([self.gt, date, str(self.horizon), self.version]) +
                ".csv"))
def train(self, C=0.01, tol=1e-7, max_iter=100):
    """Fit and persist a logistic-regression model for every requested date.

    For each comma-separated date in ``self.date`` a training window is
    derived, features are prepared via ``gn.prepare_data`` and a ``LogReg``
    model is trained with l2-regularised lbfgs and saved.

    Args:
        C: inverse regularisation strength.
        tol: optimiser convergence tolerance.
        max_iter: base iteration budget, scaled by the configuration length.
    """
    print("begin to train")
    # locate the data configuration file that matches this version
    self.path = gn.generate_config_path(self.version)
    # pull the time series from the 4E/NExT database, restricted to the
    # columns named in the configuration
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    for today in self.date.split(","):
        # training window length: even versions with long horizons use 4
        window = 4 if gn.even_version(self.version) and self.horizon > 5 else 5
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, window, "train")
        split_dates = [train_time, evalidate_date, str(today)]
        # version parameters steer the preprocessing pipeline
        version_params = generate_version_params(self.version)
        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))
        # even versions train one joint model over all six metals
        if gn.even_version(self.version):
            metal_id = True
            targets = [
                "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"
            ]
        else:
            metal_id = False
            targets = [self.gt]
        # slice out the relevant window and build model-ready arrays
        window_ts = copy(time_series.loc[start_time:split_dates[2]])
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            window_ts, LME_dates, self.horizon, targets, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)
        # train and persist the logistic regression model
        classifier = LogReg(parameters={})
        hyperparams = {
            "penalty": "l2",
            "C": C,
            "solver": "lbfgs",
            "tol": tol,
            "max_iter": 6 * 4 * config_length * max_iter,
            "verbose": 0,
            "warm_start": False,
            "n_jobs": -1
        }
        classifier.train(final_X_tr, final_y_tr.flatten(), hyperparams)
        classifier.save(self.version, self.gt, self.horizon, self.lag,
                        evalidate_date)
def train(self, max_depth, learning_rate, gamma, min_child_weight, subsample):
    """Train ten K-fold XGBoost classifiers per date and pickle each fold model.

    For every date in ``self.date`` (comma-separated), prepares the training
    window, fits one classifier per K-fold split (early stopping on the held
    out fold), saves each fitted model to result/model/xgboost, and collects
    the per-fold positive-class probabilities over the validation window.

    Args:
        max_depth, learning_rate, gamma, min_child_weight, subsample:
            XGBoost hyper-parameters forwarded to the classifier.
    """
    print("begin to train")
    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    # read the data from the 4E or NExT database with the configuration file
    # determining the required columns
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    for date in self.date.split(","):
        # generate list of dates for today's model training period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "train")
        split_dates = [train_time, evalidate_date, str(today)]
        # generate the version parameters
        version_params = generate_version_params(self.version)
        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))
        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = [
                "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"
            ]
        # extract copy of data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])
        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)
        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        train_y = pd.DataFrame(final_y_tr, columns=['result'])
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]
        n_splits = 10
        # class balance weight: (#negatives / #positives)
        pos = sum(train_y.values)[0]
        model = xgb.XGBClassifier(
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=500,
            silent=True,
            nthread=10,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=0.7,
            colsample_bylevel=1,
            reg_alpha=0.0001,
            reg_lambda=1,
            scale_pos_weight=(len(train_y.values) - pos) / pos,
            seed=1440,
            missing=None)
        folds = KFold(n_splits=n_splits)
        fold_preds = []
        # fit one model per fold and save it
        for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X)):
            X_train, X_valid = train_X[column_lag_list].iloc[
                train_index], train_X[column_lag_list].iloc[valid_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[
                valid_index]
            model.fit(X_train, y_train,
                      eval_metric='error',
                      verbose=True,
                      eval_set=[(X_valid, y_valid)],
                      early_stopping_rounds=5)
            # NOTE(review): the model is always saved under self.gt, while the
            # test method loads "LME_All_Spot" files for even versions —
            # confirm self.gt equals "LME_All_Spot" in that case.
            model_path = os.path.join(
                'result', 'model', 'xgboost',
                split_dates[1] + "_" + self.gt + "_" + str(self.horizon) +
                "_" + str(self.lag) + "_" + str(fold_n) + "_" + self.version +
                "_" + 'xgb.model')
            # fix: close the file after dumping (original leaked the handle)
            with open(model_path, "wb") as f:
                pickle.dump(model, f)
            y_pred = model.predict_proba(
                test_X, ntree_limit=model.best_ntree_limit)[:, 1]
            fold_preds.append(y_pred.reshape(-1, 1))
        # stack per-fold probabilities column-wise (kept for parity with the
        # original, which built this matrix for fold-voting diagnostics)
        result = np.concatenate(fold_preds, axis=1)
def tune(self):
    """Grid-search XGBoost hyper-parameters over rolling half-year windows.

    For each half-year split and each combination of (max_depth,
    learning_rate, gamma, min_child_weight, subsample), trains ten K-fold
    models and prints the validation precision of several fold-voting
    schemes (all folds, near folds 6-10, far folds 1-5, and season-aligned
    "same"/"reverse" odd/even folds), followed by the hyper-parameters used.
    Results are printed only — nothing is persisted.
    """
    print("begin to choose the parameter")
    # identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    # read the data from the 4E or NExT database with the configuration file
    # determining the required columns
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    # generate list of list of dates for rolling window
    today = self.date
    length = 5
    if gn.even_version(self.version) and self.horizon > 5:
        length = 4
    start_time, end_time = gn.get_relevant_dates(today, length, "tune")
    split_dates = gn.rolling_half_year(start_time, end_time, length)
    # generate the version parameters (parameters that control the preprocess)
    version_params = generate_version_params(self.version)

    def majority_vote(prob_matrix):
        # turn per-fold probabilities into one 0/1 majority vote per sample;
        # a probability > 0.5 counts as an "up" vote, ties resolve to 0
        votes = []
        for row in prob_matrix:
            ups = sum(1 for p in row if p > 0.5)
            votes.append(1 if ups > len(row) - ups else 0)
        return votes

    # loop over each half year
    for s, split_date in enumerate(split_dates):
        print("the train date is {}".format(split_date[1]))
        print("the test date is {}".format(split_date[2]))
        # toggle metal id
        metal_id = False
        ground_truth_list = [self.gt]
        if gn.even_version(self.version):
            metal_id = True
            ground_truth_list = [
                "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"
            ]
        # extract copy of data to process
        ts = copy(time_series.loc[split_date[0]:split_date[-1]])
        tvt_date = split_date[1:-1]
        # prepare data according to model type and version parameters
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, ground_truth_list, self.lag,
            copy(tvt_date), version_params, metal_id_bool=metal_id)
        train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
        train_X = train_dataframe.loc[:, column_lag_list]
        train_y = pd.DataFrame(final_y_tr, columns=['result'])
        test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
        test_X = test_dataframe.loc[:, column_lag_list]
        n_splits = 10
        # tune xgboost hyper parameters with grid search
        for max_depth in [3, 4, 5]:
            for learning_rate in [0.6, 0.7, 0.8, 0.9]:
                for gamma in [0.6, 0.7, 0.8, 0.9]:
                    for min_child_weight in [3, 4, 5, 6]:
                        for subsample in [0.6, 0.7, 0.85, 0.9]:
                            model = xgb.XGBClassifier(
                                max_depth=max_depth,
                                learning_rate=learning_rate,
                                n_estimators=500,
                                silent=True,
                                nthread=10,
                                gamma=gamma,
                                min_child_weight=min_child_weight,
                                subsample=subsample,
                                colsample_bytree=0.7,
                                colsample_bylevel=1,
                                reg_alpha=0.0001,
                                reg_lambda=1,
                                scale_pos_weight=1,
                                seed=1440,
                                missing=None)
                            folds = KFold(n_splits=n_splits)
                            fold_preds = []
                            # generate k folds and train one model per fold
                            for fold_n, (train_index, valid_index) in enumerate(
                                    folds.split(train_X)):
                                X_train, X_valid = train_X[
                                    column_lag_list].iloc[train_index], train_X[
                                        column_lag_list].iloc[valid_index]
                                y_train, y_valid = train_y.iloc[
                                    train_index], train_y.iloc[valid_index]
                                model.fit(X_train, y_train,
                                          eval_metric='error',
                                          verbose=True,
                                          eval_set=[(X_valid, y_valid)],
                                          early_stopping_rounds=5)
                                y_pred = model.predict_proba(
                                    test_X,
                                    ntree_limit=model.best_ntree_limit)[:, 1]
                                fold_preds.append(y_pred.reshape(-1, 1))
                            # calculate the all-fold voting precision
                            result = np.concatenate(fold_preds, axis=1)
                            print("the all folder voting precision is {}".format(
                                metrics.accuracy_score(
                                    final_y_va, majority_vote(result))))
                            # near folds (6-10)
                            near = np.concatenate(fold_preds[5:], axis=1)
                            print("the near precision is {}".format(
                                metrics.accuracy_score(
                                    final_y_va, majority_vote(near))))
                            # far folds (1-5)
                            far = np.concatenate(fold_preds[:5], axis=1)
                            print("the far precision is {}".format(
                                metrics.accuracy_score(
                                    final_y_va, majority_vote(far))))
                            # "same"/"reverse" voting: which parity of folds is
                            # season-aligned depends on the window's start month
                            if split_date[1].split("-")[1] == '01':
                                same_preds = fold_preds[0::2]
                                reverse_preds = fold_preds[1::2]
                            else:
                                same_preds = fold_preds[1::2]
                                reverse_preds = fold_preds[0::2]
                            same = np.concatenate(same_preds, axis=1)
                            print("the same precision is {}".format(
                                metrics.accuracy_score(
                                    final_y_va, majority_vote(same))))
                            reverse = np.concatenate(reverse_preds, axis=1)
                            print("the reverse precision is {}".format(
                                metrics.accuracy_score(
                                    final_y_va, majority_vote(reverse))))
                            print("the max_depth is {}".format(max_depth))
                            print("the learning_rate is {}".format(
                                learning_rate))
                            print("the gamma is {}".format(gamma))
                            print("the min_child_weight is {}".format(
                                min_child_weight))
                            print("the subsample is {}".format(subsample))
                            print("the lag is {}".format(self.lag))
                            print("the train date is {}".format(split_date[0]))
                            print("the test date is {}".format(split_date[1]))
                            print("the length is {}".format(len(test_X)))
def test(self,split = 0.9, num_epochs=50, drop_out=0.0, embedding_size=5, batch_size=512, hidden_state=50, lrate=0.001, attention_size=2, interval=1, lambd=0, save_loss=0, save_prediction=0, method = ""):
    """Run the saved ALSTM network over the test window for every date in self.date.

    Builds the combined test tensors for all six LME metals, loads the pickled
    network from result/model/alstm, thresholds its output into 0/1 classes
    and writes one prediction csv per metal plus a probability txt file.
    Of the keyword arguments, only split, drop_out, hidden_state,
    embedding_size and method affect this method (the latter four select the
    model file to load); the rest are accepted but not used here.
    """
    sys.path[0] = os.curdir
    print(sys.path)
    print("begin to test")
    #identify the configuration file for data based on version
    self.path = gn.generate_config_path(self.version)
    #read the data from the 4E or NExT database with configuration file to determine columns to that are required
    time_series,LME_dates,config_length = gn.read_data_with_specified_columns(self.source,self.path,"2003-11-12")
    for date in self.date.split(","):
        # fix the RNG seeds so each date's run is reproducible
        torch.manual_seed(1)
        np.random.seed(1)
        random.seed(1)
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time,train_time,evalidate_date = gn.get_relevant_dates(today,length,"test")
        split_dates = [train_time,evalidate_date,str(today)]
        #generate the version parameters
        version_params=generate_version_params(self.version)
        print("the test date is {}".format(split_dates[1]))
        # normalization / technical-indicator settings
        norm_volume = "v1"
        norm_3m_spread = "v1"
        norm_ex = "v1"
        len_ma = 5
        len_update = 30
        tol = 1e-7
        # NOTE(review): norm_params and tech_params are built here but not
        # visibly consumed in this method — possibly legacy configuration.
        norm_params = {'vol_norm':norm_volume,'ex_spread_norm':norm_ex,'spot_spread_norm':norm_3m_spread, 'len_ma':len_ma,'len_update':len_update,'both':3,'strength':0.01,'xgboost':False}
        tech_params = {'strength':0.01,'both':3,'Win_VSD':[10,20,30,40,50,60],'Win_EMA':12,'Win_Bollinger':22, 'Fast':12,'Slow':26,'Win_NATR':10,'Win_VBM':22,'acc_initial':0.02,'acc_maximum':0.2,"live":None}
        #for versions that tune over 6 metals: accumulators that gather the
        #per-metal arrays into one combined dataset
        final_X_tr = []
        final_y_tr = []
        final_X_val = []
        final_y_val = []
        final_X_te = []
        final_y_te = []
        final_y_te_class_list = []
        final_y_te_class_top_list = []
        final_y_te_top_ind_list = []
        final_y_te_class_bot_list = []
        final_y_te_bot_ind_list = []
        final_train_X_embedding = []
        final_test_X_embedding = []
        final_val_X_embedding = []
        # running metal index used as the embedding id
        i = 0
        #toggle metal id
        metal_id = False
        ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
        for ground_truth in ground_truths_list:
            # work on a fresh copy with this metal's price as 'spot_price'
            new_time_series = copy(time_series)
            spot_list = np.array(new_time_series[ground_truth])
            new_time_series['spot_price'] = spot_list
            ts = new_time_series.loc[start_time:split_dates[2]]
            #load data for use
            X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(ts,LME_dates,self.horizon,[ground_truth],self.lag,copy(split_dates),version_params,metal_id_bool = metal_id,reshape = False,live = True)
            # split validation: first `split` fraction trains, the rest validates
            X_ta = X_tr[:int(len(X_tr) * split), :, :]
            y_ta = y_tr[:int(len(y_tr) * split)]
            X_val = X_tr[int(len(X_tr) * split):, :, :]
            y_val = y_tr[int(len(y_tr) * split):]
            X_te = X_va
            y_te = y_va
            # generate metal id for embedding lookup
            train_X_id_embedding = [i]*len(X_ta)
            val_X_id_embedding = [i]*len(X_val)
            test_X_id_embedding = [i]*len(X_te)
            # append this metal's arrays to the combined datasets
            if len(final_X_tr) == 0:
                final_X_tr = copy(X_ta)
            else:
                final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
            if len(final_y_tr) == 0:
                final_y_tr = copy(y_ta)
            else:
                final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)
            if len(final_X_te) == 0:
                final_X_te = copy(X_te)
            else:
                final_X_te = np.concatenate((final_X_te, X_te), axis=0)
            if len(final_y_te) == 0:
                final_y_te = copy(y_te)
            else:
                final_y_te = np.concatenate((final_y_te, y_te), axis=0)
            # rank test targets and classify each against the threshold
            y_te_rank = np.argsort(y_te[:,0])
            y_te_class = []
            # NOTE(review): `thresh` is not defined anywhere in this method —
            # it must come from module/enclosing scope; confirm before relying
            # on this path.
            for item in y_te:
                if item >= thresh:
                    y_te_class.append(1)
                else:
                    y_te_class.append(0)
            final_y_te_class_list.append(y_te_class)
            # top/bottom thirds of the ranked targets
            split_position = len(y_te) // 3
            final_y_te_bot_ind_list.append(y_te_rank[:split_position])
            final_y_te_top_ind_list.append(y_te_rank[-split_position:])
            y_te_class = np.array(y_te_class)
            final_y_te_class_bot_list.append(
                y_te_class[y_te_rank[:split_position]])
            final_y_te_class_top_list.append(
                y_te_class[y_te_rank[-split_position:]])
            if len(final_X_val) == 0:
                final_X_val = copy(X_val)
            else:
                final_X_val = np.concatenate((final_X_val, X_val), axis=0)
            if len(final_y_val) == 0:
                final_y_val = copy(y_val)
            else:
                final_y_val = np.concatenate((final_y_val, y_val), axis=0)
            final_train_X_embedding+=train_X_id_embedding
            final_test_X_embedding+=test_X_id_embedding
            final_val_X_embedding+=val_X_id_embedding
            # update metal index
            i+=1
        print('Dataset statistic: #examples')
        print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
        # begin to train the model
        input_dim = final_X_tr.shape[-1]
        window_size = self.lag
        case_number = len(ground_truths_list)
        # begin to predict
        start = time.time()
        test_loss_list = []
        test_X = torch.from_numpy(final_X_te).float()
        test_Y = torch.from_numpy(final_y_te).float()
        var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))
        # load the trained ALSTM network (file name encodes hyper-parameters)
        net = torch.load(os.path.join('result','model','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+str(drop_out)+"_"+str(hidden_state)+"_"+str(embedding_size)+"_"+str(self.lag)+"_"+self.version+"_"+'alstm.pkl'))
        net.eval()
        test_output = net(test_X, var_x_test_id)
        current_test_pred = list(test_output.detach().view(-1,))
        # threshold the network output into binary classes (see `thresh` note above)
        current_test_class = [1 if ele>thresh else 0 for ele in current_test_pred]
        np.savetxt(os.path.join('result','probability','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+self.version+".txt"),current_test_class)
        # the combined prediction vector holds all six metals back to back
        pred_length = int(len(current_test_class)/6)
        for num,gt in enumerate(ground_truths_list):
            final_list = pd.DataFrame(current_test_class[num*pred_length:(num+1)*pred_length],index = val_dates, columns = ["Prediction"])
            final_list.to_csv(os.path.join(os.getcwd(),"result","prediction","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version])+".csv"))
        end = time.time()
        print("predict time: {}".format(end-start))
def train(self):
    """Fit a linear-regression model for every requested date and persist it.

    For each comma-separated date in ``self.date`` a training window is
    derived, features are prepared, a LinearRegression is fitted on the first
    target column, and the model is dumped to result/model/linear.
    """
    print("begin to train")
    # locate the data configuration for this version
    self.path = gn.generate_config_path(self.version)
    # fetch the required columns from the 4E or NExT database
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    for today in self.date.split(","):
        # training window length: even versions with long horizons use 4
        window = 4 if gn.even_version(self.version) and self.horizon > 5 else 5
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, window, "train")
        split_dates = [train_time, evalidate_date, str(today)]
        # version parameters steer the preprocessing pipeline
        version_params = generate_version_params(self.version)
        print("the train date is {}".format(split_dates[0]))
        print("the test date is {}".format(split_dates[1]))
        # even versions train one joint model over all six metals
        if gn.even_version(self.version):
            metal_id = True
            targets = ["LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot",
                       "LME_Xi_Spot", "LME_Zn_Spot", "LME_Pb_Spot"]
        else:
            metal_id = False
            targets = [self.gt]
        # slice the relevant window and build model-ready arrays
        window_ts = copy(time_series.loc[start_time:split_dates[2]])
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            window_ts, LME_dates, self.horizon, targets, self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id)
        # fit on the first target column only
        regressor = LinearRegression(n_jobs=-1)
        regressor.fit(final_X_tr, final_y_tr[:, 0])
        # even versions store a joint "ALL" model, odd versions a per-metal one;
        # the produced file names are identical to the original's two branches
        tag = "ALL" if gn.even_version(self.version) else self.gt
        joblib.dump(
            regressor,
            os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_" + tag + "_" + str(self.horizon) + "_" +
                str(self.lag) + "_" + evalidate_date + '.pkl'))
def test(self):
    """Produce linear-model predictions for each requested date.

    Loads the persisted LinearRegression model matching this version/date,
    prepares the live validation window, converts the predicted return into a
    level prediction and writes it to result/prediction/linear.
    """
    # split the date
    print("begin to test")
    # assert that the configuration path is correct
    self.path = gn.generate_config_path(self.version)
    # read the data from the 4E or NExT database
    time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
        self.source, self.path, "2003-11-12")
    for date in self.date.split(","):
        # generate list of dates for today's model testing period
        today = date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, train_time, evalidate_date = gn.get_relevant_dates(
            today, length, "test")
        split_dates = [train_time, evalidate_date, str(today)]
        # even versions load the joint "ALL" model, odd versions per-metal
        tag = "ALL" if gn.even_version(self.version) else self.gt
        model = joblib.load(
            os.path.join(
                os.getcwd(), 'result', 'model', 'linear',
                self.version + "_" + tag + "_" + str(self.horizon) + "_" +
                str(self.lag) + "_" + evalidate_date + '.pkl'))
        # generate the version parameters
        version_params = generate_version_params(self.version)
        metal_id = False
        if gn.even_version(self.version):
            metal_id = True
        # extract copy of data to process
        ts = copy(time_series.loc[start_time:split_dates[2]])
        # load data for use
        final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
            ts, LME_dates, self.horizon, [self.gt], self.lag,
            copy(split_dates), version_params, metal_id_bool=metal_id,
            live=True)
        # convert predicted relative change into a level prediction
        # NOTE(review): assumes final_y_va[:, 1] carries the base value the
        # prediction is relative to — confirm against prepare_data
        prob = (1 + model.predict(final_X_va)) * final_y_va[:, 1]
        # (fix: removed the original's dead loop that built an unused list of
        # [date, value] pairs alongside the DataFrame below)
        final_dataframe = pd.DataFrame(prob,
                                       columns=['prediction'],
                                       index=val_dates)
        final_dataframe.to_csv(
            os.path.join(
                "result", "prediction", "linear", self.version,
                "_".join([self.gt, date, str(self.horizon), self.version]) +
                ".csv"))