def train(model, trainDataloader, valDataloader, testDataloader, optimizer,
          scaler_y, opt, logger):
    """Train a model predicting throughput and latency jointly.

    Runs up to ``opt.n_epochs`` epochs of train / validation / test, saves a
    checkpoint whenever the summed test loss improves, and stops early after
    10 epochs without improvement.

    Args:
        model: network to train.
        trainDataloader / valDataloader / testDataloader: batch iterables.
        optimizer: optimizer updating ``model``.
        scaler_y: target scaler forwarded to the test routine.
        opt: run options (uses ``model_mode``, ``n_epochs``, ``lr``).
        logger: logger for progress messages.

    Returns:
        (best_epoch, best_th_loss, best_la_loss, best_th_mae_loss,
         best_la_mae_loss, model_save_path) of the best checkpoint.

    Raises:
        ValueError: if ``opt.model_mode`` is neither 'single' nor 'twice'.
    """
    val_losses = []
    test_losses = []
    model_save_path = utils.make_date_dir("./model_save")
    logger.info(f"Model save path : {model_save_path}")
    logger.info(f"Learning Rate : {opt.lr}")

    # Pick the per-epoch routines up front; fail fast on an unknown mode
    # instead of raising a NameError when train_epoch is first called.
    if opt.model_mode == 'single':
        train_epoch = train_single_epoch
        eval_epoch = eval_single_epoch
        test = test_single
    elif opt.model_mode == 'twice':
        train_epoch = train_twice_epoch
        eval_epoch = eval_twice_epoch
        test = test_twice
    else:
        raise ValueError(f"Unknown model_mode: {opt.model_mode!r}")

    best_loss = float('inf')
    patience = 0
    for epoch in range(int(opt.n_epochs)):
        patience += 1
        logger.info(
            "====================================Train===================================="
        )
        train_loss, _ = train_epoch(model, trainDataloader, optimizer)
        logger.info(f"[Train Epoch {epoch+1}] train Loss : {train_loss}")
        logger.info(
            "====================================Val===================================="
        )
        val_loss, _ = eval_epoch(model, valDataloader)
        logger.info(f"[Eval Epoch {epoch+1}] val Loss : {val_loss}")
        logger.info(
            "====================================Test===================================="
        )
        test_loss, test_mae = test(model, testDataloader, scaler_y)
        logger.info(
            f"[Epoch {epoch+1}] Test_throughput_Loss: {test_loss[0]}, Test_latency_Loss: {test_loss[1]} , Test_throughput_MAE_Loss: {test_mae[0]}, Test_latency_MAE_Loss: {test_mae[1]}"
        )
        # Checkpoint on the combined (throughput + latency) test loss.
        if sum(test_loss) < best_loss:
            torch.save(
                model.state_dict(),
                os.path.join(model_save_path,
                             "model_" + str(epoch + 1) + ".pt"))
            best_th_loss, best_la_loss, best_th_mae_loss, best_la_mae_loss = test_loss[
                0], test_loss[1], test_mae[0], test_mae[1]
            best_loss = sum(test_loss)
            patience = 0
            best_epoch = epoch + 1
        if patience == 10:
            break
        val_losses.append(val_loss)
        test_losses.append(test_loss)
    return best_epoch, best_th_loss, best_la_loss, best_th_mae_loss, best_la_mae_loss, model_save_path
def main():
    """Restore a trained model, run inference on the test split, save results."""
    config = Config()
    logger, log_dir = get_logger(os.path.join(config.model, "logs/"))
    logger.info("=======Model Configuration=======")
    logger.info(config.desc)
    logger.info("=================================")
    try:
        _, _, test_x, _, _, test_y, _, _, test_m, test_dt = load_agg_selected_data_mem(
            data_path=config.data_path,
            x_len=config.x_len,
            y_len=config.y_len,
            foresight=config.foresight,
            cell_ids=config.test_cell_ids,
            dev_ratio=config.dev_ratio,
            test_len=config.test_len,
            seed=config.seed)
        model = Model(config)
        if config.latest_model:
            model_dir = find_latest_dir(
                os.path.join(config.model, 'model_save/'))
        else:
            # BUG FIX: the original tested ``model_dir`` (unbound at this
            # point) instead of ``config.model_dir``, so a missing setting
            # raised NameError rather than the intended configuration error.
            if not config.model_dir:
                raise Exception(
                    "model_dir or latest_model=True should be defined in config"
                )
            model_dir = config.model_dir
        model.restore_session(model_dir)

        if len(test_y) > 100000:
            # Batch mode: evaluate in chunks to bound memory use.
            test_data = list(zip(test_x, test_m, test_y))
            test_batches = batch_loader(test_data, config.batch_size)
            total_pred = np.empty(shape=(0, test_y.shape[1]))
            for batch in test_batches:
                batch_x, batch_m, batch_y = zip(*batch)
                pred, _, _, _, _ = model.eval(batch_x, batch_m, batch_y)
                total_pred = np.r_[total_pred, pred]
        else:
            # Not batch mode: one pass over the whole test set.
            total_pred, test_loss, test_rse, test_smape, test_mae = model.eval(
                test_x, test_m, test_y)

        result_dir = make_date_dir(os.path.join(config.model, 'results/'))
        np.save(os.path.join(result_dir, 'pred.npy'), total_pred)
        np.save(os.path.join(result_dir, 'test_y.npy'), test_y)
        np.save(os.path.join(result_dir, 'test_dt.npy'), test_dt)
        logger.info("Saving results at {}".format(result_dir))
        logger.info("Testing finished, exit program")
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt propagate.
        logger.exception("ERROR")
def _create_big_data_table(self):
    '''Compute every quantifier score, assemble one dataframe, write it to csv.'''
    score_frames = [
        self.get_exp2score(prop) for prop in (
            quantifier_properties.Monotonicity,
            quantifier_properties.Quantity,
            quantifier_properties.Conservativity,
            quantifier_properties.LempelZiv,
            quantifier_properties.Uniformity,
        )
    ]
    data = pd.concat(score_frames, axis=1).reset_index()
    data.columns = [
        "expression", "monotonicity", "quantity", "conservativity",
        "lempel_ziv", "uniformity"
    ]
    # apply(..., axis=1) hands each row to the lambda as a transposed Series.
    expr_lengths = data.apply(
        lambda row: utils.get_exp_len(row.expression), axis=1)
    data.insert(loc=0, column="expr_length", value=expr_lengths)
    # "Admin" columns identifying which language generator produced the data.
    data["max_model_size"] = self.max_model_size
    data["lot"] = self.language_name
    data["subsets"] = self.subset_description
    data["date"] = datetime.datetime.now().strftime("%d-%m-%Y")
    data.sort_values(by=["expr_length"], inplace=True)
    data.reset_index(drop=True, inplace=True)
    longest_expr = data["expr_length"].max()
    filename = utils.make_csv_filename(self.max_model_size, longest_expr,
                                       self.language_name)
    file_loc = utils.make_date_dir(self.csv_dir)
    data.to_csv(file_loc / filename, index=False)
    self.big_data_table = data
    return self.big_data_table
def main():
    """Benchmark batched inference on a 10x-duplicated copy of the test set."""
    config = Config()
    logger, log_dir = get_logger(os.path.join(config.model, "logs/"))
    logger.setLevel(30)  # set loglevel WARNING(30)
    logger.info("=======Model Configuration=======")
    logger.info(config.desc)
    logger.info("=================================")
    try:
        _, _, test_x, _, _, test_y, _, _, test_m, test_dt = load_agg_selected_data_mem(
            data_path=config.data_path,
            x_len=config.x_len,
            y_len=config.y_len,
            foresight=config.foresight,
            cell_ids=config.test_cell_ids,
            dev_ratio=config.dev_ratio,
            test_len=config.test_len,
            seed=config.seed)

        # add dummy data: duplicate the test set x10 so the benchmark is
        # non-trivial in size
        test_x = np.concatenate([test_x] * 10, axis=0)
        test_m = np.concatenate([test_m] * 10, axis=0)
        test_y = np.concatenate([test_y] * 10, axis=0)
        print("Size of x,m,y : {}, {}, {} bytes, total {} GB".format(
            test_x.nbytes, test_m.nbytes, test_y.nbytes,
            (test_x.nbytes + test_m.nbytes + test_y.nbytes) / 1024 / 1024 /
            1024))
        print("Batch Size : {}".format(config.batch_size))

        model = Model(config)
        if config.latest_model:
            model_dir = find_latest_dir(
                os.path.join(config.model, 'model_save/'))
        else:
            if not config.model_dir:
                raise Exception(
                    "model_dir or latest_model=True should be defined in config"
                )
            model_dir = config.model_dir
        model.restore_session(model_dir)

        # always run as batch mode
        test_data = list(zip(test_x, test_m, test_y))
        test_batches = batch_loader(test_data, config.batch_size)
        time_start = time()
        # PERF FIX: gather per-batch predictions and concatenate once at the
        # end; the original np.r_ in the loop re-copied the accumulated array
        # on every iteration (quadratic in the number of batches).
        preds = []
        idx = -1
        for idx, batch in enumerate(test_batches):
            batch_x, batch_m, batch_y = zip(*batch)
            pred, _, _, _, _ = model.eval(batch_x, batch_m, batch_y)
            preds.append(pred)
        total_pred = (np.concatenate(preds, axis=0)
                      if preds else np.empty(shape=(0, test_y.shape[1])))
        print("Batch looped {} times".format(idx + 1))
        time_end = time()
        print("Elapsed Time in Inferencing: {}".format(time_end - time_start))

        result_dir = make_date_dir(os.path.join(config.model, 'results/'))
        np.save(os.path.join(result_dir, 'pred.npy'), total_pred)
        np.save(os.path.join(result_dir, 'test_y.npy'), test_y)
        np.save(os.path.join(result_dir, 'test_dt.npy'), test_dt)
        logger.info("Saving results at {}".format(result_dir))
        logger.info("Testing finished, exit program")
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt propagate.
        logger.exception("ERROR")
def main():
    """Train on aggregated data, keep the best-dev checkpoint, report test metrics."""
    config = Config()
    logger, log_dir = get_logger(os.path.join(config.model, "logs/"))
    logger.info("=======Model Configuration=======")
    logger.info(config.desc)
    logger.info("=================================")
    try:
        train_x, dev_x, test_x, train_y, dev_y, test_y, test_dt = load_agg_data(
            data_path=config.data_path,
            x_len=config.x_len,
            y_len=config.y_len,
            ncells=config.ncells,
            foresight=config.foresight,
            dev_ratio=config.dev_ratio,
            test_len=config.test_len,
            seed=config.seed)
        model = Model(config)
        train_data = list(zip(train_x, train_y))
        no_improv = 0
        # BUG FIX: start from +inf (was the magic number 100). If the first
        # dev loss exceeded 100 no checkpoint was ever written and
        # restore_session below would fail.
        best_loss = float('inf')
        model_dir = make_date_dir(os.path.join(config.model, 'model_save/'))
        result_dir = make_date_dir(os.path.join(config.model, 'results/'))
        logger.info("Start training")

        dev_x = np.asarray(dev_x)
        dev_y = np.asarray(dev_y)
        start_time = time()
        for i in range(config.num_epochs):
            train_batches = batch_loader(train_data, config.batch_size)
            epoch = i + 1
            for batch in train_batches:
                batch_x, batch_y = zip(*batch)
                batch_x = np.asarray(batch_x)
                batch_y = np.asarray(batch_y)
                loss, rse, smape, mae, step = model.train(batch_x, batch_y)
                if step % 100 == 0:
                    logger.info(
                        "epoch: %d, step: %d, loss: %.4f, rse: %.4f, smape: %.4f, mae: %.4f"
                        % (epoch, step, loss, rse, smape, mae))
            # dev score for each epoch (no mini batch)
            _, dev_loss, dev_rse, dev_smape, dev_mae = model.eval(dev_x, dev_y)
            if dev_loss < best_loss:
                best_loss = dev_loss
                no_improv = 0
                logger.info(
                    "New score! : dev_loss: %.4f, dev_rse: %.4f, dev_smape: %.4f, dev_mae: %.4f"
                    % (dev_loss, dev_rse, dev_smape, dev_mae))
                logger.info("Saving model at {}".format(model_dir))
                model.save_session(os.path.join(model_dir, config.model))
            else:
                no_improv += 1
                if no_improv == config.nepoch_no_improv:
                    logger.info("No improvement for %d epochs" % no_improv)
                    # NOTE(review): early-stop break was deliberately left
                    # disabled in the original; training runs all epochs.
        elapsed = time() - start_time

        # generating results (no mini batch)
        model.restore_session(model_dir)
        pred, test_loss, test_rse, test_smape, test_mae = model.eval(
            test_x, test_y)
        logger.info(
            "test_loss: %.4f, test_rse: %.4f, test_smape: %.4f, test_mae: %.4f"
            % (test_loss, test_rse, test_smape, test_mae))
        logger.info("Elapsed training time {0:0.2f} mins".format(elapsed / 60))
        logger.info("Training finished, exit program")
    except Exception:
        logger.exception("ERROR")
# Restore the trained TF session and wrap it as an Analytics-Zoo TFNet for
# distributed inference on Spark.
# NOTE(review): assumes `sess`, `model`, `config`, `model_dir`, `sc`
# (SparkContext), `test_x`, `test_m` and `PARALLELISM` are defined earlier in
# this script — confirm against the full file.
saver = tf.train.Saver()
saver.restore(sess, os.path.join(model_dir, config.model))
tfnet = TFNet.from_session(
    sess,
    inputs=[model.input_x, model.memories],  # dropout is never used
    outputs=[model.predictions])
# Parallelize input arrays into RDDs and pair them up element-wise.
data_x_rdd = sc.parallelize(test_x, PARALLELISM)
data_m_rdd = sc.parallelize(test_m, PARALLELISM)
# create a RDD of Sample (dummy zero label — inference only)
sample_rdd = data_x_rdd.zip(data_m_rdd).map(
    lambda x: Sample.from_ndarray(features=x, labels=np.zeros([1])))
# distributed inference on Spark and return an RDD
outputs = tfnet.predict(sample_rdd,
                        batch_per_thread=config.batch_size,
                        distributed=True)
# check time when trigger actions (predict is lazy; collect() forces execution)
time_start = time.time()
outputs.collect()
time_end = time.time()
print("Elapsed Time in Inferencing: {}".format(time_end - time_start))
result_dir = make_date_dir(os.path.join(config.model, 'zoo_results/'))
# outputs.saveAsTextFile(os.path.join(result_dir, "result.txt"))
def main(opt: argparse, logger: logging, log_dir: str) -> Config:
    """Double-version tuning pipeline: prune metrics, rank knobs, then either
    train DNN surrogates (default) or run RF surrogates + GA search (opt.atr).

    NOTE(review): the annotations ``opt: argparse`` / ``logger: logging`` name
    modules, not types — presumably argparse.Namespace / logging.Logger.
    """
    # Target workload loading
    logger.info("====================== {} mode ====================\n".format(
        opt.persistence))
    logger.info("Target workload name is {}".format(opt.target))
    """ load knob data and IM datas, EM datas. """
    ### data load ###
    knob_data, aggregated_IM_data, aggregated_ops_data, aggregated_latency_data, target_knob_data, ops_target_external_data, latency_target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)

    ### clustering ###
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    # Prune the internal metrics down to a representative subset.
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info(
        "Done pruning metrics for workload {} (# of pruned metrics: {}).\n\n"
        "Pruned metrics: {}\n".format(opt.persistence, len(pruned_metrics),
                                      pruned_metrics))
    metric_idxs = [
        i for i, metric_name in enumerate(aggregated_IM_data['columnlabels'])
        if metric_name in pruned_metrics
    ]
    # Keep only the columns belonging to the pruned metrics.
    ranked_metric_data = {
        'data': aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels': copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels':
        [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }
    """ For example, pruned_metrics : ['allocator_rss_bytes', 'rss_overhead_bytes', 'used_memory_dataset', 'rdb_last_cow_size'] """
    ### KNOBS RANKING STAGE ###
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info("use mode = {}".format(opt.rki))
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info("Done ranking knobs for workload {} (# ranked knobs: {}).\n\n"
                "Ranked knobs: {}\n".format(opt.persistence, len(ranked_knobs),
                                            ranked_knobs))
    # NOTE(review): annotated `dict` but used as a count below — presumably int.
    top_k: dict = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(ranked_knobs, target_knob_data,
                                              top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info("Knob save path : {}".format(knob_save_path))
    logger.info("Choose Top {} knobs : {}".format(
        top_k, top_k_knobs['columnlabels']))
    np.save(os.path.join(knob_save_path, 'knobs_{}.npy'.format(top_k)),
            np.array(top_k_knobs['columnlabels']))

    # In double version: index 0 = ops (throughput), index 1 = latency.
    aggregated_data = [aggregated_ops_data, aggregated_latency_data]
    target_external_data = [
        ops_target_external_data, latency_target_external_data
    ]
    if not opt.atr:
        # DNN path: pre-train one surrogate per external metric.
        model, optimizer = set_model(opt)
        model_save_path = utils.make_date_dir("./model_save")
        logger.info("Model save path : {}".format(model_save_path))
        logger.info("Learning Rate : {}".format(opt.lr))
        best_epoch, best_loss, best_mae = defaultdict(int), defaultdict(
            float), defaultdict(float)
        columns = ['Totals_Ops/sec', 'Totals_p99_Latency']
        ### train dnn ###
        for i in range(2):
            trainDataloader, valDataloader, testDataloader, scaler_y = prepareForTraining(
                opt, top_k_knobs, target_knobs, aggregated_data[i],
                target_external_data[i], i)
            logger.info(
                "====================== {} Pre-training Stage ====================\n"
                .format(opt.model_mode))
            best_epoch[columns[i]], best_loss[columns[i]], best_mae[
                columns[i]] = train(model, trainDataloader, valDataloader,
                                    testDataloader, optimizer, scaler_y, opt,
                                    logger, model_save_path, i)
        for name in best_epoch.keys():
            logger.info(
                "\n\n[{} Best Epoch {}] Best_Loss : {} Best_MAE : {}".format(
                    name, best_epoch[name], best_loss[name], best_mae[name]))
        config = Config(opt.persistence, opt.db, opt.cluster, opt.rki,
                        opt.topk, opt.model_mode, opt.n_epochs, opt.lr)
        # NOTE(review): `name` is whatever key the loop above ended on —
        # presumably 'Totals_p99_Latency'; relies on dict insertion order.
        config.save_double_results(opt.target, best_epoch['Totals_Ops/sec'],
                                   best_epoch[name],
                                   best_loss['Totals_Ops/sec'],
                                   best_loss[name],
                                   best_mae['Totals_Ops/sec'], best_mae[name],
                                   model_save_path, log_dir, knob_save_path)
        return config
    else:
        # ATR path: fit random-forest surrogates, then genetic search.
        models = set_rf_model()
        for i in range(2):
            X_tr, y_train = prepare_ATR_learning(opt, top_k_knobs,
                                                 target_knobs,
                                                 aggregated_data[i],
                                                 target_external_data[i], i)
            models[i].fit(X_tr, y_train)
        pruned_configs, external_datas, defaults, scaler_X, scaler_ys = double_prepareForGA(
            opt, top_k_knobs['columnlabels'])
        current_solution_pools, targets = make_solution_pool(
            opt, pruned_configs, external_datas, defaults)
        fitness_function = RF_fitness
        n_configs = top_k_knobs['columnlabels'].shape[0]
        #set remain ratio
        n_pool_half = opt.n_pool // 2
        #mutation ratio
        mutation = int(n_configs * 0.5)
        GA_options = [n_configs, n_pool_half, mutation]
        top_k_config_path, name, connect = ATR_GA(opt, models, targets,
                                                  top_k_knobs,
                                                  current_solution_pools,
                                                  fitness_function, GA_options,
                                                  scaler_X, scaler_ys, logger)
        if connect:
            server_connection(opt, top_k_config_path, name)
        else:
            logger.info("Because appednfsync is 'always', Fin GA")
            return 0
        import datetime
        #save results
        # Find the first unused result_<persistence>-<date>-NN.csv name.
        i = 0
        today = datetime.datetime.now()
        name = 'result_' + opt.persistence + '-' + today.strftime(
            '%Y%m%d') + '-' + '%02d' % i + '.csv'
        while os.path.exists(os.path.join('./GA_config/', name)):
            i += 1
            name = 'result_' + opt.persistence + '-' + today.strftime(
                '%Y%m%d') + '-' + '%02d' % i + '.csv'
        os.rename(
            f'./GA_config/result_{opt.persistence.lower()}_external_GA.csv',
            './GA_config/' + name)
        logger.info(name)
        df = pd.read_csv('./GA_config/' + name)
        logger.info(df["Totals_Ops/sec"])
        logger.info(df["Totals_p99_Latency"])
def main():
    """CLI entry: pick a model package by name, train it, save test results."""
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="Bin_normal",
        required=True,
        choices=["Bin_normal", "Bin-uniform", "Bin_3d"],
        help="Model selected in the list: Bin_normal, Bin-uniform,Bin_3d")
    # Optional parameters
    # BUG FIX: parse_args() was called twice; a single call suffices.
    args = parser.parse_args()

    if args.model == "Bin_normal":
        from Bin_normal.config import Config
        from Bin_normal.model import Model
        config = Config()
    elif args.model == "Bin_3d":
        from Bin_3d.config import Config
        from Bin_3d.model import Model
        config = Config()
    else:
        # NOTE(review): the remaining choice is "Bin-uniform" but this branch
        # imports the MANN package — confirm the mapping is intentional.
        from MANN.config import Config
        from MANN.model import Model
        config = Config()

    logger = get_logger(os.path.join(config.model, "logs/"))
    logger.info("=======Model Configuration======")
    logger.info(config.desc)
    logger.info("================================")
    try:
        train_x, dev_x, test_x, train_y, dev_y, test_y = load_data_from_csv(
            data_path=config.data_path,
            x_len=config.x_len,
            y_len=config.y_len,
            foresight=config.foresight,
            dev_ratio=config.dev_ratio,
            #test_ratio=config.test_ratio,
            seed=config.seed)
        logger.info("train_x shape: {}, dev_x shape: {}, test_x shape: {}".
                    format(train_x.shape, dev_x.shape, test_x.shape))
        logger.info("train_y shape: {}, dev_y shape: {}, test_y shape: {}".
                    format(train_y.shape, dev_y.shape, test_y.shape))
        model = Model(config)
        train_data = list(zip(train_x, train_y))
        no_improv = 0
        # BUG FIX: start from +inf (was 100) so the first dev evaluation
        # always checkpoints and restore_session below cannot fail.
        best_loss = float('inf')
        model_dir = make_date_dir(os.path.join(config.model, 'model_save/'))
        result_dir = make_date_dir(os.path.join(config.model, 'results/'))
        start_time = time()
        for i in range(config.num_epochs):
            train_batches = batch_loader(train_data, config.batch_size)
            epoch = i + 1
            for batch in train_batches:
                batch_x, batch_y = zip(*batch)
                loss, acc, step = model.train(batch_x, batch_y)
                if step % 100 == 0:
                    logger.info("epoch: %d, step: %d, loss: %4f, acc: %4f" %
                                (epoch, step, loss, acc))
            # dev score for each epoch (no mini batch)
            _, dev_loss, dev_acc = model.eval(dev_x, dev_y)
            if dev_loss < best_loss:
                best_loss = dev_loss
                no_improv = 0
                logger.info("New score! : dev_loss: %4f, dev_acc: %4f" %
                            (dev_loss, dev_acc))
                logger.info("Saving model at {}".format(model_dir))
                model.save_session(os.path.join(model_dir, config.model))
            else:
                no_improv += 1
                if no_improv == config.nepoch_no_improv:
                    logger.info("No improvement for %d epochs" % no_improv)
                    break
        elapsed = time() - start_time

        # generating results (no mini batch)
        model.restore_session(model_dir)
        pred, test_loss, test_acc = model.eval(test_x, test_y)
        logger.info("test_loss: %4f, test_acc: %4f" % (test_loss, test_acc))
        # save results
        np.save(os.path.join(result_dir, 'pred.npy'), pred)
        np.save(os.path.join(result_dir, 'test_y.npy'), test_y)
        logger.info("Saving results at {}".format(result_dir))
        logger.info("Elapsed training time {0:0.4f}".format(elapsed))
        logger.info("Training finished, exit program")
    except Exception:
        logger.exception("ERROR")
def main():
    """Train the PyTorch memory model with early stopping on dev loss."""
    config = Config()
    logger, log_dir = get_logger(os.path.join(config.model, "logs/"))
    logger.info("=======Model Configuration=======")
    logger.info(config.desc)
    logger.info("=================================")
    try:
        train_x, dev_x, test_x, train_y, dev_y, test_y, train_m, dev_m, test_m, test_dt = load_agg_selected_data_mem(
            data_path=config.data_path,
            x_len=config.x_len,
            y_len=config.y_len,
            foresight=config.foresight,
            cell_ids=config.train_cell_ids,
            dev_ratio=config.dev_ratio,
            test_len=config.test_len,
            seed=config.seed)
        model = Model(config)
        if config.allow_gpu:
            model = model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

        # get train data
        TrainDataSet = BatchDataset(train_x, train_m, train_y)
        TrainSampler = tud.RandomSampler(TrainDataSet)
        TrainDataLoader = tud.DataLoader(TrainDataSet,
                                         batch_size=config.batch_size,
                                         sampler=TrainSampler,
                                         num_workers=2)
        # get valid Data (whole dev set evaluated in a single pass)
        dev_x, dev_m, dev_y = torch.Tensor(dev_x), torch.Tensor(
            dev_m), torch.Tensor(dev_y)
        if config.allow_gpu:
            dev_x, dev_m, dev_y = dev_x.cuda(), dev_m.cuda(), dev_y.cuda()

        step = 0
        no_improv = 0
        # BUG FIX: start from +inf (was 100) so the first dev evaluation
        # always checkpoints the model.
        best_loss = float('inf')
        model_dir = make_date_dir(os.path.join(config.model, 'model_save/'))
        logger.info("Start training")
        start_time = time()
        for i in range(config.num_epochs):
            epoch = i + 1
            # train
            model.train()
            for batch_x, batch_m, batch_y in TrainDataLoader:
                step = step + 1
                if config.allow_gpu:
                    batch_x, batch_m, batch_y = batch_x.cuda(), batch_m.cuda(
                    ), batch_y.cuda()
                optimizer.zero_grad()
                prediction, loss, rse, smape, mae = model(
                    batch_x, batch_m, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.clip)
                optimizer.step()
                if step % 100 == 0:
                    logger.info(
                        "epoch: %d, step: %d, loss: %.4f, rse: %.4f, smape: %.4f, mae: %.4f"
                        % (epoch, step, loss, rse, smape, mae))
            # dev score for each epoch (no mini batch)
            with torch.no_grad():
                model.eval()
                prediction, dev_loss, dev_rse, dev_smape, dev_mae = model(
                    dev_x, dev_m, dev_y)
            if dev_loss < best_loss:
                best_loss = dev_loss
                no_improv = 0
                # BUG FIX: build the checkpoint path with os.path.join
                # instead of manual string concatenation.
                torch.save(model,
                           os.path.join(model_dir, config.model + ".pth"))
            else:
                no_improv += 1
                if no_improv == config.nepoch_no_improv:
                    logger.info("No improvement for %d epochs" % no_improv)
                    break
        elapsed = time() - start_time

        # generating results (no mini batch)
        logger.info("Saving model at {}".format(model_dir))
        logger.info("Elapsed training time {0:0.2f} mins".format(elapsed / 60))
        logger.info("Training finished, exit program")
    except Exception:
        logger.exception("ERROR")
def main(opt: argparse, logger: logging, log_dir: str) -> Config:
    """Single-version tuning pipeline: prune metrics, rank knobs, pre-train a
    surrogate model, and persist the best results into a Config.

    NOTE(review): the annotations ``opt: argparse`` / ``logger: logging`` name
    modules, not types — presumably argparse.Namespace / logging.Logger.
    """
    #Target workload loading
    logger.info(
        f"====================== {opt.persistence} mode ====================\n"
    )
    logger.info(f"Target workload name is {opt.target}")
    knob_data, aggregated_IM_data, aggregated_EM_data, target_knob_data, target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    # Prune the internal metrics down to a representative subset.
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info(
        f"Done pruning metrics for workload {opt.persistence} (# of pruned metrics: {len(pruned_metrics)}).\n\n"
        f"Pruned metrics: {pruned_metrics}\n")
    metric_idxs = [
        i for i, metric_name in enumerate(aggregated_IM_data['columnlabels'])
        if metric_name in pruned_metrics
    ]
    # Keep only the columns belonging to the pruned metrics.
    ranked_metric_data = {
        'data': aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels': copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels':
        [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }
    ### KNOBS RANKING STAGE ###
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info(f"use mode = {opt.rki}")
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info(
        f"Done ranking knobs for workload {opt.persistence} (# ranked knobs: {len(ranked_knobs)}).\n\n"
        f"Ranked knobs: {ranked_knobs}\n")
    top_k: int = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(ranked_knobs, target_knob_data,
                                              top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info(f"Knob save path : {knob_save_path}")
    logger.info(f"Choose Top {top_k} knobs : {top_k_knobs['columnlabels']}")
    np.save(os.path.join(knob_save_path, f'knobs_{top_k}.npy'),
            np.array(top_k_knobs['columnlabels']))

    model, optimizer, trainDataloader, valDataloader, testDataloader, scaler_y = prepare_for_training(
        opt, top_k_knobs, target_knobs, aggregated_EM_data,
        target_external_data)
    logger.info(
        f"====================== {opt.model_mode} Pre-training Stage ====================\n"
    )
    best_epoch, best_th_loss, best_la_loss, best_th_mae_loss, best_la_mae_loss, model_path = train(
        model, trainDataloader, valDataloader, testDataloader, optimizer,
        scaler_y, opt, logger)
    logger.info(
        f"\n\n[Best Epoch {best_epoch}] Best_th_Loss : {best_th_loss} Best_la_Loss : {best_la_loss} Best_th_MAE : {best_th_mae_loss} Best_la_MAE : {best_la_mae_loss}"
    )
    config = Config(opt.persistence, opt.db, opt.cluster, opt.rki, opt.topk,
                    opt.model_mode, opt.n_epochs, opt.lr)
    config.save_results(opt.target, best_epoch, best_th_loss, best_la_loss,
                        best_th_mae_loss, best_la_mae_loss, model_path,
                        log_dir, knob_save_path)
    return config
def train(model, trainDataset, valDataset, testDataset, optimizer, scheduler,
          tokenizer):
    """Train using train_epoch, eval_epoch, test_score_model.

    Adopt EarlyStopping checking valid loss: a checkpoint is written whenever
    test accuracy improves, and training stops after 15 consecutive epochs
    without an accuracy improvement.  Relies on the module-level ``args`` and
    ``logger``.
    """
    val_losses = []
    test_accuracy = []
    model_save_path = utils.make_date_dir("./model_save")
    logger.info("Model save path: {}".format(model_save_path))
    # CLEANUP: removed the unused ``best_loss`` local — early stopping is
    # driven purely by test accuracy below.
    best_acc = 0
    patience = 0
    # Regression (1 label) is scored with MSE, classification with CE.
    if args.num_labels == 1:
        test_score = test_MSE_score_model
    else:
        test_score = test_CE_score_model
    for epoch in range(int(args.n_epochs)):
        patience += 1
        logger.info("=====================Train======================")
        train_loss, text_loss, visual_loss, speech_loss, label_loss = train_epoch(
            model, trainDataset, optimizer, scheduler, tokenizer)
        logger.info(
            "[Train Epoch {}] Joint Loss : {} Text Loss : {} Visual Loss : {} Speech Loss : {} Label Loss : {}"
            .format(epoch + 1, train_loss, text_loss, visual_loss, speech_loss,
                    label_loss))
        logger.info("=====================Valid======================")
        valid_loss, text_loss, visual_loss, speech_loss, label_loss, preds, labels = eval_epoch(
            model, valDataset, optimizer, scheduler, tokenizer)
        logger.info(
            "[Val Epoch {}] Joint Loss : {} Text Loss : {} Visual Loss : {} Speech Loss : {} Label Loss : {}"
            .format(epoch + 1, valid_loss, text_loss, visual_loss, speech_loss,
                    label_loss))
        logger.info("=====================Test======================")
        test_acc, test_mae, test_f_score = test_score(preds, labels)
        logger.info(
            "[Epoch {}] Test_ACC : {}, Test_MAE : {}, Test_F_Score: {}".format(
                epoch + 1, test_acc, test_mae, test_f_score))
        if test_acc > best_acc:
            torch.save(
                model.state_dict(),
                os.path.join(model_save_path,
                             'model_' + str(epoch + 1) + ".pt"))
            best_acc = test_acc
            patience = 0
        if patience == 15:
            break
        val_losses.append(valid_loss)
        test_accuracy.append(test_acc)
def main():
    """CLI entry: select a forecasting model, train it, plot and save results."""
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="LSTNet",
        required=True,
        choices=["LSTNet", "MANN", "AR_reg", "AR", "AR_mem"],
        help="Model selected in the list: LSTNet, MANN, AR_reg, AR, AR_mem")
    # Optional parameters
    args = parser.parse_args()

    if args.model == "LSTNet":
        from LSTNet.config import Config
        from LSTNet.model import Model
        config = Config()
    elif args.model == "MANN":
        from MANN.config import Config
        from MANN.model import Model
        config = Config()
    elif args.model == "AR_reg":
        from AR_reg.config import Config
        from AR_reg.model import Model
        config = Config()
    elif args.model == "AR_mem":
        from AR_mem.config import Config
        from AR_mem.model import Model
        config = Config()
    elif args.model == "AR":
        from AR.config import Config
        from AR.model import Model
        config = Config()

    COL_LIST = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    logger = get_logger(os.path.join(config.model, "logs/"))
    logger.info("=======Model Configuration======")
    logger.info(config.desc)
    logger.info("================================")
    try:
        train_x, dev_x, test_x, train_y, dev_y, test_y, train_m, dev_m, test_m, test_dt = load_data_mem(
            data_path=config.data_path,
            x_col_list=COL_LIST,
            y_col_list=COL_LIST,
            x_len=config.x_len,
            y_len=config.y_len,
            mem_len=config.mem_len,
            foresight=config.foresight,
            dev_ratio=config.dev_ratio,
            test_len=config.test_len,
            seed=config.seed)
        model = Model(config)
        train_data = list(zip(train_x, train_m, train_y))
        no_improv = 0
        # BUG FIX: start from +inf (was 100) so the first dev evaluation
        # always checkpoints and restore_session below cannot fail.
        best_loss = float('inf')
        model_dir = make_date_dir(os.path.join(config.model, 'model_save/'))
        result_dir = make_date_dir(os.path.join(config.model, 'results/'))
        logger.info("Start training")

        dev_x = np.asarray(dev_x)
        dev_y = np.asarray(dev_y)
        start_time = time()
        for i in range(config.num_epochs):
            train_batches = batch_loader(train_data, config.batch_size)
            epoch = i + 1
            for batch in train_batches:
                batch_x, batch_m, batch_y = zip(*batch)
                loss, rmse, rse, smape, mae, step = model.train(
                    batch_x, batch_m, batch_y)
                if step % 100 == 0:
                    logger.info(
                        "epoch: %d, step: %d, loss: %4f, rmse: %4f, rse: %4f, smape: %4f, mae: %4f"
                        % (epoch, step, loss, rmse, rse, smape, mae))
            # dev score for each epoch (no mini batch)
            _, dev_loss, dev_rmse, dev_rse, dev_smape, dev_mae = model.eval(
                dev_x, dev_m, dev_y)
            if dev_loss < best_loss:
                best_loss = dev_loss
                no_improv = 0
                logger.info(
                    "New score! : dev_loss: %4f, rmse: %4f, dev_rse: %4f, dev_smape: %4f, dev_mae: %4f"
                    % (dev_loss, dev_rmse, dev_rse, dev_smape, dev_mae))
                logger.info("Saving model at {}".format(model_dir))
                model.save_session(os.path.join(model_dir, config.model))
            else:
                no_improv += 1
                if no_improv == config.nepoch_no_improv:
                    logger.info("No improvement for %d epochs" % no_improv)
                    break
        elapsed = time() - start_time

        # generating results (no mini batch)
        model.restore_session(model_dir)
        pred, test_loss, test_rmse, test_rse, test_smape, test_mae = model.eval(
            test_x, test_m, test_y)
        logger.info(
            "test_loss: %4f, test_rmse: %4f, test_rse: %4f, test_smape: %4f, test_mae: %4f"
            % (test_loss, test_rmse, test_rse, test_smape, test_mae))
        # save results
        np.save(os.path.join(result_dir, 'pred.npy'), pred)
        np.save(os.path.join(result_dir, 'test_y.npy'), test_y)
        # BUG FIX: test_dt.npy was written with the test_y array instead of
        # the matching datetime index.
        np.save(os.path.join(result_dir, 'test_dt.npy'), test_dt)
        logger.info("Saving results at {}".format(result_dir))
        logger.info("Elapsed training time {0:0.2f}".format(elapsed / 60))
        logger.info("Training finished, exit program")

        # Plot actual vs. predicted series.
        t = np.linspace(0, pred.shape[0], num=pred.shape[0])
        plt.rcParams['figure.figsize'] = [20, 4]
        plt.plot(t, test_y, "r", alpha=0.5)
        plt.plot(t, pred, "b")
        plt.legend(("actual", "pred"), loc="upper left")
        plt.grid()
        # BUG FIX: savefig must precede show() — show() releases the current
        # figure, so the file saved afterwards was blank.
        plt.savefig(os.path.join(config.model, "image/figure.png"))
        plt.show()
    except Exception:
        logger.exception("ERROR")