def load_data_val_6(testList, W, index, batch):
    x_train_1 = []
    x_train_2 = []
    x_train_3 = []
    for i in range(0, batch):
        true_index = index + i
        # clamp the index so the last batch is padded with the final sample
        if true_index >= len(testList):
            true_index = len(testList) - 1
        items = testList[true_index].split(' ')
        q_words = items[2].split('_')
        a_words = items[3].split('_')
        x_train_1_words = []
        x_train_2_words = []
        x_train_3_words = []
        # use a separate loop variable so the outer batch index is not shadowed
        for j in range(50):
            x_train_1_words.append(ModelUtils.build_text_image(W, q_words[j], padding=1))
            x_train_2_words.append(ModelUtils.build_text_image(W, a_words[j], padding=1))
            x_train_3_words.append(ModelUtils.build_text_image(W, a_words[j], padding=1))
        x_train_1.append(np.array(x_train_1_words).reshape((50, 300)))
        x_train_2.append(np.array(x_train_2_words).reshape((50, 300)))
        x_train_3.append(np.array(x_train_3_words).reshape((50, 300)))
    return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
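# ModelUtils.build_text_image is not shown in this file; it is assumed to map a
# single token to its 300-d embedding row in W, falling back to zeros for
# padding/unknown tokens. A minimal sketch under that assumption (hypothetical,
# not the project's actual implementation; the padding flag is kept only for
# signature compatibility):
import numpy as np

def build_text_image(W, word, padding=1):
    # W is assumed to be dict-like: token -> 300-d embedding vector
    if word in W:
        return np.asarray(W[word], dtype=np.float32)
    return np.zeros(300, dtype=np.float32)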
def _validation(self, valid_loader):
    with torch.no_grad():
        self.model.eval()
        loss_tracker = LossTracker()
        pred_scores = []
        true_scores = []
        for i, (img, label) in enumerate(valid_loader):
            img = torch.cat(img)
            img = img.to(cfg.device)
            label = torch.cat(label)
            label = label.to(cfg.device)
            classification_output = self.model(img)
            loss = self.criterion(classification_output, label)
            ModelUtils.append_results(label, classification_output, pred_scores, true_scores)
            loss_tracker.increment_loss(loss)
            if i % 100 == 0:
                weighted_AUC = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
                loss_tracker.print_losses(self.epoch, i, len(valid_loader), weighted_AUC)
        weighted_AUC = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
        loss_dict = loss_tracker.write_dict(weighted_AUC)
        loss_tracker.print_losses(self.epoch, i, len(valid_loader), weighted_AUC)
        self.writer.write_scalars(loss_dict, tag='val', n_iter=self.train_step)
    self.scheduler.step(metrics=loss_tracker.loss.avg)
    lr = self.optimizer.param_groups[-1]['lr']
    self.writer.write_scalars({'lr': lr}, tag='val', n_iter=self.train_step)
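# LossTracker is project-specific and not shown here; _validation relies on it
# exposing increment_loss, write_dict, print_losses and a running average at
# loss_tracker.loss.avg. A minimal sketch under those assumptions (hypothetical):
class _AverageMeter:
    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val):
        self.sum += val
        self.count += 1
        self.avg = self.sum / self.count

class LossTracker:
    def __init__(self):
        self.loss = _AverageMeter()

    def increment_loss(self, loss):
        # .item() detaches the scalar from the autograd graph before accumulating
        self.loss.update(loss.item())

    def write_dict(self, weighted_auc):
        return {'loss': self.loss.avg, 'wAUC': weighted_auc}

    def print_losses(self, epoch, step, total, weighted_auc):
        print('Epoch %d [%d/%d] loss=%.4f wAUC=%.4f'
              % (epoch, step, total, self.loss.avg, weighted_auc))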
def load_data_6(W, alist, raw, size):
    x_train_1 = []
    x_train_2 = []
    x_train_3 = []
    for i in range(0, size):
        # sample a random (question, positive answer) pair and a random negative answer
        items = raw[random.randint(0, len(raw) - 1)]
        nega = rand_qa(alist)
        q_words = items[2].split('_')
        a_words = items[3].split('_')
        neg_words = nega.split('_')
        x_train_1_words = []
        x_train_2_words = []
        x_train_3_words = []
        # use a separate loop variable so the outer sample index is not shadowed
        for j in range(50):
            x_train_1_words.append(ModelUtils.build_text_image(W, q_words[j], padding=1))
            x_train_2_words.append(ModelUtils.build_text_image(W, a_words[j], padding=1))
            x_train_3_words.append(ModelUtils.build_text_image(W, neg_words[j], padding=1))
        x_train_1.append(np.array(x_train_1_words).reshape((50, 300)))
        x_train_2.append(np.array(x_train_2_words).reshape((50, 300)))
        x_train_3.append(np.array(x_train_3_words).reshape((50, 300)))
    return np.array(x_train_1), np.array(x_train_2), np.array(x_train_3)
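# rand_qa is assumed to draw one random answer string from the answer pool to
# serve as the negative sample of the (question, positive, negative) triplet;
# a one-line sketch under that assumption (hypothetical):
import random

def rand_qa(alist):
    return alist[random.randint(0, len(alist) - 1)]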
def _train(self, train_loader):
    # Show and log loss results every cfg.log_freq steps
    loss_tracker = LossTracker()
    pred_scores = []
    true_scores = []
    self.model.train()
    print('Epoch: {} : LR = {}'.format(self.epoch, self.lr))
    for i, img in enumerate(train_loader):
        self.train_step += 1
        # Split the batch into 4 sub-batches, such that each sub-batch contains either the cover,
        # JUNIWARD, JMiPOD or UERD version of each image.
        batch_splits, labels = ModelUtils.batch_splitter(img, num_imgs=4)
        classification_out = [self.model(sub_batch) for sub_batch in batch_splits]
        losses = [self.criterion(c, l) for c, l in zip(classification_out, labels)]
        total_loss = sum(losses) / len(losses)
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        for batch_labs, batch_res in zip(labels, classification_out):
            ModelUtils.append_results(batch_labs, batch_res, pred_scores, true_scores)
        loss_tracker.increment_loss(total_loss)
        if i % cfg.log_freq == 0 and i > 0:
            weighted_AUC = CompetitionMetric.alaska_weighted_auc(true_scores, pred_scores)
            pred_scores = []
            true_scores = []
            loss_dict = loss_tracker.write_dict(weighted_AUC)
            loss_tracker.print_losses(self.epoch, i, len(train_loader), weighted_AUC)
            loss_tracker = LossTracker()  # Reinitialize the loss tracking
            self.writer.write_scalars(loss_dict, tag='train', n_iter=self.train_step)
        if i % cfg.save_freq == 0 and i > 0:
            ModelUtils.save_model(self)
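# ModelUtils.batch_splitter is not shown here. A minimal sketch of what it is
# assumed to do, given that the loader yields (images, labels) with one cover
# and three stego versions per item, i.e. images of shape
# (batch, num_imgs, C, H, W) (shapes and device handling are assumptions):
def batch_splitter(batch, num_imgs=4):
    images, labels = batch
    sub_batches = [images[:, k].to(cfg.device) for k in range(num_imgs)]
    sub_labels = [labels[:, k].to(cfg.device) for k in range(num_imgs)]
    return sub_batches, sub_labels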
def main(self):
    if cfg.device.type == 'cuda':
        # Large Model Support is only available in IBM's PyTorch builds
        torch.cuda.set_enabled_lms(True)
        cudnn.benchmark = True
    ModelUtils.load_model(self)
    self.model = self.model.to(cfg.device)
    test_data = TestDataLoader(img_root=os.path.join(cfg.data_path, 'test'))
    self.data_loader = data.DataLoader(test_data, batch_size=cfg.batch_size,
                                       shuffle=False, num_workers=cfg.num_workers)
    print(f'Testing {len(test_data)} images')
    results = self._infere()
    self._write_results(results)
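# _infere is referenced but not shown. A sketch of what such an inference pass
# might look like, under the assumption that class index 0 is the cover class
# and the submission needs a per-image "stego" probability (hypothetical):
def _infere(self):
    self.model.eval()
    results = []
    with torch.no_grad():
        for img in self.data_loader:
            img = img.to(cfg.device)
            out = self.model(img)
            # P(stego) = 1 - P(cover), assuming class index 0 is the cover class
            prob = 1.0 - torch.softmax(out, dim=1)[:, 0]
            results.extend(prob.cpu().numpy().tolist())
    return results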
def main(self):
    if cfg.device.type == 'cuda':
        if cfg.enable_lms:
            torch.cuda.set_enabled_lms(True)
        cudnn.benchmark = True
    # Loads a model only if there is an existing file in cfg.save_path
    ModelUtils.load_model(self)
    if self.train_step == 0:
        SrmFiltersSetter.initialize_filters(self.model)
        print('SRM High Pass filters initialized')
    self.model = self.model.to(cfg.device)
    train_data = TrainDataLoader(
        img_root=os.path.join(cfg.data_path, 'train'),
        transform=AugmentatedTransform(sizes=cfg.input_size),
        is_training=True)
    train_loader = data.DataLoader(train_data, batch_size=cfg.batch_size,
                                   shuffle=True, num_workers=cfg.num_workers)
    val_data = TrainDataLoader(
        img_root=os.path.join(cfg.data_path, 'train'),
        transform=BaseTransform(sizes=cfg.input_size),
        is_training=False)
    val_loader = data.DataLoader(val_data, batch_size=cfg.batch_size,
                                 shuffle=False, num_workers=cfg.num_workers)
    print(f'Training on {len(train_data)} images')
    print('Start training.')
    for _ in range(self.epoch, cfg.max_epoch):
        self._train(train_loader)
        self.epoch += 1
        if self.epoch % cfg.val_freq == 0:
            self._validation(val_loader)
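# SrmFiltersSetter.initialize_filters presumably seeds the first convolution
# with fixed SRM high-pass kernels before training. A sketch using the
# well-known 5x5 "KV" residual kernel; the layer name model.srm_conv and the
# single-kernel setup are assumptions (real SRM banks use ~30 kernels):
_KV = torch.tensor([[-1.,  2.,  -2.,  2., -1.],
                    [ 2., -6.,   8., -6.,  2.],
                    [-2.,  8., -12.,  8., -2.],
                    [ 2., -6.,   8., -6.,  2.],
                    [-1.,  2.,  -2.,  2., -1.]]) / 12.0

def initialize_filters(model):
    with torch.no_grad():
        # broadcast the kernel over all output/input channels of the first conv
        w = model.srm_conv.weight  # assumed shape: (out_ch, in_ch, 5, 5)
        w.copy_(_KV.expand_as(w))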
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()
    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())
    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []
    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()
    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invariant_data = pkl.load(input_file)
    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)
    # get optimizer config
    optimizer_config = config['optimizer']
    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'],
        optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        raise Exception('No folder of a training run has been found for "%s"' % output_path)

    ds = xr.Dataset()
    # cross validation
    for run in range(config['runs']):
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))
        stations = sorted(config['stations'])
        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]
        # get all inits
        all_inits_set = set(config['inits'])
        # get train and test inits
        train_inits_set = set([t[1] for t in train_fold])
        test_inits_set = set([t[1] for t in test_fold])
        # get all filtered inits
        filtered_inits = set(
            [init for init in all_inits_set if init not in train_inits_set and init not in test_inits_set])
        # make sure that the sets are pairwise disjoint and cover all inits
        assert filtered_inits ^ train_inits_set ^ test_inits_set == all_inits_set
        init_type_mapping = {}
        for init in train_inits_set:
            init_type_mapping[init] = 'train'
        for init in test_inits_set:
            init_type_mapping[init] = 'test'
        for init in filtered_inits:
            init_type_mapping[init] = 'filtered'
        all_inits = sorted(list(all_inits_set))
        all_data = [(station, init) for init in all_inits for station in stations]
        n_data_points = len(all_data)
        # keep mappings from init and station to index of the result numpy array
        station_index_dict = {}
        for station_idx, station in enumerate(stations):
            station_index_dict[station] = station_idx
        init_index_dict = {}
        for init_idx, init in enumerate(all_inits):
            init_index_dict[init] = init_idx
        # initialize the dataloader over the complete data set
        dataset = DataLoaders.ErrorPredictionCosmoData(
            config=config,
            station_data_dict=data_dictionary,
            files=all_data,
            featureScaling=featureScaleFunctions,
            time_invariant_data=time_invariant_data)
        dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=False,
                                num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        # initialize network, optimizer and loss function
        net = Baseline.model_factory(model_dict=ModelDict, params=dataset.n_parameters,
                                     time_invariant_params=dataset.n_grid_time_invariant_parameters,
                                     grid=config['grid_size'], prediction_times=config['prediction_times'])
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
        optimizer = optim.SGD(net.parameters(), lr=optimizer_config['learning_rate'],
                              momentum=optimizer_config['momentum'])
        net, optimizer, *_ = ModelUtils.load_checkpoint(output_path + '/stored_models/run_%s' % run,
                                                        model=net, optimizer=optimizer)
        if torch.cuda.is_available():
            net.cuda()
        # we do not train, but only output the evaluation of the network on train and test data
        net.eval()
        # initialize the result array of errors per init and station and fill it with NaN
        run_error_statistics = np.empty((len(init_index_dict), len(station_index_dict), 5))
        run_error_statistics.fill(np.nan)
        # loop over complete data set
        for i, data in enumerate(dataloader, 0):
            try:
                # get batch, e.g. label, cosmo-1 output and time inv. features for station
                DATA = data
                # DATA has only length 4 if we do not use the station time invariant features
                if len(DATA) == 4:
                    Blabel, Bip2d, BTimeData, init_station_temp = DATA
                    station_time_inv_input = None
                elif len(DATA) == 5:
                    Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                    station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                else:
                    raise Exception('Unknown data format for training...')
                input = ModelUtils.getVariable(Bip2d).float()
                time_data = ModelUtils.getVariable(BTimeData).float()
                target = ModelUtils.getVariable(Blabel).float()
            except TypeError:
                # when the batch size is small, it could happen that all labels have been corrupted
                # and therefore collate_fn would return an empty list
                print('Value error...')
                continue
            out = net(input, time_data, station_time_inv_input).squeeze()
            target = target.squeeze()
            diff = (out - target).squeeze()
            for item in range(Blabel.shape[0]):
                init = init_station_temp[0][item]
                station = init_station_temp[1][item].item()
                cosmo_temperature = init_station_temp[2][item].item()
                target_temperature = init_station_temp[3][item].item()
                station_idx = station_index_dict[station]
                init_idx = init_index_dict[init]
                run_error_statistics[init_idx, station_idx, :] = np.array(
                    (out[item].item(), cosmo_temperature, target[item].item(), diff[item].item(),
                     target_temperature))
            processed_samples = (i + 1) * int(config['batch_size'])
            if (i + 1) % np.max((1, ((n_data_points // config['batch_size']) // 100))) == 0:
                print("%s samples have been processed. [%2.1f%%]"
                      % (processed_samples, (processed_samples / n_data_points) * 100))
                sys.stdout.flush()
        da = xr.DataArray(run_error_statistics, dims=('init', 'station', 'data'),
                          coords=[all_inits, stations,
                                  ['prediction', 'cosmo', 'target', 'difference', 'target_temperature']])
        da = da.sortby(variables='init')
        da.attrs['init_type_mapping'] = sorted(list(init_type_mapping.items()))
        ds['run_%s' % run] = da
        ds.attrs['config'] = config
        print('Error results of run %s have been processed.' % run)
        # flush output to see progress
        sys.stdout.flush()
    if not os.path.exists(output_path):
        raise Exception('No folder of a training run has been found for "%s"' % output_path)
    # dump experiment statistic
    with open(output_path + '/model_run_error.pkl', 'wb') as handle:
        pkl.dump(ds, handle, protocol=pkl.HIGHEST_PROTOCOL)
    # print program execution time
    m, s = divmod(time() - program_start_time, 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
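# ModelUtils.load_checkpoint restores the stored model of a finished run; its
# exact behaviour is not shown. A sketch consistent with the save_checkpoint
# call in the training script below (the file name 'model_best.pth.tar' is an
# assumption borrowed from the common PyTorch checkpoint idiom):
def load_checkpoint(path, model, optimizer):
    checkpoint = torch.load(os.path.join(path, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['run']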
def model_eva(stock, state_dt, para_window, para_dc_window):
    if DBUtils.select_ev_result(state_dt, stock):
        print('Already ev:' + stock + ':' + state_dt)
        return 0
    # build the evaluation date sequence; para_window is the length of the backtest window in days
    ev_start = Utils.date2d(Utils.to_date(state_dt) - datetime.timedelta(days=para_window))
    ev_end = state_dt
    date_temp = DBUtils.get_stock_calender(ev_start, ev_end)
    ev_dt_seq = [Utils.d2date(x) for x in date_temp]
    # clear the intermediate evaluation table model_ev_mid
    DBUtils.clear_ev_mid()
    return_flag = 0
    # start the backtest; para_dc_window is the length of the time window needed for
    # data preprocessing when building the model
    for d in range(len(ev_dt_seq)):
        dc_start_dt = Utils.d2date(Utils.to_date(ev_dt_seq[d]) - datetime.timedelta(days=para_dc_window))
        dc_end_dt = ev_dt_seq[d]
        try:
            dc = DC.data_collect(stock, dc_start_dt, dc_end_dt)
            if len(set(dc.data_target)) <= 1:
                print('WARN: DC target contains fewer than two classes.')
                continue
        except Exception as exp:
            print("DC Error")
            print(exp)
            return_flag = 1
            break
        train = dc.data_train
        target = dc.data_target
        test_case = [dc.test_case]
        aresult = ModelUtils.use_svm(train, target, test_case)
        # insert the prediction into the intermediate table
        DBUtils.insert_predict(dc_end_dt, stock, aresult)
    if return_flag == 1:
        print('WARN: something maybe wrong... when svm')
        return -1
    # refresh the true values in the intermediate table
    for i in range(len(ev_dt_seq)):
        r = DBUtils.update_ev_mid_with_real(stock, ev_dt_seq[i])
        if r != 0:
            print('WARN: break ev mid with real:' + stock)
            break
    # compute recall
    recall = DBUtils.count_recall()
    # compute precision
    acc = DBUtils.count_acc()
    # compute precision on negative samples
    acc_neg = DBUtils.count_acc_neg()
    # compute the F1 score
    f1 = Utils.count_F1(acc, recall)
    # store the evaluation results in the result table model_ev_resu
    predict = DBUtils.get_predict(ev_dt_seq[-1])
    DBUtils.insert_ev_result(state_dt, stock, acc, recall, f1, acc_neg, 'svm', predict)
    print(str(state_dt) + ' Precision : ' + str(acc) + ' Recall : ' + str(recall)
          + ' F1 : ' + str(f1) + ' Acc_Neg : ' + str(acc_neg))
    return 1
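# ModelUtils.use_svm is assumed to fit a support-vector classifier on the
# collected window and predict the single held-out test case; a minimal sketch
# with scikit-learn under that assumption (hypothetical defaults):
from sklearn import svm

def use_svm(train, target, test_case):
    model = svm.SVC()
    model.fit(train, target)
    return model.predict(test_case)[0]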
def runModel(config, data_dictionary, data_statistics, train_test_folds):
    program_start_time = time()
    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())
    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []
    # update general static model information
    experiment_info = config
    experiment_info['model'] = ModelDict
    experiment_info['code_commit'] = ModelUtils.get_git_revision_short_hash()
    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invariant_data = pkl.load(input_file)
    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)
    # get optimizer config
    optimizer_config = config['optimizer']
    # generate output path for experiment information
    setting_string = '%s_grid_%s_bs_%s_tf_%s_optim_%s_lr_%s_sl_%s' % (
        config['model']['name'], config['grid_size'], config['batch_size'], config['test_fraction'],
        optimizer_config['algorithm'], optimizer_config['learning_rate'], config['slice_size'])
    output_path = '%s/%s' % (config['experiment_path'], setting_string)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # time for the set-up until the first run
    experiment_info['set_up_time'] = time() - program_start_time
    print('[Time]: Set-up %s' % strftime("%H:%M:%S", gmtime(experiment_info['set_up_time'])))
    sys.stdout.flush()
    # initialize statistics
    error_statistics = None
    run_times = None
    skip_statistics = None
    if 'per_station_rmse' in config:
        error_per_station_statistics = None
    # keep used learning rates
    experiment_info['scheduled_learning_rates'] = []
    # cross validation
    for run in range(config['runs']):
        # logger for tensorboardX
        train_logger = Logger(output_path + '/logs/run_%s/train' % run)
        test_logger = Logger(output_path + '/logs/run_%s/test' % run)
        print('[Run %s] Cross-validation test fold %s' % (str(run + 1), str(run + 1)))
        # take the right preprocessed train/test data set for the current run
        train_fold, test_fold = train_test_folds[run]
        # initialize best epoch test error
        best_epoch_test_rmse = float("inf")
        # use a different data loader if we want to train a 3-NN model approach
        if "knn" in ModelDict:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoData3NNData(
                config=config, station_data_dict=data_dictionary, files=train_fold,
                featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
            testset = DataLoaders.CosmoData3NNData(
                config=config, station_data_dict=data_dictionary, files=test_fold,
                featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        else:
            # initialize train and test dataloaders
            trainset = DataLoaders.CosmoDataGridData(
                config=config, station_data_dict=data_dictionary, files=train_fold,
                featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
            trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                                     num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
            testset = DataLoaders.CosmoDataGridData(
                config=config, station_data_dict=data_dictionary, files=test_fold,
                featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
            testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                                    num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
        # initialize network, optimizer and loss function
        net = Baseline.model_factory(ModelDict, trainset.n_parameters,
                                     trainset.n_grid_time_invariant_parameters, config['grid_size'],
                                     config['prediction_times'])
        # store class name
        experiment_info['model_class'] = net.__class__.__name__
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
        if torch.cuda.is_available():
            net.cuda()
        # load number of train and test samples
        n_train_samples, n_test_samples = len(train_fold), len(test_fold)
        optimizer, scheduler = ModelUtils.initializeOptimizer(optimizer_config, net)
        criterion = nn.MSELoss()
        # keep number of processed samples over all epochs for tensorboard
        processed_train_samples_global = 0
        processed_test_samples_global = 0
        # start learning
        for epoch in range(config['epochs']):
            epoch_train_time = np.zeros((5,))
            epoch_start_time = time()
            print('Epoch: ' + str(epoch + 1) +
                  '\n------------------------------------------------------------')
            # adapt learning rate and store information in experiment attributes
            if scheduler is not None:
                scheduler.step()
                if run == 0:
                    experiment_info['scheduled_learning_rates'] += scheduler.get_lr()
                print('Using learning rate %s' % str(scheduler.get_lr()))
            # TRAINING
            # initialize variables for epoch statistics
            LABELS, MODELoutputs, COSMOoutputs = None, None, None
            processed_train_samples = 0
            net.train(True)
            train_start_time = time()
            time_end = time()  # initialize so the first batch's waiting time is defined
            # loop over complete train set
            for i, data in enumerate(trainloader, 0):
                time_start = time()
                try:
                    # get training batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for training...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()
                except TypeError:
                    # when the batch size is small, it could happen that all labels have been corrupted
                    # and therefore collate_fn would return an empty list
                    print('Value error...')
                    continue
                time_after_data_preparation = time()
                processed_train_samples += len(Blabel)
                optimizer.zero_grad()
                out = net(input, time_data, station_time_inv_input)
                time_after_forward_pass = time()
                loss = criterion(out, target)
                loss.backward()
                optimizer.step()
                time_after_backward_pass = time()
                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))
                time_after_label_stack = time()
                if (i + 1) % 64 == 0:
                    print('Sample: %s \t Loss: %s' % (processed_train_samples, float(np.sqrt(loss.item()))))
                    # ============ TensorBoard logging ============ #
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }
                    for tag, value in info.items():
                        train_logger.scalar_summary(tag, value,
                                                    processed_train_samples_global + processed_train_samples)
                    # (2) Log values and gradients of the parameters (histogram)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        train_logger.histo_summary(tag, ModelUtils.to_np(value), i + 1)
                        train_logger.histo_summary(tag + '/grad', ModelUtils.to_np(value.grad), i + 1)
                epoch_train_time += np.array((time_start - time_end,
                                              time_after_data_preparation - time_start,
                                              time_after_forward_pass - time_after_data_preparation,
                                              time_after_backward_pass - time_after_forward_pass,
                                              time_after_label_stack - time_after_backward_pass))
                time_end = time()
            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            epoch_train_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_train_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            # update global processed samples
            processed_train_samples_global += processed_train_samples
            if np.isnan(epoch_train_rmse_model).any():
                print("Learning rate too large resulted in NaN-error while training. Stopped training...")
                return
            # print epoch training times, averaged over the processed batches
            print('Timing: Waiting on data=%s, Data Preparation=%s, '
                  'Forward Pass=%s, Backward Pass=%s, Data Stacking=%s'
                  % tuple(list(epoch_train_time / (i + 1))))
            # RMSE of epoch
            print('Train/test statistic for epoch: %s' % str(epoch + 1))
            print('Train RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_train_rmse_cosmo[idx]) for idx in range(len(epoch_train_rmse_cosmo))]))
            print('Train RMSE Model: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_train_rmse_model[idx]) for idx in range(len(epoch_train_rmse_model))]))
            sys.stdout.flush()
            train_time = time() - train_start_time
            # TESTING
            test_start_time = time()
            LABELS, MODELoutputs, COSMOoutputs, STATION = None, None, None, None
            processed_test_samples = 0
            net.eval()
            for i, data in enumerate(testloader, 0):
                try:
                    # get test batch, e.g. label, cosmo-1 output and time inv. features for station
                    DATA = data
                    # DATA has only length 4 if we do not use the station time invariant features
                    if len(DATA) == 4:
                        Blabel, Bip2d, BTimeData, init_station_temp = DATA
                        station_time_inv_input = None
                    elif len(DATA) == 5:
                        Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                        station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
                    else:
                        raise Exception('Unknown data format for testing...')
                    input = ModelUtils.getVariable(Bip2d).float()
                    time_data = ModelUtils.getVariable(BTimeData).float()
                    target = ModelUtils.getVariable(Blabel).float()
                except TypeError:
                    # when the batch size is small, it could happen that all labels have been corrupted
                    # and therefore collate_fn would return an empty list
                    print('Value error...')
                    continue
                processed_test_samples += len(Blabel)
                out = net(input, time_data, station_time_inv_input)
                loss = criterion(out, target)
                if LABELS is None:
                    LABELS = Blabel.data
                    MODELoutputs = out.data
                    COSMOoutputs = init_station_temp[2].data
                    STATION = init_station_temp[1].data
                else:
                    LABELS = np.vstack((LABELS, Blabel.data))
                    MODELoutputs = np.vstack((MODELoutputs, out.data))
                    COSMOoutputs = np.vstack((COSMOoutputs, init_station_temp[2].data))
                    STATION = np.hstack((STATION, init_station_temp[1].data))
                if (i + 1) % 16 == 0:
                    # ============ TensorBoard logging ============ #
                    # (1) Log the scalar values
                    info = {
                        setting_string: np.sqrt(loss.item()),
                    }
                    for tag, value in info.items():
                        test_logger.scalar_summary(tag, value,
                                                   processed_test_samples_global + processed_test_samples)
            # calculate error statistic of current epoch
            diff_model = MODELoutputs - LABELS
            diff_cosmo = COSMOoutputs - LABELS
            # rmse
            epoch_test_rmse_model = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_model, axis=0)
            epoch_test_rmse_cosmo = np.apply_along_axis(func1d=ModelUtils.rmse, arr=diff_cosmo, axis=0)
            overall_test_rmse_model = ModelUtils.rmse(diff_model)
            overall_test_rmse_cosmo = ModelUtils.rmse(diff_cosmo)
            # mae
            epoch_test_mae_model = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_model, axis=0)
            epoch_test_mae_cosmo = np.apply_along_axis(func1d=ModelUtils.mae, arr=diff_cosmo, axis=0)
            overall_test_mae_model = ModelUtils.mae(diff_model)
            overall_test_mae_cosmo = ModelUtils.mae(diff_cosmo)
            # calculate per-station rmse if desired (especially for the K-fold station generalization experiment)
            if "per_station_rmse" in config:
                max_station_id = 1435
                squared_errors_per_epoch = np.array((np.square(diff_model), np.square(diff_cosmo))).squeeze()
                # the highest station index in the data is 1435, thus we expect at least 1436 entries,
                # which we can access by station id
                test_samples_per_station = np.bincount(STATION, minlength=max_station_id + 1)
                model_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[0],
                                                              minlength=max_station_id + 1)
                cosmo_squared_error_per_station = np.bincount(STATION, weights=squared_errors_per_epoch[1],
                                                              minlength=max_station_id + 1)
                # set division by zero/NaN warning to 'ignore'
                np.seterr(divide='ignore', invalid='ignore')
                # calculate rmse per station
                rmse_per_station = np.vstack(
                    (np.sqrt(np.divide(model_squared_error_per_station, test_samples_per_station)),
                     np.sqrt(np.divide(cosmo_squared_error_per_station, test_samples_per_station)))).T
                # set division by zero/NaN warning back to 'warn'
                np.seterr(divide='warn', invalid='warn')
            # update global processed samples
            processed_test_samples_global += processed_test_samples
            # RMSE of epoch
            print('Test RMSE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_cosmo[idx]) for idx in range(len(epoch_test_rmse_cosmo))]),
                " (Overall: %s)" % overall_test_rmse_cosmo)
            print('Test RMSE Model: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_rmse_model[idx]) for idx in range(len(epoch_test_rmse_model))]),
                " (Overall: %s)" % overall_test_rmse_model)
            # MAE of epoch
            print('Test MAE COSMO: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_cosmo[idx]) for idx in range(len(epoch_test_mae_cosmo))]),
                " (Overall: %s)" % overall_test_mae_cosmo)
            print('Test MAE Model: ', ", ".join(
                ["T=%s: %s" % (idx, epoch_test_mae_model[idx]) for idx in range(len(epoch_test_mae_model))]),
                " (Overall: %s)" % overall_test_mae_model)
            sys.stdout.flush()
            test_time = time() - test_start_time
            # time for epoch
            epoch_time = time() - epoch_start_time
            # update error statistics
            error_statistics = ModelUtils.updateErrorStatistic(
                error_statistics, np.array([epoch_train_rmse_model, epoch_test_rmse_model])[None, None, ...],
                run, epoch, config['prediction_times'])
            # update run time statistic
            run_times = ModelUtils.updateRuntimeStatistic(
                run_times, np.array([epoch_time, train_time, test_time])[None, None, ...], run, epoch)
            # update skip statistic
            skip_statistics = ModelUtils.updateSkipStatistic(
                skip_statistics, np.array([n_train_samples, processed_train_samples,
                                           n_test_samples, processed_test_samples])[None, None, ...],
                run, epoch)
            # update per-station rmse data array over runs if desired
            # (especially for the K-fold station generalization experiment)
            if "per_station_rmse" in config:
                error_per_station_statistics = ModelUtils.updatePerStationErrorStatistic(
                    error_per_station_statistics, rmse_per_station, run, epoch, np.arange(max_station_id + 1))
            # store the model if it was the best yet
            is_best = overall_test_rmse_model <= best_epoch_test_rmse
            best_epoch_test_rmse = min(overall_test_rmse_model, best_epoch_test_rmse)
            ModelUtils.save_checkpoint({
                'epoch': epoch,
                'run': run,
                'arch': net.__class__.__name__,
                'state_dict': net.state_dict(),
                'overall_test_rmse': overall_test_rmse_model,
                'lead_test_rmse': overall_test_rmse_model,
                'best_epoch_test_rmse': best_epoch_test_rmse,
                'optimizer': optimizer.state_dict(),
            }, is_best, output_path + '/stored_models/run_%s' % run)
            # flush output to see progress
            sys.stdout.flush()
    # update statistics dict
    ModelUtils.get_model_details(experiment_info, net, optimizer, criterion)
    # complete program runtime
    experiment_info['program_runtime'] = time() - program_start_time
    # generate data set of all experiment statistics and additional information
    experiment_statistic = xr.Dataset({
        'error_statistic': error_statistics,
        'run_time_statistic': run_times,
        'samples_statistic': skip_statistics}).assign_attrs(experiment_info)
    # dump experiment statistic
    with open(output_path + '/experiment_statistic.pkl', 'wb') as handle:
        pkl.dump(experiment_statistic, handle, protocol=pkl.HIGHEST_PROTOCOL)
    if 'per_station_rmse' in config:
        # dump per-station statistic
        with open(output_path + '/rmse_per_station.pkl', 'wb') as handle:
            pkl.dump(error_per_station_statistics, handle, protocol=pkl.HIGHEST_PROTOCOL)
    # print program execution time
    m, s = divmod(experiment_info['program_runtime'], 60)
    h, m = divmod(m, 60)
    print('Experiment has successfully finished in %dh %02dmin %02ds' % (h, m, s))
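# ModelUtils.rmse and ModelUtils.mae, as used with np.apply_along_axis above,
# are assumed to reduce a vector of errors to a scalar; minimal sketches:
import numpy as np

def rmse(diff):
    return float(np.sqrt(np.mean(np.square(diff))))

def mae(diff):
    return float(np.mean(np.abs(diff)))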
from sklearn.linear_model import LinearRegression

import utils.AlphaVantageUtils as av
import utils.PostgresUtils as pg
import utils.ModelUtils as mdl

name = pg.get_symbol_name(av._TIC_MICROSOFT)
df_prices = pg.get_prices_with_features(av._TIC_MICROSOFT, av._INT_DAILY, None, None, None)
df_prices.drop(columns=['open', 'high', 'low', 'volume'], inplace=True)
df_prices.info()  # info() prints its summary itself and returns None

df_train, df_test = mdl.train_test_split(df_prices, 1000)

predictions = []
train = df_train.drop(pg._COL_DATETIME, axis=1)
test = df_test.drop(pg._COL_DATETIME, axis=1)
print(train.shape)
print(test.shape)

train_X = train.drop(pg._COL_CLOSE, axis=1)
train_y = train[pg._COL_CLOSE]
print(train_X.iloc[0:1, :])
test_X = test.drop(pg._COL_CLOSE, axis=1)
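# The script stops before fitting; a natural continuation under the same
# variable names is sketched below (hypothetical, not part of the original):
import numpy as np
from sklearn.metrics import mean_squared_error

test_y = test[pg._COL_CLOSE]
model = LinearRegression()
model.fit(train_X, train_y)
predictions = model.predict(test_X)
print('Test RMSE: %.4f' % np.sqrt(mean_squared_error(test_y, predictions)))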
def CreateData(config, data_dictionary, data_statistics, train_test_folds):
    # assign all program arguments to local variables
    with open(config['model']['path']) as handle:
        ModelDict = json.loads(handle.read())
    # check if station and grid time invariant features should be used and set the list of desired parameters
    if not ('grid_time_invariant' in ModelDict and ModelDict['grid_time_invariant']):
        config['grid_time_invariant_parameters'] = []
    if not ('station_time_invariant' in ModelDict and ModelDict['station_time_invariant']):
        config['station_parameters'] = []
    # if needed, load time invariant features
    with open("%s/%s/grid_size_%s/time_invariant_data_per_station.pkl" % (
            config['input_source'], config['preprocessing'], config['original_grid_size']), "rb") as input_file:
        time_invariant_data = pkl.load(input_file)
    # initialize feature scaling function for each feature
    featureScaleFunctions = DataUtils.getFeatureScaleFunctions(ModelUtils.ParamNormalizationDict, data_statistics)
    # add revision short hash to the config
    config['code_commit'] = ModelUtils.get_git_revision_short_hash()
    # take the right preprocessed train/test data set for the first run
    train_fold, test_fold = train_test_folds[0]
    # initialize train and test dataloaders
    trainset = DataLoaders.CosmoDataGridData(
        config=config, station_data_dict=data_dictionary, files=train_fold,
        featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
    trainloader = DataLoader(trainset, batch_size=config['batch_size'], shuffle=True,
                             num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
    testset = DataLoaders.CosmoDataGridData(
        config=config, station_data_dict=data_dictionary, files=test_fold,
        featureScaling=featureScaleFunctions, time_invariant_data=time_invariant_data)
    testloader = DataLoader(testset, batch_size=config['batch_size'], shuffle=True,
                            num_workers=config['n_loaders'], collate_fn=DataLoaders.collate_fn)
    # loop over complete train set
    train_data = None
    train_inits = []
    train_stations = None
    for i, data in enumerate(trainloader, 0):
        try:
            # get training batch, e.g. label, cosmo-1 output and time inv. features for station
            DATA = data
            # DATA has only length 4 if we do not use the station time invariant features
            if len(DATA) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = DATA
                station_time_inv_input = None
            elif len(DATA) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()
            if station_time_inv_input is not None:
                batch_data = np.concatenate(
                    (input.squeeze(), station_time_inv_input, time_data, target, init_station_temp[2]), axis=1)
            else:
                batch_data = np.concatenate(
                    (input.squeeze(), time_data, target, init_station_temp[2]), axis=1)
            train_inits += init_station_temp[0]
            if train_data is None:
                train_data = batch_data
                train_stations = init_station_temp[1]
            else:
                train_data = np.vstack((train_data, batch_data))
                train_stations = np.hstack((train_stations, init_station_temp[1]))
        except TypeError:
            # when the batch size is small, it could happen that all labels have been corrupted
            # and therefore collate_fn would return an empty list
            print('Value error...')
            continue
    # define column names for the data frame
    column_names = [
        'Pressure', 'Wind U-Comp.', 'Wind V-Comp.', 'Wind VMAX', '2m-Temperature', 'Temp. of Dew Point',
        'Cloud Coverage (High)', 'Cloud Coverage (Medium)', 'Cloud Coverage (Low)', 'Tot. Precipitation',
        'ALB_RAD', 'ASOB', 'ATHB', 'HPBL', '2m-Temperature (Lead=0)'
    ]
    column_names += [
        'Grid Height', 'Grid-Station Height Diff.', 'Fraction of Land', 'Soiltype', 'Latitude',
        'Longitude', 'Grid-Station 2d Distance'
    ]
    if train_data.shape[1] >= 31:
        column_names += ['Station Height', 'Station Latitude', 'Station Longitude']
    column_names += ['Hour (Cosine)', 'Hour (Sine)', 'Month (Cosine)', 'Month (Sine)', 'Lead-Time']
    column_names += ['Target 2m-Temp.']
    column_names += ['COSMO 2m-Temp.']
    train_keys = pd.DataFrame.from_dict({'Station': train_stations, 'Init': train_inits})
    train_data = pd.DataFrame(data=train_data, columns=column_names)
    train_ds = pd.concat([train_keys, train_data], axis=1)
    # loop over complete test set
    test_data = None
    test_inits = []
    test_stations = None
    for i, data in enumerate(testloader, 0):
        try:
            # get test batch, e.g. label, cosmo-1 output and time inv. features for station
            DATA = data
            # DATA has only length 4 if we do not use the station time invariant features
            if len(DATA) == 4:
                Blabel, Bip2d, BTimeData, init_station_temp = DATA
                station_time_inv_input = None
            elif len(DATA) == 5:
                Blabel, Bip2d, BTimeData, StationTimeInv, init_station_temp = DATA
                station_time_inv_input = ModelUtils.getVariable(StationTimeInv).float()
            else:
                raise Exception('Unknown data format for training...')
            input = ModelUtils.getVariable(Bip2d).float()
            time_data = ModelUtils.getVariable(BTimeData).float()
            target = ModelUtils.getVariable(Blabel).float()
            if station_time_inv_input is not None:
                batch_data = np.concatenate(
                    (input.squeeze(), station_time_inv_input, time_data, target, init_station_temp[2]), axis=1)
            else:
                batch_data = np.concatenate(
                    (input.squeeze(), time_data, target, init_station_temp[2]), axis=1)
            test_inits += init_station_temp[0]
            if test_data is None:
                test_data = batch_data
                test_stations = init_station_temp[1]
            else:
                test_data = np.vstack((test_data, batch_data))
                test_stations = np.hstack((test_stations, init_station_temp[1]))
        except TypeError:
            # when the batch size is small, it could happen that all labels have been corrupted
            # and therefore collate_fn would return an empty list
            print('Value error...')
            continue
    test_keys = pd.DataFrame.from_dict({'Station': test_stations, 'Init': test_inits})
    test_data = pd.DataFrame(data=test_data, columns=column_names)
    test_ds = pd.concat([test_keys, test_data], axis=1)
    network_ready_data_path = config['input_source'] + '/network_ready_data'
    if not os.path.exists(network_ready_data_path):
        os.makedirs(network_ready_data_path)
    network_ready_train_data_path = network_ready_data_path + '/train_data'
    network_ready_test_data_path = network_ready_data_path + '/test_data'
    train_ds.to_pickle(network_ready_train_data_path)
    test_ds.to_pickle(network_ready_test_data_path)
    # shap-specific config entries for analysis in a jupyter notebook
    config['train_data_path'] = network_ready_data_path + '/train_data'
    config['test_data_path'] = network_ready_data_path + '/test_data'
    # dump config
    with open(network_ready_data_path + '/config.pkl', 'wb') as handle:
        pkl.dump(config, handle, protocol=pkl.HIGHEST_PROTOCOL)
    print('Network ready data analysis successfully executed.')
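# DataLoaders.collate_fn, referenced by every DataLoader above, is assumed to
# drop samples the dataset marks as corrupted (returned as None) and to hand an
# all-corrupted batch back as an empty list, which is what triggers the
# TypeError branches in the loops above; a minimal sketch:
from torch.utils.data.dataloader import default_collate

def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        # an empty list fails tuple-unpacking upstream, raising TypeError
        return []
    return default_collate(batch)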
elif options.script == 'validatePreprocessing':
    print('Starting to run %s' % options.script)
    S = [x for x in range(144)]
    ValidatePreprocessing.GetData(S, G, withTopo, DB, DE, T, isLocal)

# main methods to run neural network model runs
# this requires >=1 model config file in the "models" folder of an experiment and an
# "experiment_parameters.txt" file
# sample model configs can be found under /results/runs/neural_network
elif options.script == 'runModel':
    # take the time of the start of the experiment
    experiment_start = time()
    # this method executes all prerequisite steps to run a model, i.e. preparation of the run config,
    # generating/loading data splits, loading the data into a dictionary for "station" preprocessed data,
    # and loading the data statistics of features for normalization
    config, train_test_folds, data_dictionary, data_statistics = ModelUtils.setUpModelRun(options=options, G=G)
    print('Starting to run %s' % options.script)
    # validation method of feature normalization, can be ignored
    if options.model_type == "featureNormalizationValidation":
        ValidateFeatureNormalization.runModel(
            config=config,
            data_dictionary=data_dictionary,
            data_statistics=data_statistics,
            train_test_folds=train_test_folds)
        print('Finished validation of feature normalization.')
    # all model runs, or experiments with a similar setup
    else:
        # get all paths of model configuration files. This allows several models to be defined and run at a time
        models = [