import math
import time

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm


def train(model, trainX, trainTE, trainY, valX, valTE, valY, mean, std):
    num_train = trainX.shape[0]
    min_loss = float('inf')
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    # Alternative schedule:
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #     optimizer, milestones=[10, 15], gamma=0.2)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=5, verbose=False,
        threshold=0.001, threshold_mode='rel', cooldown=0, min_lr=2e-6, eps=1e-08)
    for epoch in tqdm(range(1, args.max_epoch + 1)):
        model.train()
        train_l_sum, batch_count, start = 0.0, 0, time.time()
        # Shuffle the training set each epoch.
        permutation = np.random.permutation(num_train)
        trainX = trainX[permutation]
        # trainTE = trainTE[permutation]  # temporal embeddings unused in this variant
        trainY = trainY[permutation]
        num_batch = math.ceil(num_train / args.batch_size)
        with tqdm(total=num_batch) as pbar:
            for batch_idx in range(num_batch):
                start_idx = batch_idx * args.batch_size
                end_idx = min(num_train, (batch_idx + 1) * args.batch_size)
                X = torch.from_numpy(trainX[start_idx:end_idx]).float().to(device)
                y = torch.from_numpy(trainY[start_idx:end_idx]).float().to(device)
                # te = torch.from_numpy(trainTE[start_idx:end_idx]).to(device)
                optimizer.zero_grad()
                y_hat = model(X)
                # De-normalize the prediction before computing the loss;
                # labels are in the original scale.
                loss = _compute_loss(y, y_hat * std + mean)
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 5)
                optimizer.step()
                train_l_sum += loss.cpu().item()
                batch_count += 1
                pbar.update(1)
        log_string(log, 'epoch %d, lr %.6f, loss %.4f, time %.1f sec'
                   % (epoch, optimizer.param_groups[0]['lr'],
                      train_l_sum / batch_count, time.time() - start))
        mae, rmse, mape = res(model, valX, valTE, valY, mean, std)
        # Step the plateau scheduler on the average validation MAE
        # (the last entry returned by res).
        lr_scheduler.step(mae[-1])
        if mae[-1] < min_loss:
            min_loss = mae[-1]
            torch.save(model, args.model_file)
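# `_compute_loss` is not defined in this fragment. Below is a minimal sketch,
# assuming it is a masked MAE (a common choice in traffic forecasting, where
# zero readings mark missing sensor values); the masking convention is an
# assumption, not confirmed by the source.
def _compute_loss(y_true, y_pred):
    # Weight valid (non-zero) entries so missing values do not affect the loss.
    mask = (y_true != 0).float()
    mask = mask / torch.mean(mask)
    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
    loss = torch.abs(y_pred - y_true) * mask
    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
    return torch.mean(loss)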
def res(model, valX, valTE, valY, mean, std):
    model.eval()  # evaluation mode: disables dropout
    num_val = valX.shape[0]
    pred = []
    label = []
    num_batch = math.ceil(num_val / args.batch_size)
    with torch.no_grad():
        for batch_idx in range(num_batch):
            start_idx = batch_idx * args.batch_size
            end_idx = min(num_val, (batch_idx + 1) * args.batch_size)
            X = torch.from_numpy(valX[start_idx:end_idx]).float().to(device)
            y = valY[start_idx:end_idx]
            # te = torch.from_numpy(valTE[start_idx:end_idx]).to(device)
            y_hat = model(X)
            # De-normalize predictions; labels stay in the original scale.
            pred.append(y_hat.cpu().numpy() * std + mean)
            label.append(y)
    pred = np.concatenate(pred, axis=0)
    label = np.concatenate(label, axis=0)
    maes, rmses, mapes, wapes = [], [], [], []
    # Score each of the 12 prediction horizons separately.
    for i in range(12):
        mae, rmse, mape, wape = metric(pred[:, i, :], label[:, i, :])
        maes.append(mae)
        rmses.append(rmse)
        mapes.append(mape)
        wapes.append(wape)
        log_string(log, 'step %d, mae: %.4f, rmse: %.4f, mape: %.4f, wape: %.4f'
                   % (i + 1, mae, rmse, mape, wape))
    # Average over all horizons, appended as the last entry.
    mae, rmse, mape, wape = metric(pred, label)
    maes.append(mae)
    rmses.append(rmse)
    mapes.append(mape)
    wapes.append(wape)
    log_string(log, 'average, mae: %.4f, rmse: %.4f, mape: %.4f, wape: %.4f'
               % (mae, rmse, mape, wape))
    return np.stack(maes, 0), np.stack(rmses, 0), np.stack(mapes, 0)
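# `metric` is an external helper. A minimal sketch of the four scores it
# returns (MAE, RMSE, MAPE, WAPE) on NumPy arrays, assuming zeros are treated
# as missing values; the exact masking rule is an assumption.
def metric(pred, label):
    mask = label != 0
    err = pred[mask] - label[mask]
    mae = np.mean(np.abs(err))
    rmse = np.sqrt(np.mean(err ** 2))
    mape = np.mean(np.abs(err) / np.abs(label[mask]))
    wape = np.sum(np.abs(err)) / np.sum(np.abs(label[mask]))
    return mae, rmse, mape, wape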
parser.add_argument('--traffic_file', default='data/PeMS.h5', help='traffic file')
parser.add_argument('--SE_file', default='data/SE(PeMS).txt', help='spatial embedding file')
parser.add_argument('--model_file', default='data/GMAN(PeMS)', help='save the model to disk')
parser.add_argument('--log_file', default='data/log(PeMS)', help='log file')
args = parser.parse_args()

start = time.time()
log = open(args.log_file, 'w')
utils.log_string(log, str(args)[10:-1])

# load data
utils.log_string(log, 'loading data...')
(trainX, trainTE, trainY, valX, valTE, valY,
 testX, testTE, testY, SE, mean, std) = utils.loadData(args)
utils.log_string(log, 'trainX: %s\ttrainY: %s' % (trainX.shape, trainY.shape))
utils.log_string(log, 'valX: %s\t\tvalY: %s' % (valX.shape, valY.shape))
utils.log_string(log, 'testX: %s\t\ttestY: %s' % (testX.shape, testY.shape))
utils.log_string(log, 'data loaded!')

# train model
utils.log_string(log, 'compiling model...')
T = 24 * 60 // args.time_slot  # number of time slots per day
num_train, _, N = trainX.shape
X, TE, label, is_training = model.placeholder(args.P, args.Q, N)
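# `model.placeholder` belongs to the TensorFlow (TF1-style) GMAN code and is
# not shown here. A plausible sketch of the four placeholders it creates,
# inferred from how X, TE, label, and is_training are used downstream; the
# shapes (P input steps, Q output steps, N sensors, 2 temporal features) are
# assumptions.
import tensorflow as tf

def placeholder(P, Q, N):
    X = tf.compat.v1.placeholder(tf.float32, shape=(None, P, N), name='X')
    TE = tf.compat.v1.placeholder(tf.int32, shape=(None, P + Q, 2), name='TE')
    label = tf.compat.v1.placeholder(tf.float32, shape=(None, Q, N), name='label')
    is_training = tf.compat.v1.placeholder(tf.bool, shape=(), name='is_training')
    return X, TE, label, is_training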
parser.add_argument('--traffic_file', default='**.npz', help='traffic file')
parser.add_argument('--SE_file', default='**.npy', help='spatial embedding file')
parser.add_argument('--model_file', default='PEMS', help='save the model to disk')
parser.add_argument('--log_file', default='log(PEMS)', help='log file')
args = parser.parse_args()

log = open(args.log_file, 'w')
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")

log_string(log, "loading data....")
(trainX, trainTE, trainY, valX, valTE, valY,
 testX, testTE, testY, SE, mean, std) = loadPEMSData(args)
SE = torch.from_numpy(SE).to(device)
log_string(log, "loading end....")
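# `loadPEMSData` is defined elsewhere. A minimal sketch of the expected
# contract, assuming an .npz traffic array under the key 'data' with shape
# (num_steps, N, F), sliding windows of P past / Q future steps, a 70/10/20
# train/val/test split, and z-score normalization of the inputs; every one of
# these specifics is an assumption, not confirmed by the source.
import numpy as np

def loadPEMSData(args):
    data = np.load(args.traffic_file)['data'][..., 0]  # (num_steps, N), flow feature
    SE = np.load(args.SE_file)                         # (N, D) spatial embedding
    X, Y = [], []
    for i in range(data.shape[0] - args.P - args.Q + 1):
        X.append(data[i: i + args.P])
        Y.append(data[i + args.P: i + args.P + args.Q])
    X, Y = np.stack(X), np.stack(Y)
    train_end, val_end = round(0.7 * len(X)), round(0.8 * len(X))
    trainX, trainY = X[:train_end], Y[:train_end]
    valX, valY = X[train_end:val_end], Y[train_end:val_end]
    testX, testY = X[val_end:], Y[val_end:]
    # Normalize inputs with training-set statistics only.
    mean, std = trainX.mean(), trainX.std()
    trainX = (trainX - mean) / std
    valX = (valX - mean) / std
    testX = (testX - mean) / std
    # Temporal embeddings are unused in this variant (the te lines are commented out).
    trainTE = valTE = testTE = None
    return (trainX, trainTE, trainY, valX, valTE, valY,
            testX, testTE, testY, SE, mean, std)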
parser.add_argument('--traffic_file', default='data/PeMS.h5', help='traffic file')
parser.add_argument('--SE_file', default='data/SE(PeMS).txt', help='spatial embedding file')
parser.add_argument('--model_file', default='data/GMAN(PeMS)', help='pre-trained model')
parser.add_argument('--log_file', default='data/log(PeMS)', help='log file')
args = parser.parse_args()

start = time.time()
log = open(args.log_file, 'w')
utils.log_string(log, str(args)[10:-1])

# load data
utils.log_string(log, 'loading data...')
(trainX, trainTE, trainY, valX, valTE, valY,
 testX, testTE, testY, SE, mean, std) = utils.loadData(args)
num_train, num_val, num_test = trainX.shape[0], valX.shape[0], testX.shape[0]
utils.log_string(log, 'trainX: %s\ttrainY: %s' % (trainX.shape, trainY.shape))
utils.log_string(log, 'valX: %s\t\tvalY: %s' % (valX.shape, valY.shape))
utils.log_string(log, 'testX: %s\t\ttestY: %s' % (testX.shape, testY.shape))
utils.log_string(log, 'data loaded!')

# test model
utils.log_string(log, '**** testing model ****')
utils.log_string(log, 'loading model from %s' % args.model_file)
graph = tf.Graph()
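# The rest of the test script is not shown. A minimal sketch of how the
# pre-trained model might be restored into `graph`, assuming it was written
# by a TF1-style tf.train.Saver under args.model_file; the output tensor name
# 'pred:0' is a hypothetical placeholder.
with graph.as_default():
    saver = tf.compat.v1.train.import_meta_graph(args.model_file + '.meta')
with tf.compat.v1.Session(graph=graph) as sess:
    saver.restore(sess, args.model_file)
    pred = graph.get_tensor_by_name('pred:0')  # assumed name of the prediction op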
def main(args):
    '''create dirs for experiments, checkpoints, and logs'''
    experiment_dir = Path('./experiment/')
    experiment_dir.mkdir(exist_ok=True)
    checkpoints_dir = Path('./experiment/checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = Path('./experiment/logs/')
    log_dir.mkdir(exist_ok=True)

    ctx = [mxnet.gpu(gpu_id) for gpu_id in args.gpu]

    '''initialize the network'''
    net = MVRNN(cnn_arch='vgg11_bn', cnn_feature_length=4096,
                num_views=args.num_views, num_class=args.num_classes,
                pretrained=True, pretrained_cnn=args.pretrained_cnn, ctx=ctx)
    if args.checkpoint:
        net.load_parameters(args.checkpoint, ctx=ctx)
    else:
        net.initialize(init=init.MSRAPrelu(), ctx=ctx)
    net.hybridize()

    '''set grad_req to 'add' to manually aggregate gradients'''
    net.collect_params().setattr('grad_req', 'add')
    net._cnn2.collect_params().setattr('lr_mult', args.output_lr_mult)

    '''setup loss function'''
    loss_fun = gluon.loss.SoftmaxCrossEntropyLoss(
        sparse_label=not args.label_smoothing)

    '''load dataset'''
    transform = Compose([
        ToTensor(),
        Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    train_ds = MultiViewImageDataset(os.path.join(args.dataset_path, 'train'),
                                     args.num_views, transform=transform)
    test_ds = MultiViewImageDataset(os.path.join(args.dataset_path, 'test'),
                                    args.num_views, transform=transform)
    train_data = gluon.data.DataLoader(train_ds, args.batch_size, shuffle=True,
                                       last_batch='keep', num_workers=4)
    test_data = gluon.data.DataLoader(test_ds, args.batch_size, shuffle=False,
                                      last_batch='keep', num_workers=4)

    current_time = datetime.datetime.now()
    time_str = '%d-%d-%d--%d-%d-%d' % (
        current_time.year, current_time.month, current_time.day,
        current_time.hour, current_time.minute, current_time.second)
    log_filename = time_str + '.txt'
    checkpoint_name = 'checkpoint_' + time_str
    checkpoint_dir = Path(os.path.join(checkpoints_dir, checkpoint_name))
    checkpoint_dir.mkdir(exist_ok=True)

    with open(os.path.join(log_dir, log_filename), 'w') as log_out:
        kv = mxnet.kv.create('device')
        utils.log_string(log_out, sys.argv[0])
        utils.train(net, train_data, test_data, loss_fun, kv, log_out,
                    str(checkpoint_dir), args)
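# `utils.train` is defined elsewhere. Because grad_req is set to 'add', each
# backward() accumulates into the parameter gradients instead of overwriting
# them, so several batches can be aggregated before one optimizer step, after
# which the gradients must be zeroed manually. A minimal sketch of that loop,
# assuming a single context and a hypothetical gluon.Trainer; the optimizer,
# learning rate, and `accum_steps` are all assumptions.
from mxnet import autograd, gluon

def train_sketch(net, train_data, loss_fun, kv, ctx, batch_size, accum_steps=2):
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': 1e-4}, kvstore=kv)
    for i, (views, label) in enumerate(train_data):
        views = views.as_in_context(ctx[0])
        label = label.as_in_context(ctx[0])
        with autograd.record():
            loss = loss_fun(net(views), label)
        loss.backward()  # adds into .grad buffers because grad_req='add'
        if (i + 1) % accum_steps == 0:
            trainer.step(accum_steps * batch_size)
            for param in net.collect_params().values():
                param.zero_grad()  # must be cleared manually with grad_req='add'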