Example #1
def train_main():
    if (start_epoch == 0) and (not os.path.exists(params_path)):  # training from scratch: create the params directory
        os.makedirs(params_path)
        print('create params directory %s' % (params_path), flush=True)
    elif (start_epoch == 0) and (os.path.exists(params_path)):
        shutil.rmtree(params_path)
        os.makedirs(params_path)
        print('delete the old one and create params directory %s' %
              (params_path),
              flush=True)
    elif (start_epoch > 0) and (os.path.exists(params_path)):  # resuming training: the original directory must already exist
        print('train from params directory %s' % (params_path), flush=True)
    else:
        raise SystemExit('start_epoch > 0 but params directory %s does not exist!' % params_path)

    criterion = nn.L1Loss().to(DEVICE)  # define the loss function
    optimizer = optim.Adam(net.parameters(),
                           lr=learning_rate)  # define the optimizer over all network parameters
    sw = SummaryWriter(logdir=params_path, flush_secs=5)

    total_param = 0
    print('Net\'s state_dict:', flush=True)
    for param_tensor in net.state_dict():
        print(param_tensor,
              '\t',
              net.state_dict()[param_tensor].size(),
              flush=True)
        total_param += np.prod(net.state_dict()[param_tensor].size())
    print('Net\'s total params:', total_param, flush=True)

    print('Optimizer\'s state_dict:')
    for var_name in optimizer.state_dict():
        print(var_name, '\t', optimizer.state_dict()[var_name], flush=True)

    global_step = 0
    best_epoch = 0
    best_val_loss = np.inf

    # train model
    if start_epoch > 0:

        params_filename = os.path.join(params_path,
                                       'epoch_%s.params' % start_epoch)

        net.load_state_dict(torch.load(params_filename))

        print('start epoch:', start_epoch, flush=True)

        print('load weight from: ', params_filename, flush=True)

    start_time = time()

    for epoch in range(start_epoch, epochs):

        params_filename = os.path.join(params_path, 'epoch_%s.params' % epoch)

        # apply model on the validation data set
        val_loss = compute_val_loss(net, val_loader, criterion, sw, epoch)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(net.state_dict(), params_filename)
            print('save parameters to file: %s' % params_filename, flush=True)

        net.train()  # ensure dropout layers are in train mode

        train_start_time = time()

        for batch_index, batch_data in enumerate(train_loader):

            encoder_inputs, decoder_inputs, labels = batch_data

            encoder_inputs = encoder_inputs.transpose(-1, -2)  # (B, N, T, F)

            decoder_inputs = decoder_inputs.unsqueeze(-1)  # (B, N, T, 1)

            labels = labels.unsqueeze(-1)

            optimizer.zero_grad()

            outputs = net(encoder_inputs, decoder_inputs)

            loss = criterion(outputs, labels)

            loss.backward()

            optimizer.step()

            training_loss = loss.item()

            global_step += 1

            sw.add_scalar('training_loss', training_loss, global_step)

        print('epoch: %s, training time for one pass over the data: %.2fs' %
              (epoch, time() - train_start_time),
              flush=True)
        print('epoch: %s, total time:%.2fs' % (epoch, time() - start_time),
              flush=True)

    print('best epoch:', best_epoch, flush=True)

    print('apply the best val model on the test data set ...', flush=True)

    predict_main(best_epoch, test_loader, test_target_tensor, _max, _min,
                 'test')

    # fine tune the model
    optimizer = optim.Adam(net.parameters(), lr=learning_rate * 0.1)
    print('fine tune the model ... ', flush=True)
    for epoch in range(epochs, epochs + fine_tune_epochs):

        params_filename = os.path.join(params_path, 'epoch_%s.params' % epoch)

        net.train()  # ensure dropout layers are in train mode

        train_start_time = time()

        for batch_index, batch_data in enumerate(train_loader):

            encoder_inputs, decoder_inputs, labels = batch_data

            encoder_inputs = encoder_inputs.transpose(-1, -2)  # (B, N, T, F)

            decoder_inputs = decoder_inputs.unsqueeze(-1)  # (B, N, T, 1)

            labels = labels.unsqueeze(-1)
            predict_length = labels.shape[2]  # T

            optimizer.zero_grad()

            encoder_output = net.encode(encoder_inputs)

            # autoregressive decoding: start from the first decoder input and feed
            # the predictions made so far back in as the next decoder inputs
            decoder_start_inputs = decoder_inputs[:, :, :1, :]
            decoder_input_list = [decoder_start_inputs]

            for step in range(predict_length):
                decoder_inputs = torch.cat(decoder_input_list, dim=2)
                predict_output = net.decode(decoder_inputs, encoder_output)
                decoder_input_list = [decoder_start_inputs, predict_output]

            loss = criterion(predict_output, labels)

            loss.backward()

            optimizer.step()

            training_loss = loss.item()

            global_step += 1

            sw.add_scalar('training_loss', training_loss, global_step)

        print('epoch: %s, training time for one pass over the data: %.2fs' %
              (epoch, time() - train_start_time),
              flush=True)
        print('epoch: %s, total time:%.2fs' % (epoch, time() - start_time),
              flush=True)

        # apply model on the validation data set
        val_loss = compute_val_loss(net, val_loader, criterion, sw, epoch)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(net.state_dict(), params_filename)
            print('save parameters to file: %s' % params_filename, flush=True)

    print('best epoch:', best_epoch, flush=True)

    print('apply the best val model on the test data set ...', flush=True)

    predict_main(best_epoch, test_loader, test_target_tensor, _max, _min,
                 'test')
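
Note: the fine-tuning stage above abandons teacher forcing and decodes autoregressively; each step concatenates the start token with the predictions made so far and re-runs the decoder on the whole prefix. A minimal sketch of that pattern in isolation, where decode_step is a hypothetical stand-in for net.decode:

import torch

def autoregressive_decode(decode_step, start_inputs, encoder_output, predict_length):
    # start_inputs: (B, N, 1, 1), the single start token for the decoder
    inputs = start_inputs
    for _ in range(predict_length):
        # each step re-predicts the entire output prefix from the inputs so far
        predictions = decode_step(inputs, encoder_output)
        inputs = torch.cat([start_inputs, predictions], dim=2)
    return predictions  # (B, N, predict_length, 1)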
Example #2
File: train.py  Project: zhhhzhang/ASTGCN
    net = model(num_for_predict, all_backbones)
    net.initialize(ctx=ctx)
    # run one forward pass so Gluon's deferred shape inference allocates the parameters
    for val_w, val_d, val_r, val_t in val_loader:
        net([val_w, val_d, val_r])
        break
    # then re-initialize them with the custom initializer
    net.initialize(ctx=ctx, init=MyInit(), force_reinit=True)

    # initialize a trainer to train model
    trainer = gluon.Trainer(net.collect_params(), optimizer, {'learning_rate': learning_rate})

    # initialize a SummaryWriter to write information into logs dir
    sw = SummaryWriter(logdir=params_path, flush_secs=5)

    # compute validation loss before training
    compute_val_loss(net, val_loader, loss_function, sw, 0)

    # compute testing set MAE, RMSE, MAPE before training
    evaluate(net, test_loader, true_value, num_of_vertices, sw, 0)

    # train model
    global_step = 1
    for epoch in range(1, epochs + 1):
        
        for train_w, train_d, train_r, train_t in train_loader:
            
            start_time = time()
            
            with autograd.record():
                output = net([train_w, train_d, train_r])
                l = loss_function(output, train_t)
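
Example #2 is cut off inside the autograd.record() scope. In Gluon, a step of this shape typically continues by backpropagating the per-sample loss and letting the trainer apply the update; a minimal sketch of the likely continuation, not the project's verbatim code:

            with autograd.record():
                output = net([train_w, train_d, train_r])
                l = loss_function(output, train_t)
            l.backward()                    # MXNet backpropagates vector losses directly
            trainer.step(train_t.shape[0])  # normalize the update by the batch size
            sw.add_scalar('training_loss', l.mean().asscalar(), global_step)
            global_step += 1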
Example #3
                num_nodes=num_nodes,
                week=24,
                day=12,
                recent=24,
                K=3,
                Kt=3)
    net.to(device)  # move the model to the GPU

    optimizer = optim.Adam(net.parameters(),
                           lr=learning_rate,
                           weight_decay=wdecay)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, decay)
    #scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [20,30], gamma=0.7, last_epoch=-1)

    # compute the initial validation loss at epoch 0
    compute_val_loss(net, val_loader, loss_function, supports, device, epoch=0)

    # compute testing set MAE, RMSE, MAPE before training
    evaluate(net, test_loader, true_value, supports, device, epoch=0)

    clip = 5  # max gradient norm for clipping
    his_loss = []
    train_time = []
    for epoch in range(1, epochs + 1):
        train_l = []
        start_time_train = time()
        for train_w, train_d, train_r, train_t in train_loader:
            train_w = train_w.to(device)
            train_d = train_d.to(device)
            train_r = train_r.to(device)
            train_t = train_t.to(device)
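
Example #3 sets clip = 5 but is truncated before the value is used. In PyTorch training loops of this shape, a norm clip is typically applied between loss.backward() and optimizer.step(); a minimal sketch under that assumption, with the forward signature copied from the surrounding examples rather than confirmed for this project:

            optimizer.zero_grad()
            output = net([train_w, train_d, train_r])
            l = loss_function(output, train_t)
            l.backward()
            # clip the global gradient norm to `clip` before updating (assumed usage)
            torch.nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()
            train_l.append(l.item())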
Example #4
        val_r = val_r.as_in_context(ctx)
        val_t = val_t.as_in_context(ctx)

        net([val_w, val_d, val_r])  # one forward pass triggers deferred shape inference
        break
    net.initialize(ctx=ctx, init=MyInit(), force_reinit=True)

    # initialize a trainer to train model
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': learning_rate})

    # initialize a SummaryWriter to write information into logs dir
    sw = SummaryWriter(logdir=params_path, flush_secs=5)

    # compute validation loss before training
    compute_val_loss(net, val_loader, loss_function, sw, epoch=0, ctx=ctx)

    # compute testing set MAE, RMSE, MAPE before training
    evaluate(net,
             test_loader,
             true_value,
             num_of_vertices,
             sw,
             epoch=0,
             ctx=ctx)

    # train model
    global_step = 1
    for epoch in range(1, epochs + 1):
        for train_w, train_d, train_r, train_t in train_loader:
            # running on a single GPU
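
Example #4's training loop is truncated right after the single-GPU comment. Presumably it mirrors the validation path above and moves each training tensor onto the device with as_in_context before the forward pass; a minimal sketch of that step, under that assumption:

        for train_w, train_d, train_r, train_t in train_loader:
            # assumption: same device transfer as for the validation batch above
            train_w = train_w.as_in_context(ctx)
            train_d = train_d.as_in_context(ctx)
            train_r = train_r.as_in_context(ctx)
            train_t = train_t.as_in_context(ctx)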
Example #5
                                        time: {batch_end_time - batch_start_time:.2f}'
                )
                print(
                    f's:[{epoch:d}, {i + 1:5d}] loss: {running_loss_s / group_num:.2f}, \
                                        time: {batch_end_time - batch_start_time:.2f}'
                )
                print(
                    '--------------------------------------------------------------------'
                )
                running_loss = 0.0
                running_loss_f = 0.0
                running_loss_o = 0.0
                running_loss_s = 0.0
                batch_start_time = batch_end_time

        epoch_end_time = time.perf_counter()
        print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

        # probably no need to run this after every epoch
        with torch.no_grad():
            # compute validation loss
            compute_val_loss(net, val_loader, loss_function, None, epoch,
                             device, all_data['stats']['stats'])

            # testing
            evaluate(net, test_loader, true_value, num_of_vertices, None,
                     epoch, device, all_data['stats']['stats'])

    end_time = time.perf_counter()
    print(f'Total running time is {end_time - start_time:.2f} seconds.')
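
Example #5 accumulates several running losses and flushes them every group_num batches, timing each group with time.perf_counter(). The same pattern in isolation (train_step and the group size here are hypothetical placeholders, not names from the project):

import time

def train_with_periodic_logging(train_loader, train_step, group_num=50):
    # train_step: hypothetical callable that runs one optimization step and returns a float loss
    running_loss = 0.0
    batch_start_time = time.perf_counter()
    for i, batch in enumerate(train_loader):
        running_loss += train_step(batch)
        if i % group_num == group_num - 1:  # every group_num-th batch
            batch_end_time = time.perf_counter()
            print(f'[{i + 1:5d}] loss: {running_loss / group_num:.2f}, '
                  f'time: {batch_end_time - batch_start_time:.2f}')
            running_loss = 0.0
            batch_start_time = batch_end_time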
Example #6
            train_t = train_t.to(device)

            outputs = net([train_w, train_d, train_r])

            loss = loss_function(outputs, train_t)  # loss is a tensor on the same device as outputs and train_t
            loss.backward()
            optimizer.step()

            running_loss += loss.item()  # loss.item() copies the scalar loss to the CPU as a Python float

            if i % group_num == group_num - 1:
                batch_end_time = time.perf_counter()
                print(f'[{epoch:d}, {i + 1:5d}] loss: {running_loss / group_num:.2f}, \
                        time: {batch_end_time - batch_start_time:.2f}')
                running_loss = 0.0
                batch_start_time = batch_end_time

        epoch_end_time = time.perf_counter()
        print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

        # probably no need to run this after every epoch
        with torch.no_grad():
            # compute validation loss
            compute_val_loss(net, val_loader, loss_function, None, epoch, device)

            # testing
            evaluate(net, test_loader, true_value, num_of_vertices, None, epoch, device)

    end_time = time.perf_counter()
    print(f'Total running time is {end_time - start_time:.2f} seconds.')
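
Example #6 is truncated before the loop header, so neither the enumerate that defines i nor an optimizer.zero_grad() call is visible. For reference, the canonical ordering of one PyTorch optimization step looks like this; a minimal sketch, not the project's verbatim code:

        for i, (train_w, train_d, train_r, train_t) in enumerate(train_loader):
            optimizer.zero_grad()            # clear gradients left over from the previous step
            outputs = net([train_w, train_d, train_r])
            loss = loss_function(outputs, train_t)
            loss.backward()                  # accumulate fresh gradients
            optimizer.step()                 # apply the parameter update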