import os
import time

import torch as t
from torch.utils.data import DataLoader
from torchnet.meter import AverageValueMeter

# `Config`/`opt`, `Visualizer`, `FeatureNet`, `dataset`, and `val` are
# project-local and assumed to be importable from the surrounding package.


def train(**kwargs):
    # `opt` is assumed to be a module-level Config instance; kwargs override it.
    # opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    vis = Visualizer(opt.env, opt.port)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    lr = opt.lr

    # network setup
    featurenet = FeatureNet(4, 5)
    if opt.model_path:
        featurenet.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))
    featurenet.to(device)

    # load data
    data_set = dataset.FeatureDataset(root=opt.data_root, train=True, test=False)
    dataloader = DataLoader(data_set,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.num_workers)
    val_dataset = dataset.FeatureDataset(root=opt.data_root, train=False, test=False)
    val_dataloader = DataLoader(val_dataset,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # optimizer and loss function
    optimizer = t.optim.SGD(featurenet.parameters(), lr)
    criterion = t.nn.CrossEntropyLoss().to(device)

    # meter for the running loss
    loss_meter = AverageValueMeter()

    # training loop
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        for ii, (data, label) in enumerate(dataloader):
            feature = data.to(device)
            target = label.to(device)

            optimizer.zero_grad()
            prob = featurenet(feature)
            loss = criterion(prob, target)
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.item())
            # fixed: the original tested `% opt.plot_every`, which is truthy on
            # every iteration *except* multiples of plot_every
            if (ii + 1) % opt.plot_every == 0:
                vis.plot('train_loss', loss_meter.value()[0])
                # drop into the debugger when the debug file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        t.save(featurenet.state_dict(),
               'checkpoints/{epoch}_{time}_{loss}.pth'.format(
                   epoch=epoch,
                   time=time.strftime('%m%d_%H_%M_%S'),
                   loss=loss_meter.value()[0]))

        # validation and visualization
        accu, loss = val(featurenet, val_dataloader, criterion)
        featurenet.train()  # val() switches the model to eval mode
        vis.plot('val_loss', loss)
        vis.log('epoch: {epoch}, loss: {loss}, accu: {accu}'.format(
            epoch=epoch, loss=loss, accu=accu))

        # decay the learning rate every epoch
        lr = lr * 0.9
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
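# `val` is called above but not defined in this snippet. A minimal sketch,
# assuming it returns (accuracy, mean cross-entropy loss) over the validation
# set; the project's real implementation may differ.
@t.no_grad()
def val(model, dataloader, criterion):
    model.eval()
    device = next(model.parameters()).device
    total_loss, correct, total = 0.0, 0, 0
    for data, label in dataloader:
        feature = data.to(device)
        target = label.to(device)
        prob = model(feature)
        # accumulate the per-sample loss and the number of correct predictions
        total_loss += criterion(prob, target).item() * target.size(0)
        correct += (prob.argmax(dim=1) == target).sum().item()
        total += target.size(0)
    return correct / total, total_loss / total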
import os

import torch
import torchnet as tnt
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm

# `config`, `Visualizer`, `get_data`, `deepfm`, `to_var`, `val`, and `test`
# are project-local and assumed to be importable from the surrounding package.


def train(**kwargs):
    for k_, v_ in kwargs.items():
        setattr(config, k_, v_)
    vis_ = Visualizer()

    # data
    train_dataset = get_data.Ali(config.train_path, 'train', config.feature_index_path)
    val_dataset = get_data.Ali(config.val_path, 'val', config.feature_index_path)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              drop_last=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=config.batch_size)

    # model
    model = deepfm.FNN(config.feature_index_path)
    print(model)

    # testing: evaluate the best checkpoint and stop
    if config.test_flag:
        test_dataset = get_data.Ali(config.test_path, 'test', config.feature_index_path)
        test_loader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)
        model.load_state_dict(
            torch.load(os.path.join(config.model_path, '_best')))
        test(model, test_loader, config.output_path)
        return  # fixed: the original fell through into training after testing

    # criterion and optimizer
    criterion = torch.nn.BCELoss()
    lr = config.lr
    optimizer = Adam(model.parameters(),
                     lr=lr,
                     betas=(config.beta1, config.beta2),
                     weight_decay=config.weight_decay)
    previous_loss = 1e6

    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()

    # meters
    loss_meter = tnt.meter.AverageValueMeter()

    # resume training from the latest epoch checkpoint
    start = 0
    if config.resume:
        model_epoch = [
            int(fname.split('_')[-1])
            for fname in os.listdir(config.model_path) if 'best' not in fname
        ]
        start = max(model_epoch)
        # fixed: the original passed the literal string '_epoch_{start}'
        # (missing f-prefix) to torch.load
        model.load_state_dict(
            torch.load(os.path.join(config.model_path, f'_epoch_{start}')))
        if start >= config.epochs:
            print('Training already done!')
            return

    # train
    print('start training...')
    for i in range(start, config.epochs):
        loss_meter.reset()
        for ii, (c_data, labels) in tqdm(enumerate(train_loader)):
            c_data = to_var(c_data)
            labels = to_var(labels).float()
            pred = model(c_data)
            loss = criterion(pred, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # meter update and visualization
            loss_meter.add(loss.item())  # fixed: loss.data[0] is long deprecated
            if (ii + 1) % config.print_every == 0:
                vis_.plot('train_loss', loss_meter.value()[0])
                print(f'epochs: {i + 1}/{config.epochs} '
                      f'batch: {ii + 1}/{len(train_loader)} '
                      f'train_loss: {loss.item()}')

        print('evaluating...')
        val_cm, val_accuracy, val_loss = val(model, val_loader, criterion)
        vis_.plot('val_loss', val_loss)
        # fixed: the original logged `start + 1` instead of the current epoch
        vis_.log(f'epoch:{i + 1},lr:{lr},loss:{val_loss}')
        torch.save(model.state_dict(),
                   os.path.join(config.model_path, f'_epoch_{i}'))

        # decay the learning rate on plateau, otherwise keep the best checkpoint
        # (fixed: the original saved '_best' when the loss got *worse*)
        if loss_meter.value()[0] > previous_loss:
            lr = lr * config.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        else:
            torch.save(model.state_dict(),
                       os.path.join(config.model_path, '_best'))
        previous_loss = loss_meter.value()[0]
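# `val`, `test`, and `to_var` are project helpers not shown here. A minimal
# sketch of `val`, assuming the model outputs probabilities thresholded at 0.5
# and that it returns (confusion matrix, accuracy, mean BCE loss); the real
# helper may differ.
def val(model, dataloader, criterion):
    model.eval()
    cm = tnt.meter.ConfusionMeter(2)
    loss_meter = tnt.meter.AverageValueMeter()
    correct, total = 0, 0
    with torch.no_grad():
        for c_data, labels in dataloader:
            c_data = to_var(c_data)
            labels = to_var(labels).float()
            pred = model(c_data)
            loss_meter.add(criterion(pred, labels).item())
            # hard predictions for the 2-class confusion matrix and accuracy
            hard = (pred.view(-1) > 0.5).long()
            cm.add(hard, labels.view(-1).long())
            correct += (hard == labels.view(-1).long()).sum().item()
            total += labels.numel()
    model.train()  # restore training mode for the caller
    return cm.value(), correct / total, loss_meter.value()[0]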
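# Both scripts take hyperparameter overrides as keyword arguments
# (train(lr=0.01, batch_size=128, ...)), a pattern usually driven from the
# command line. A hypothetical entry point using python-fire (an assumption,
# not confirmed by the source):
if __name__ == '__main__':
    import fire
    fire.Fire()
    # e.g. python main.py train --lr=0.01 --batch_size=128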