# NOTE(review): this chunk arrived collapsed onto a single line; the
# indentation below is reconstructed from statement order — confirm it
# against the original file.
import warnings

# Suppresses ALL warnings process-wide — presumably to silence framework
# noise during training, but it also hides genuine problems. Verify this
# blanket filter is intentional.
warnings.filterwarnings(action='ignore')

if __name__ == '__main__':
    # Parse command-line / config options (parse_opts is defined elsewhere
    # in the project; its fields are read as opt.<name> below).
    opt = parse_opts()

    # Detect devices
    use_cuda = torch.cuda.is_available()  # check if GPU exists
    device = torch.device("cuda" if use_cuda else "cpu")  # use CPU or GPU

    # Build the train/validation data loaders (dataLoadFunc defined elsewhere).
    train_loader, valid_loader = dataLoadFunc(opt)

    # WARNING(review): this rebinding shadows the StudentModel *class* with
    # its instance — the class cannot be instantiated again afterwards.
    # Later code in this file may rely on the shadowed name, so it is left
    # as-is; consider renaming the instance (e.g. student_model).
    StudentModel = StudentModel(opt)
    StudentModel = StudentModel.to(device)

    # The auxiliary "smart" model is only built when requested via options.
    smartModel = None
    perturbation_model = None
    if opt.smart_model:
        opt.is_smart_model = True  # record on opt so downstream code can branch on it
        smartModel = SmartModel(opt)
        smartModel = smartModel.to(device)

    # Parallelize model to multiple GPUs
    if torch.cuda.device_count() > 1:
        print("Using", torch.cuda.device_count(), "GPUs!")
        StudentModel = nn.DataParallel(StudentModel)
        # DataParallel wraps the model; the underlying module (and hence its
        # parameters) lives under .module.
        parms = list(StudentModel.module.parameters())
        if opt.isSource:
            # Only the source-side run wraps the smart model as well.
            # NOTE(review): smartModel may still be None here if
            # opt.smart_model was false — presumably isSource implies
            # smart_model; confirm against parse_opts.
            smartModel = nn.DataParallel(smartModel)
# NOTE(review): truncated chunk, collapsed onto one line in SOURCE. It opens
# partway through an `if args.task == ...` chain (the initial `if` branch is
# not visible here) and ends at a `for batch ...` header whose body lies
# outside this view. Indentation is reconstructed — confirm against the
# original file.

    # Tail of the preceding (unseen) branch: move the bridge module to the
    # target device and put the teacher model in eval mode (frozen for
    # distillation, presumably — confirm with the unseen branch above).
    bridge = bridges.to(args.device)
    t_model.eval()
elif args.task=='student':
    # Student-only training: one optimizer parameter group for the model.
    model = StudentModel(params=args, pretrained_embedding=torch.tensor(pretrained_embedding).float())
    param_groups = [{'params': model.parameters(), 'lr': args.learning_rate}]
elif args.task=='teacher':
    # Teacher training, optionally with an auxiliary path-classification loss.
    model = TeacherModel(params=args, pretrained_embedding=torch.tensor(pretrained_embedding).float())
    param_groups = [{'params': model.parameters(), 'lr': args.learning_rate}]
    if args.classify_loss:
        # The classifier's parameters are optimized jointly with the teacher.
        classifier = PathClassifier(params=args)
        param_groups.append({'params': classifier.parameters(),'lr':args.learning_rate})
        # NOTE(review): bound as `classifiers` (plural) while built as
        # `classifier` — later code must use whichever name it expects;
        # verify this is not a typo.
        classifiers = classifier.to(args.device)

# Optimizer over all collected parameter groups; LR decays by args.gamma
# every step_size=1 scheduler step.
optimizer = SGD(param_groups, lr=args.learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=args.gamma)
model = model.to(args.device)

# Bookkeeping for the training loop: step counter, evaluation results, and
# running loss accumulators (reset/consumed elsewhere — outside this view).
total_step = 0
eval_result = {}
accum_train_link_loss, accum_train_label_loss = 0, 0
accum_distill_loss, accum_classify_loss = 0, 0
accum_eval_loss = 0
scheduler_step = 0
best_eval_result = None
stop_sign=0  # presumably an early-stopping counter — confirm in the unseen loop body

for epoch in range(args.epoches):
    print('{} epoch training..'.format(epoch + 1))
    print('dialogue model learning rate {:.4f}'.format(optimizer.param_groups[0]['lr']))
    model.train()
    # Loop body continues beyond this chunk.
    for batch in tqdm(train_dataloader):