def test_model(dataset=dataset, save_dir=save_dir, num_classes=num_classes, num_epochs=nEpochs):
    """Evaluate the most recent checkpoint in *save_dir* on the test split.

    Args:
        dataset (str): Dataset name understood by ``VideoDataset``.
        save_dir (str): Directory containing saved checkpoint files.
        num_classes (int): Number of classes in the data.
        num_epochs (int): Unused here; kept for signature compatibility.
    """
    torch.cuda.empty_cache()
    model = C3D_model.C3D(num_classes=num_classes, pretrained=False)
    # BUG FIX: the original called os.path.listdir (which does not exist) and
    # passed a bare filename to torch.load; list the directory with os.listdir
    # and join the newest entry back onto save_dir.
    checkpoints = sorted(os.listdir(save_dir))
    model.load_state_dict(torch.load(os.path.join(save_dir, checkpoints[-1])))
    criterion = nn.CrossEntropyLoss()

    test_dataloader = DataLoader(
        VideoDataset(dataset=dataset, split='test', clip_len=16),
        batch_size=batch_size, num_workers=num_workers)
    test_size = len(test_dataloader.dataset)

    model.to(device)
    criterion.to(device)

    model.eval()
    start_time = timeit.default_timer()
    running_loss = 0.0
    running_corrects = 0.0

    for inputs, labels in tqdm(test_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs)
        probs = nn.Softmax(dim=1)(outputs)
        preds = torch.max(probs, 1)[1]
        loss = criterion(outputs, labels)

        # Accumulate per-sample loss and correct-prediction counts.
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

    loss = running_loss / test_size
    acc = running_corrects.double() / test_size

    # BUG FIX: the original print call was missing its closing parenthesis.
    print("[test] Loss: {} Acc: {}".format(loss, acc))
    stop_time = timeit.default_timer()
    print("Execution time: " + str(stop_time - start_time) + "\n")


if __name__ == "__main__":
    test_model()
def train_model(dataset=dataset, save_dir=save_dir, num_classes=num_classes, lr=lr,
                num_epochs=nEpochs, save_epoch=snapshot, useTest=useTest,
                test_interval=nTestInterval):
    """Train and validate a C3D classifier, logging to TensorBoard.

    Args:
        dataset (str): Dataset name understood by ``VideoDataset``.
        save_dir (str): Root directory for checkpoints and logs.
        num_classes (int): Number of classes in the data.
        lr (float): Base learning rate (new layers get 10x).
        num_epochs (int, optional): Number of epochs to train for.
        save_epoch (int): Save a checkpoint every this many epochs.
        useTest (bool): Unused in this variant; kept for signature compatibility.
        test_interval (int): Unused in this variant; kept for signature compatibility.
    """
    if modelName == 'C3D':
        model = C3D_model.C3D(num_classes=num_classes, pretrained=True)
        # Pretrained layers train at lr; freshly initialised layers at 10x lr.
        train_params = [{'params': C3D_model.get_1x_lr_params(model), 'lr': lr},
                        {'params': C3D_model.get_10x_lr_params(model), 'lr': lr * 10}]
    else:
        print('We only implemented C3D models.')
        raise NotImplementedError

    criterion = nn.CrossEntropyLoss()  # standard cross-entropy loss for classification
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    # The scheduler divides the lr by 10 every 10 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    if resume_epoch == 0:
        print("Training {} from scratch...".format(modelName))
    else:
        checkpoint = torch.load(
            os.path.join(save_dir, 'models', saveName + '_epoch-' + str(resume_epoch - 1) + '.pth.tar'),
            map_location=lambda storage, loc: storage)  # load all tensors onto the CPU
        print("Initializing weights from: {}...".format(
            os.path.join(save_dir, 'models', saveName + '_epoch-' + str(resume_epoch - 1) + '.pth.tar')))
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['opt_dict'])

    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    log_dir = os.path.join(save_dir, 'models',
                           datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    writer = SummaryWriter(log_dir=log_dir)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset, split='train', clip_len=16),
                                  batch_size=1, shuffle=True, num_workers=0)
    val_dataloader = DataLoader(VideoDataset(dataset=dataset, split='val', clip_len=16),
                                batch_size=1, num_workers=0)
    test_dataloader = DataLoader(VideoDataset(dataset=dataset, split='test', clip_len=16),
                                 batch_size=1, num_workers=0)

    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {x: len(trainval_loaders[x].dataset) for x in ['train', 'val']}
    # NOTE(review): test_size is computed but no test loop exists in this variant.
    test_size = len(test_dataloader.dataset)

    for epoch in range(resume_epoch, num_epochs):
        # Each epoch has a training and a validation step.
        for phase in ['train', 'val']:
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0

            # train()/eval() mainly affects layers such as BatchNorm or Dropout.
            if phase == 'train':
                model.train()
            else:
                model.eval()

            for inputs, labels in tqdm(trainval_loaders[phase]):
                # BUG FIX: torch.autograd.Variable is deprecated; tensors can be
                # moved to the device directly.
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                if phase == 'train':
                    outputs = model(inputs)
                else:
                    with torch.no_grad():
                        outputs = model(inputs)

                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                labels = labels.long()
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                # BUG FIX: since PyTorch 1.1, scheduler.step() must be called
                # AFTER the epoch's optimizer.step() calls, not before them.
                scheduler.step()

            epoch_loss = running_loss / trainval_sizes[phase]
            epoch_acc = running_corrects.double() / trainval_sizes[phase]

            if phase == 'train':
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/train_acc_epoch', epoch_acc, epoch)
            else:
                writer.add_scalar('data/val_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/val_acc_epoch', epoch_acc, epoch)

            print("[{}] Epoch: {}/{} Loss: {} Acc: {}".format(phase, epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")

        if epoch % save_epoch == (save_epoch - 1):
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
            }, os.path.join(save_dir, 'models', saveName + '_epoch-' + str(epoch) + '.pth.tar'))
            print("Save model at {}\n".format(
                os.path.join(save_dir, 'models', saveName + '_epoch-' + str(epoch) + '.pth.tar')))

    writer.close()
def start():
    """Configure and run a full C3D training session on UCF-101.

    Self-contained entry point: builds the run directory, model, optimizer and
    data loaders, then trains with periodic validation, checkpointing and test
    evaluation, logging everything to TensorBoard.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device being used:", device)

    nEpochs = 101        # number of training epochs
    resume_epoch = 0     # 0 trains from scratch; e.g. 99 resumes from the epoch-99 checkpoint
    useTest = True       # whether to evaluate on the test split during training
    nTestInterval = 20   # run the test split every this many epochs
    snapshot = 25        # save a checkpoint every this many epochs
    lr = 1e-5            # learning rate
    dataset = "ucf101"
    num_classes = 101

    # Directory of this file; runs are stored under <here>/run/run_<id>.
    save_dir_root = os.path.join(os.path.dirname(os.path.abspath(__file__)))

    # When resuming, reuse the id of the last run; otherwise allocate a new one.
    # glob.glob() expands the pattern to all matching run directories.
    if resume_epoch != 0:
        runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
        run_id = int(runs[-1].split('_')[-1]) if runs else 0
    else:
        runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
        run_id = int(runs[-1].split('_')[-1]) + 1 if runs else 0

    save_dir = os.path.join(save_dir_root, 'run', 'run_' + str(run_id))
    modelName = 'C3D'
    saveName = modelName + '-' + dataset

    # Build the model; new layers train at 10x the base learning rate.
    model = C3D_model.C3D(num_classes=num_classes, pretrained=False)
    train_params = [{'params': C3D_model.get_1x_lr_params(model), 'lr': lr},
                    {'params': C3D_model.get_10x_lr_params(model), 'lr': lr * 10}]
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss for classification
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    if resume_epoch == 0:
        print("Training {} from scratch...".format(modelName))
    else:
        checkpoint = torch.load(
            os.path.join(save_dir, 'models', saveName + '_epoch-' + str(resume_epoch - 1) + '.pth.tar'),
            map_location=lambda storage, loc: storage)  # load all tensors onto the CPU
        print("Initializing weights from: {}...".format(
            os.path.join(save_dir, 'models', saveName + '_epoch-' + str(resume_epoch - 1) + '.pth.tar')))
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['opt_dict'])

    # Report the parameter count of the network.
    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    log_dir = os.path.join(save_dir, 'models',
                           datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    writer = SummaryWriter(log_dir=log_dir)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset, split='train', clip_len=16),
                                  batch_size=6, shuffle=True, num_workers=0)
    val_dataloader = DataLoader(VideoDataset(dataset=dataset, split='val', clip_len=16),
                                batch_size=6, num_workers=0)
    test_dataloader = DataLoader(VideoDataset(dataset=dataset, split='test', clip_len=16),
                                 batch_size=6, num_workers=0)

    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {x: len(trainval_loaders[x].dataset) for x in ['train', 'val']}
    test_size = len(test_dataloader.dataset)

    for epoch in range(resume_epoch, nEpochs):
        # Each epoch has a training and a validation step.
        for phase in ['train', 'val']:
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0

            # train()/eval() mainly affects layers such as BatchNorm or Dropout.
            if phase == 'train':
                model.train()
            else:
                model.eval()

            # tqdm renders a progress bar over the loader.
            for inputs, labels in tqdm(trainval_loaders[phase]):
                # BUG FIX: torch.tensor(existing_tensor) copies the tensor and
                # raises a UserWarning; move the batch to the device directly.
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                if phase == 'train':
                    outputs = model(inputs)
                else:
                    with torch.no_grad():
                        outputs = model(inputs)

                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels.long())

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                # BUG FIX: since PyTorch 1.1 the LR scheduler steps AFTER the
                # epoch's optimizer updates, not before them.
                scheduler.step()

            epoch_loss = running_loss / trainval_sizes[phase]
            epoch_acc = running_corrects.double() / trainval_sizes[phase]

            if phase == 'train':
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/train_acc_epoch', epoch_acc, epoch)
            else:
                writer.add_scalar('data/val_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/val_acc_epoch', epoch_acc, epoch)

            print("[{}] Epoch: {}/{} Loss: {} Acc: {}".format(phase, epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")

        if epoch % snapshot == (snapshot - 1):
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
            }, os.path.join(save_dir, 'models', saveName + '_epoch-' + str(epoch) + '.pth.tar'))
            print("Save model at {}\n".format(
                os.path.join(save_dir, 'models', saveName + '_epoch-' + str(epoch) + '.pth.tar')))

        # Periodic evaluation on the held-out test split.
        if useTest and epoch % nTestInterval == (nTestInterval - 1):
            model.eval()
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0

            for inputs, labels in tqdm(test_dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = model(inputs)
                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels.long())

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / test_size
            epoch_acc = running_corrects.double() / test_size

            writer.add_scalar('data/test_loss_epoch', epoch_loss, epoch)
            writer.add_scalar('data/test_acc_epoch', epoch_acc, epoch)

            print("[test] Epoch: {}/{} Loss: {} Acc: {}".format(epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")

    writer.close()
def train_model(dataset=dataset, save_dir=save_dir, num_classes=num_classes, lr=lr,
                num_epochs=nEpochs, save_epoch=snapshot):
    """Train a C3D classifier from scratch with per-class accuracy reporting.

    Args:
        dataset (str): Dataset name understood by ``VideoDataset``.
        save_dir (str): Directory for checkpoints and TensorBoard logs.
        num_classes (int): Number of classes in the data.
        lr (float): SGD learning rate.
        num_epochs (int): Number of epochs to train for.
        save_epoch (int): Save a checkpoint every this many epochs.
    """
    torch.cuda.empty_cache()
    # model = C3D_model.C3D_Dilation(num_classes=num_classes, pretrained=False)  # alternative backbone
    model = C3D_model.C3D(num_classes=num_classes, pretrained=False)
    train_params = model.parameters()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    print("Training {} from scratch...".format(modelName))
    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    log_dir = os.path.join(
        save_dir, 'models',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    writer = SummaryWriter(log_dir=log_dir)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset, split='train', clip_len=16),
                                  batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_dataloader = DataLoader(VideoDataset(dataset=dataset, split='val', clip_len=16),
                                batch_size=batch_size, num_workers=num_workers)
    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {x: len(trainval_loaders[x].dataset) for x in ['train', 'val']}

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            # Per-class confusion counts, rebuilt for every phase of every epoch.
            confusion_matrix = torch.zeros(num_classes, num_classes)
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0

            if phase == 'train':
                model.train()
            else:
                model.eval()

            for inputs, labels in tqdm(trainval_loaders[phase]):
                # BUG FIX: torch.autograd.Variable is deprecated; move the
                # batch to the device directly (labels cast to int64 for the loss).
                inputs = inputs.to(device)
                labels = labels.to(device=device, dtype=torch.int64)
                optimizer.zero_grad()

                if phase == 'train':
                    outputs = model(inputs)
                else:
                    with torch.no_grad():
                        outputs = model(inputs)

                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # Tally (true, predicted) pairs into the confusion matrix.
                for t, p in zip(labels.view(-1), preds.view(-1)):
                    confusion_matrix[t.long(), p.long()] += 1

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / trainval_sizes[phase]
            epoch_acc = running_corrects.double() / trainval_sizes[phase]

            if phase == 'train':
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/train_acc_epoch', epoch_acc, epoch)
            else:
                writer.add_scalar('data/val_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/val_acc_epoch', epoch_acc, epoch)

            print("[{}] Epoch: {}/{} Loss: {} Acc: {}".format(
                phase, epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")
            # Per-class recall: diagonal over row sums of the confusion matrix.
            print(confusion_matrix.diag() / confusion_matrix.sum(1))

        if epoch % save_epoch == (save_epoch - 1):
            PATH = os.path.join(save_dir, saveName + '_epoch-' + str(epoch + 1) + '.pth.tar')
            torch.save(model.state_dict(), PATH)
            print("Save model at {}\n".format(PATH))

    writer.close()
def train_model(dataset=dataset, save_dir=SAVE_FILE_FOLDER, num_classes=num_classes, lr=lr,
                num_epochs=nEpochs, save_epoch=snapshot, useTest=useTest,
                test_interval=nTestInterval):
    """Train one of several 3D-CNN backbones with accuracy and ROC logging.

    Args:
        dataset (str): Dataset name understood by ``VideoDataset``.
        save_dir (str): Directory for checkpoints.
        num_classes (int): Number of classes in the data.
        lr (float): Base learning rate.
        num_epochs (int, optional): Number of epochs to train for.
        save_epoch (int): Save a checkpoint every this many epochs.
        useTest (bool): If True, evaluate on the test split periodically.
        test_interval (int): Test every this many epochs.
    """
    if modelName == 'C3D':
        model = C3D_model.C3D(num_classes=num_classes, pretrained=IF_PRETRAIN)
        train_params = [{'params': C3D_model.get_1x_lr_params(model), 'lr': lr},
                        {'params': C3D_model.get_10x_lr_params(model), 'lr': lr * 10}]
    elif modelName == 'C3D_td5':
        model = C3D_model.C3D_td5(num_classes=num_classes, pretrained=IF_PRETRAIN)
        train_params = [{'params': C3D_model.get_1x_lr_params(model), 'lr': lr},
                        {'params': C3D_model.get_10x_lr_params(model), 'lr': lr * 10}]
    elif modelName == 'R2Plus1D':
        model = R2Plus1D_model.R2Plus1DClassifier(num_classes=num_classes, layer_sizes=(2, 2, 2, 2))
        train_params = [{'params': R2Plus1D_model.get_1x_lr_params(model), 'lr': lr},
                        {'params': R2Plus1D_model.get_10x_lr_params(model), 'lr': lr * 10}]
    elif modelName == 'R3D':
        model = R3D_model.R3DClassifier(num_classes=num_classes, layer_sizes=(2, 2, 2, 2))
        train_params = model.parameters()
    else:
        print('We only implemented C3D and R2Plus1D models.')
        raise NotImplementedError

    criterion = nn.CrossEntropyLoss()  # standard cross-entropy loss for classification

    if _optimizer == "SGD":
        optimizer = optim.SGD(train_params, lr=lr, momentum=MOMENTUM, weight_decay=WD)
    elif _optimizer == "Adam":
        optimizer = optim.Adam(train_params, lr=lr, weight_decay=WD)
    # The scheduler divides the lr by SCHEDULER_GAMMA every SCHEDULER_STEP_SIZE epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE,
                                          gamma=SCHEDULER_GAMMA)

    model.to(device)
    criterion.to(device)

    # BUG FIX: identity comparison with None, not `== None`.
    if resume_model_path is None:
        print("Training {} from scratch...".format(modelName))
    else:
        checkpoint = torch.load(
            resume_model_path,
            map_location=lambda storage, loc: storage)  # load all tensors onto the CPU
        print("Initializing weights from: {}...".format(resume_model_path))
        model.load_state_dict(checkpoint['state_dict'])
        if RESUM_OPTIMIZER:
            optimizer.load_state_dict(checkpoint['opt_dict'])

    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    writer = SummaryWriter(logdir=LOG_PATH)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(
        VideoDataset(dataset=dataset, split='train', clip_len=clip_len,
                     preprocess=IF_PREPROCESS_TRAIN, grayscale=grayscale),
        batch_size=BS, shuffle=True, num_workers=N_WORKERS)
    val_dataloader = DataLoader(
        VideoDataset(dataset=dataset, split='val', clip_len=clip_len,
                     preprocess=IF_PREPROCESS_VAL, grayscale=grayscale),
        batch_size=BS, num_workers=N_WORKERS)
    test_dataloader = DataLoader(
        VideoDataset(dataset=dataset, split='test', clip_len=clip_len,
                     preprocess=IF_PREPROCESS_TEST, grayscale=grayscale),
        batch_size=BS, num_workers=N_WORKERS)

    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {x: len(trainval_loaders[x].dataset) for x in ['train', 'val']}
    test_size = len(test_dataloader.dataset)

    cudnn.benchmark = True
    # NOTE(review): kept for the (currently disabled) best-model checkpointing.
    global_best_val_acc = 0

    for epoch in range(num_epochs):
        # Each epoch has a training and a validation step.
        for phase in ['train', 'val']:
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0
            # Collected per-batch so ROC can be computed once per epoch.
            list_pred = list()
            list_label = list()

            # train()/eval() mainly affects layers such as BatchNorm or Dropout.
            if phase == 'train':
                # BUG FIX: since PyTorch 1.1 scheduler.step() belongs after the
                # epoch's optimizer steps; moved below the batch loop.
                model.train()
            else:
                model.eval()

            for inputs, labels in trainval_loaders[phase]:
                # Move inputs and labels to the device the training runs on.
                inputs = Variable(inputs, requires_grad=True).to(device)
                labels = Variable(labels).to(device)
                optimizer.zero_grad()

                if phase == 'train':
                    outputs = model(inputs)
                else:
                    with torch.no_grad():
                        outputs = model(inputs)

                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                list_label += labels.data.cpu().tolist()
                list_pred += preds.cpu().tolist()

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / trainval_sizes[phase]
            epoch_acc = running_corrects.double() / trainval_sizes[phase]
            epoch_roc = multiclass_roc_score(label=list_label, pred=list_pred, n_cls=N_CLASSES)

            if phase == 'train':
                writer.add_scalar('data/train_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/train_acc_epoch', epoch_acc, epoch)
                writer.add_scalar('data/train_roc_epoch', epoch_roc, epoch)
            else:
                writer.add_scalar('data/val_loss_epoch', epoch_loss, epoch)
                writer.add_scalar('data/val_acc_epoch', epoch_acc, epoch)
                writer.add_scalar('data/val_roc_epoch', epoch_roc, epoch)

            print("[{}] Epoch: {}/{} Loss: {} Acc: {}, ROC:{}".format(
                phase, epoch + 1, nEpochs, epoch_loss, epoch_acc, epoch_roc))
            stop_time = timeit.default_timer()

        if epoch % save_epoch == (save_epoch - 1):
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
            }, os.path.join(SAVE_FILE_FOLDER, EXP_NAME + '_epoch-' + str(epoch) + '.pth.tar'))
            print("Save model at {}\n".format(
                os.path.join(SAVE_FILE_FOLDER, EXP_NAME + '_epoch-' + str(epoch) + '.pth.tar')))

        # Periodic evaluation on the held-out test split.
        if useTest and epoch % test_interval == (test_interval - 1):
            model.eval()
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0
            list_pred = list()
            list_label = list()

            for inputs, labels in test_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = model(inputs)
                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels)

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                list_label += labels.data.cpu().tolist()
                list_pred += preds.cpu().tolist()

            epoch_loss = running_loss / test_size
            epoch_acc = running_corrects.double() / test_size
            epoch_roc = multiclass_roc_score(label=list_label, pred=list_pred, n_cls=N_CLASSES)

            writer.add_scalar('data/test_loss_epoch', epoch_loss, epoch)
            writer.add_scalar('data/test_acc_epoch', epoch_acc, epoch)
            writer.add_scalar('data/test_roc_epoch', epoch_roc, epoch)

            print("[test] Epoch: {}/{} Loss: {} Acc:{} ROC: {}".format(
                epoch + 1, nEpochs, epoch_loss, epoch_acc, epoch_roc))
            stop_time = timeit.default_timer()

    writer.close()
def train_model(dataset=dataset, save_dir=save_dir, num_classes=num_classes, lr=lr,
                num_epochs=nEpochs, save_epoch=snapshot, useTest=useTest,
                test_interval=nTestInterval):
    """Evaluate a (pre-trained) video classifier on the test split with ROC output.

    The train/val loop of this variant was disabled upstream (commented out);
    only the periodic test evaluation executes.

    Args:
        dataset (str): Dataset name understood by ``VideoDataset``.
        save_dir (str): Root directory for checkpoints and logs.
        num_classes (int): Number of classes in the data.
        lr (float): Base learning rate.
        num_epochs (int, optional): Number of epochs to iterate over.
        save_epoch (int): Checkpoint interval (unused while training is disabled).
        useTest (bool): If True, evaluate on the test split periodically.
        test_interval (int): Test every this many epochs.
    """
    if modelName == 'C3D':
        model = C3D_model.C3D(num_classes=num_classes, pretrained=True)
        train_params = [{
            'params': C3D_model.get_1x_lr_params(model),
            'lr': lr
        }, {
            'params': C3D_model.get_10x_lr_params(model),
            'lr': lr * 10
        }]
    elif modelName == 'R2Plus1D':
        # torchvision backbone with the final fc swapped for 2-way (real/fake) output.
        model = models.video.r2plus1d_18(pretrained=True, progress=True)
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, 2)
        model = model.to(device)
        train_params = model.parameters()
    elif modelName == 'R3D':
        model = models.video.r3d_18(pretrained=True, progress=True)
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, 2)
        model = model.to(device)
        train_params = model.parameters()
    elif modelName == 'MC3':
        model = models.video.mc3_18(pretrained=True, progress=True)
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, 2)
        model = model.to(device)
        train_params = model.parameters()
    elif modelName == 'I3D':
        # I3D pretrained on Charades (157 classes), then logits replaced for 2-way output.
        model = I3D.InceptionI3d(num_classes=157)
        load_file = 'rgb_charades.pt'
        model = model.to(device)
        model.load_state_dict(torch.load(load_file))
        model.replace_logits(num_classes=2)
        train_params = model.parameters()
    else:
        print('We only implemented C3D and R2Plus1D models.')
        raise NotImplementedError

    # Class-weighted cross entropy to counter the 375:4388 class imbalance.
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0 / 375, 1.0 / 4388]))
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    # The scheduler divides the lr by 10 every 10 epochs (unused while training is disabled).
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    # sampler = torch.utils.data.WeightedRandomSampler([1.0/212, 1.0/4388], 8, replacement=True)

    if resume_epoch == 0:
        print("Training {} from scratch...".format(modelName))
    else:
        # NOTE(review): checkpoint path is hard-coded rather than derived from
        # save_dir/resume_epoch — confirm this is intentional.
        checkpoint = torch.load(
            'run\\run_10\\models\\I3D-celeb-df_epoch-19.pth.tar',
            map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['opt_dict'])
        print("Chekpoint loaded")

    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    log_dir = os.path.join(
        save_dir, 'models',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    writer = SummaryWriter(log_dir=log_dir)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset, split='train',
                                               clip_len=16, preprocess=False),
                                  batch_size=8, shuffle=True, num_workers=4)
    val_dataloader = DataLoader(VideoDataset(dataset=dataset, split='val', clip_len=16),
                                batch_size=8, num_workers=4, shuffle=True)
    test_dataloader = DataLoader(VideoDataset(dataset=dataset, split='test', clip_len=16),
                                 batch_size=8, num_workers=4, shuffle=True)

    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {x: len(trainval_loaders[x].dataset) for x in ['train', 'val']}
    test_size = len(test_dataloader.dataset)

    training_loss_history = []
    val_loss_history = []

    for epoch in range(resume_epoch, num_epochs):
        # NOTE: the per-epoch train/val loop (forward/backward, loss history via
        # save_loss(), TensorBoard scalars and periodic checkpointing) was
        # commented out upstream; only the test evaluation below runs.

        if useTest and epoch % test_interval == (test_interval - 1):
            model.eval()
            start_time = timeit.default_timer()
            running_loss = 0.0
            running_corrects = 0.0
            # Accumulated over the whole split for the ROC curve.
            cat_probs = None
            cat_labels = None

            for inputs, labels in tqdm(test_dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    outputs = model(inputs)
                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels.type(torch.long))

                # BUG FIX: identity test against None, not type() comparison.
                if cat_probs is not None:
                    cat_probs = torch.cat((cat_probs, probs), dim=0)
                    cat_labels = torch.cat((cat_labels, labels), dim=0)
                else:
                    cat_probs = probs
                    cat_labels = labels

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            save_roc_curve(cat_labels, cat_probs)
            epoch_loss = running_loss / test_size
            epoch_acc = running_corrects.double() / test_size

            writer.add_scalar('data/test_loss_epoch', epoch_loss, epoch)
            writer.add_scalar('data/test_acc_epoch', epoch_acc, epoch)

            print("[test] Epoch: {}/{} Loss: {} Acc: {}".format(
                epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")

    writer.close()
def train_model(dataset=dataset,
                num_classes=num_classes,
                lr=lr,
                num_epochs=nEpochs,
                save_epoch=snapshot):
    """Train a video-classification model (C3D or 3D-ResNet) on `dataset`.

    Args:
        dataset: Dataset identifier forwarded to VideoDataset.
        num_classes (int): Number of classes in the data.
        lr (float): Base learning rate for the fine-tuned layers.
        num_epochs (int, optional): Number of epochs to train for.
        save_epoch (int): Checkpoint every `save_epoch` epochs.

    Raises:
        NotImplementedError: If the module-level `modelName` is not one of
            the supported architectures ('C3D', 'resnet').
    """
    if modelName == 'C3D':
        model = C3D_model.C3D(num_classes=num_classes, pretrained=True)
        # Pretrained layers train at the base lr; the freshly initialized
        # classifier layers train at 10x.
        train_params = [{
            'params': C3D_model.get_1x_lr_params(model),
            'lr': lr
        }, {
            'params': C3D_model.get_10x_lr_params(model),
            'lr': lr * 10
        }]
    elif modelName == 'resnet':
        model = resnet.generate_model(model_depth=50,
                                      n_classes=700,
                                      n_input_channels=3,
                                      shortcut_type='B',
                                      conv1_t_size=7,
                                      conv1_t_stride=1,
                                      no_max_pool=False,
                                      widen_factor=1.0)
        model = resnet.load_pretrained_model(model,
                                             'network/r3d50_K_200ep.pth',
                                             modelName, num_classes)
        train_params = resnet.get_fine_tuning_parameters(model, 'fc')
    else:
        # Fix: an unrecognized modelName previously fell through and crashed
        # later with UnboundLocalError on `model`/`train_params`.
        raise NotImplementedError(
            'We only implemented C3D and resnet models.')

    criterion = nn.CrossEntropyLoss(
    )  # standard crossentropy loss for classification
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=10,
        gamma=0.1)  # the scheduler divides the lr by 10 every 10 epochs

    print("Training {} from scratch...".format(modelName))
    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)
    criterion.to(device)

    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset,
                                               split='train',
                                               clip_len=16),
                                  batch_size=20,
                                  shuffle=True,
                                  num_workers=4)
    train_size = len(train_dataloader.dataset)

    for epoch in range(0, num_epochs):
        start_time = timeit.default_timer()

        # reset the running loss and corrects
        running_loss = 0.0
        running_corrects = 0.0

        model.train()
        for inputs, labels in tqdm(train_dataloader):
            # move inputs and labels to the device the training is taking
            # place on
            inputs = Variable(inputs, requires_grad=True).to(device)
            labels = Variable(labels).to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            probs = nn.Softmax(dim=1)(outputs)
            preds = torch.max(probs, 1)[1]
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        # Fix: scheduler.step() must run AFTER the epoch's optimizer steps
        # (PyTorch >= 1.1); calling it first shifted the lr schedule by one
        # epoch and triggered a runtime warning.
        scheduler.step()

        epoch_loss = running_loss / train_size
        epoch_acc = running_corrects.double() / train_size
        # Fix: the original printed "[{}]".format(phase) but `phase` was
        # never defined in this function -> NameError. This loop is
        # train-only, so label it as such.
        print("[train] Epoch: {}/{} Loss: {} Acc: {}".format(
            epoch + 1, nEpochs, epoch_loss, epoch_acc))
        stop_time = timeit.default_timer()
        print("Execution time: " + str(stop_time - start_time) + "\n")

        if epoch % save_epoch == (save_epoch - 1):
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'opt_dict': optimizer.state_dict(),
                }, os.path.join(str(epoch) + '.pth'))
def train_model(dataset=dataset,
                save_dir=save_dir,
                num_classes=num_classes,
                lr=lr,
                num_epochs=nEpochs,
                save_epoch=snapshot,
                useTest=useTest,
                test_interval=nTestInterval):
    """Train and validate a C3D model, periodically testing and checkpointing.

    Args:
        dataset: Dataset identifier forwarded to VideoDataset.
        save_dir (str): Root directory for model checkpoints.
        num_classes (int): Number of classes in the data.
        lr (float): Base learning rate for the fine-tuned layers.
        num_epochs (int, optional): Number of epochs to train for.
        save_epoch (int): Checkpoint every `save_epoch` epochs.
        useTest (bool): Whether to evaluate on the test split at all.
        test_interval (int): Run the test split every `test_interval` epochs.
    """
    # 1. Build the model; freshly initialized layers train at 10x the base lr.
    model = C3D_model.C3D(num_classes=num_classes, pretrained=False)
    train_params = [{
        'params': C3D_model.get_1x_lr_params(model),
        'lr': lr
    }, {
        'params': C3D_model.get_10x_lr_params(model),
        'lr': lr * 10
    }]
    # 2. Loss function
    criterion = nn.CrossEntropyLoss()
    # 3. Optimizer
    optimizer = optim.SGD(train_params, lr=lr, momentum=0.9, weight_decay=5e-4)
    # 4. Learning-rate decay: divide the lr by 10 every 10 epochs
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.cuda()
    criterion.cuda()

    # 6. Build the train / val / test loaders
    print('Training model on {} dataset...'.format(dataset))
    train_dataloader = DataLoader(VideoDataset(dataset=dataset,
                                               split='train',
                                               clip_len=16),
                                  batch_size=4,
                                  shuffle=True,
                                  num_workers=0)
    val_dataloader = DataLoader(VideoDataset(dataset=dataset,
                                             split='val',
                                             clip_len=16),
                                batch_size=4,
                                num_workers=0)
    test_dataloader = DataLoader(VideoDataset(dataset=dataset,
                                              split='test',
                                              clip_len=16),
                                 batch_size=4,
                                 num_workers=0)

    trainval_loaders = {'train': train_dataloader, 'val': val_dataloader}
    trainval_sizes = {
        x: len(trainval_loaders[x].dataset)
        for x in ['train', 'val']
    }
    test_size = len(test_dataloader.dataset)

    # 8. Training loop
    for epoch in range(0, num_epochs):
        for phase in ['train', 'val']:
            start_time = timeit.default_timer()

            # reset the running loss and corrects
            running_loss = 0.0
            running_corrects = 0.0

            if phase == 'train':
                model.train()
            else:
                model.eval()

            for inputs, labels in tqdm(trainval_loaders[phase]):
                inputs = inputs.cuda()
                labels = labels.cuda()

                # Forward pass; gradients are only needed in the train phase.
                if phase == 'train':
                    optimizer.zero_grad()
                    outputs = model(inputs)
                else:
                    # Disable autograd for validation
                    with torch.no_grad():
                        outputs = model(inputs)

                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels.long())

                # Train phase: backpropagate and update the weights
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                # Fix: scheduler.step() must run AFTER the epoch's
                # optimizer.step() calls (PyTorch >= 1.1); the original
                # stepped it at the start of the phase, shifting the StepLR
                # schedule by one epoch.
                scheduler.step()

            epoch_loss = running_loss / trainval_sizes[phase]
            epoch_acc = running_corrects.double() / trainval_sizes[phase]

            print("[{}] Epoch: {}/{} Loss: {} Acc: {}".format(
                phase, epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")

        # Save a checkpoint
        if epoch % save_epoch == (save_epoch - 1):
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'opt_dict': optimizer.state_dict(),
                },
                os.path.join(save_dir, 'models',
                             saveName + '_epoch-' + str(epoch) + '.pth.tar'))
            print("Save model at {}\n".format(
                os.path.join(save_dir, 'models',
                             saveName + '_epoch-' + str(epoch) + '.pth.tar')))

        # Evaluate on the test split
        if useTest and epoch % test_interval == (test_interval - 1):
            model.eval()
            start_time = timeit.default_timer()

            running_loss = 0.0
            running_corrects = 0.0

            for inputs, labels in tqdm(test_dataloader):
                inputs = inputs.cuda()
                labels = labels.cuda()

                with torch.no_grad():
                    outputs = model(inputs)
                probs = nn.Softmax(dim=1)(outputs)
                preds = torch.max(probs, 1)[1]
                loss = criterion(outputs, labels.long())

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / test_size
            epoch_acc = running_corrects.double() / test_size

            print("[test] Epoch: {}/{} Loss: {} Acc: {}".format(
                epoch + 1, nEpochs, epoch_loss, epoch_acc))
            stop_time = timeit.default_timer()
            print("Execution time: " + str(stop_time - start_time) + "\n")