Example #1
    def __init__(self, root, mode, test_dir, train_dir, save_model_withname=None,\
                 save_error_withname=None, checkpoint=None):
        self.root = root
        self.mode = mode
        self.test_dir = test_dir
        self.train_dir = train_dir
        self.save_model_withname = save_model_withname
        self.save_error_withname = save_error_withname
        self.checkpoint = checkpoint
        self.batch_size = 50
        self.learning_rate = 0.0001
        self.validation_loop = 0

        if self.mode == 'train':
            self.writer = tensorboardX.SummaryWriter(comment="train")
        else:
            self.writer = tensorboardX.SummaryWriter(comment="test")
        # setup dataset
        self.train_transforms = transforms.Compose([videotransforms.RandomCrop(112),
                                           videotransforms.RandomHorizontalFlip(),])
        self.test_transforms = transforms.Compose([videotransforms.CenterCrop(112)])

        self.dataset = VisualTactile(self.root, self.train_dir, self.train_transforms)
        self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True, num_workers=1, pin_memory=True)

        self.val_dataset = VisualTactile(self.root, self.test_dir, self.test_transforms)
        self.val_dataloader = torch.utils.data.DataLoader(self.val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True)

#         self.dataloaders = {'train': self.dataloader, 'val': self.val_dataloader}
#         self.datasets = {'train': self.dataset, 'val': self.val_dataset}

        self.model, self.optimizer, self.scheduler = self.load_model(self.checkpoint)
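All of these examples build a transforms.Compose pipeline of videotransforms and hand it to a dataset class. As a rough illustration of what that pipeline does to a single clip, here is a minimal sketch assuming the pytorch-i3d convention that videotransforms operate on numpy clips shaped (T, H, W, C); the clip itself is made up for the example.

import numpy as np
import torch
from torchvision import transforms
import videotransforms

# Hypothetical clip: 16 RGB frames of 256x256, values already scaled to [-1, 1].
clip = np.random.uniform(-1, 1, size=(16, 256, 256, 3)).astype(np.float32)

train_transforms = transforms.Compose([
    videotransforms.RandomCrop(112),
    videotransforms.RandomHorizontalFlip(),
])

cropped = train_transforms(clip)  # still (T, H, W, C), now 112x112
tensor = torch.from_numpy(np.ascontiguousarray(cropped.transpose([3, 0, 1, 2])))  # (C, T, H, W) as I3D expects
print(tensor.shape)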
Example #2
def run(max_steps=64e3,load_model='',root='/l/vision/v7/wang617/taiwan', batch_size=1, save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(root,test_transforms, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=8)    
    i3d = InceptionI3d(400, in_channels=3)
    #i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode
    count = 0
    start = time.time()
    for data in dataloader:
            # get the inputs
        inputs, label, name = data
        label = str(label.numpy()[0])
        b,c,t,h,w = inputs.shape
        inputs = Variable(inputs.cuda(), volatile=True)
        features = i3d.extract_features(inputs)
        np.save(os.path.join(save_dir,name[0]),features.squeeze().data.cpu().numpy())
        with open('/l/vision/v7/wang617/taiwan_data/i3d_feature_list.txt', 'a') as f:
            f.writelines([name[0], ',', label, '\n'])
        count += 1
        if count % 100 == 0:
            current = time.time()
            print('Count {:2} | running time: {:.2f} sec'.format(count, current - start))
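Example #2 stores one .npy feature file per clip and appends a "<name>,<label>" line to an index file. A small sketch of reading those outputs back; the feature directory below is illustrative and should match the save_dir used during extraction.

import os
import numpy as np

feature_dir = '/path/to/saved_features'  # illustrative; use the save_dir passed to run()
list_path = '/l/vision/v7/wang617/taiwan_data/i3d_feature_list.txt'

with open(list_path) as f:
    for line in f:
        name, label = line.strip().split(',')
        feats = np.load(os.path.join(feature_dir, name + '.npy'))
        print(name, label, feats.shape)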
Example #3
def run(max_steps=64e3, mode='rgb', root='', split='', batch_size=1, load_model='', save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)

    # val_dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir)
    # val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader}#, 'val': val_dataloader}
    datasets = {'train': dataset}#, 'val': val_dataset}

    
    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(400)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()

    for phase in ['train']:
        i3d.train(False)  # Set model to evaluate mode
                
        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0
                    
        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data
            if os.path.exists(os.path.join(save_dir, name[0]+'.npy')):
                continue
            i=0
            for input in inputs:
                i+=1
                b,c,t,h,w = input.shape
                if t > 1600:
                    features = []
                    for start in range(1, t-56, 1600):
                        end = min(t-1, start+1600+56)
                        start = max(1, start-48)
                        ip = Variable(torch.from_numpy(input.numpy()[:,:,start:end]).cuda(), volatile=True)
                        features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy())
                    np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
                else:
                    # wrap them in Variable
                    input = Variable(input.cuda(), volatile=True)

                    features = i3d.extract_features(input)
                    new_path = os.path.join(save_dir, name[0], mode)
                    if not os.path.exists(new_path):
                        os.makedirs(new_path)
                    np.save(os.path.join(new_path, str(i)), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
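For clips longer than 1600 frames, the loop above extracts features over overlapping windows (roughly a 56-frame lookahead and a 48-frame lookback around each 1600-frame chunk). A standalone sketch of the same index arithmetic, just to show the [start, end) ranges it produces:

def feature_windows(t, chunk=1600, tail=56, head=48):
    # Mirrors the long-clip windowing used in the example above.
    windows = []
    for start in range(1, t - tail, chunk):
        end = min(t - 1, start + chunk + tail)
        start = max(1, start - head)
        windows.append((start, end))
    return windows

print(feature_windows(4000))
# -> [(1, 1657), (1553, 3257), (3153, 3999)]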
Example #4
def run(max_steps=64e3, mode='rgb', root='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/Charades_v1_rgb', split='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades.json', batch_size=1, load_model='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/pytorch-i3d/models/rgb_charades.pt', save_dir='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades_features'):
    # setup dataset
    #root = '/ssd2/charades/Charades_v1_rgb', split = 'charades/charades.json', batch_size = 1, load_model = '', save_dir = ''
    # root = '/gpfs/home/lhe/xxw/xxw/super-events-cvpr18-master/data/charades/Charades_v1_rgb'
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    # print ( mode,root,split,batch_size)
    dataset = Dataset(split, 'training', root, mode, test_transforms, save_dir=save_dir) #num=-1,
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    val_dataset = Dataset(split, 'testing', root, mode, test_transforms,  save_dir=save_dir)#num=-1,
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}


    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()

    for phase in ['train', 'val']:
        i3d.train(False)  # Set model to evaluate mode

        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0

        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data
            if os.path.exists(os.path.join(save_dir, name[0]+'.npy')):
                continue

            b,c,t,h,w = inputs.shape
            if t > 1600:
                features = []
                for start in range(1, t-56, 1600):
                    end = min(t-1, start+1600+56)
                    start = max(1, start-48)
                    ip = Variable(torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda(), volatile=True)
                    features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy())
                np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
            else:
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), volatile=True)
                features = i3d.extract_features(inputs)
                np.save(os.path.join(save_dir, name[0]), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
Example #5
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        batch_size=3 * 15,
        save_model='',
        weights=None,
        num_classes=0):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    data = make_eval_json()
    class_map = make_label_map()

    val_dataset = Dataset(train_split, 'test', root, mode, data, num_classes,
                          test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=False)
    dataloaders = {'test': val_dataloader}
    datasets = {'test': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(
        torch.load(weights)
    )  # nslt_2000_000700.pt nslt_1000_010800 nslt_300_005100.pt(best_results)  nslt_300_005500.pt(results_reported) nslt_2000_011400
    #i3d.cuda()
    #i3d = nn.DataParallel(i3d)
    i3d.eval()
    preds = []
    for data in dataloaders["test"]:
        inputs, labels, video_id = data  # inputs: b, c, t, h, w

        per_frame_logits = i3d(inputs)

        predictions = torch.max(per_frame_logits, dim=2)[0]
        out_labels = np.argsort(predictions.cpu().detach().numpy()[0])
        out_probs = np.sort(predictions.cpu().detach().numpy()[0])
        print(class_map[out_labels[-1]])
        preds.append(class_map[out_labels[-1]])
    return preds
def predict_video(model_path, video_path, device):

    # Load model
    m = load_model(model_path).to(device)

    # Load rgb frames from video
    frames = load_rgb_frames_from_video(video_path, 0, -1, True)

    crop = videotransforms.CenterCrop(224)
    frames = video_to_tensor(crop(frames))

    logits = m(frames.unsqueeze(0).to(device))

    return logits[0, -1]
def calculate_confusion_matrix():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes
    weight = args.weight
    print('Start to load data')
    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    test_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                test_data_path, test_transforms)
    print('Total number of test samples is {0}'.format(len(test_dataset)))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)
    model = TAL_Net(num_classes)
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)
    model.cuda()
    model.eval()

    print('Start to calculate confusion matrix')
    all_predicts = []
    all_labels = []
    for i, (cabin_imgs, face_imgs, labels, start_labels,
            end_labels) in enumerate(test_dataloader):
        cabin_imgs = cabin_imgs.cuda()
        face_imgs = face_imgs.cuda()
        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_imgs, face_imgs)
            class_preds = torch.argmax(class_scores, dim=1)
            class_preds = class_preds.cpu().numpy()
            labels = labels.numpy()
            all_predicts.append(class_preds)
            all_labels.append(labels)
    all_predicts = np.concatenate(all_predicts)
    all_labels = np.concatenate(all_labels)
    cf_matrix = confusion_matrix(all_labels, all_predicts)
    normalized_confusion_matrix = confusion_matrix(all_labels,
                                                   all_predicts,
                                                   normalize='true')
    return cf_matrix, normalized_confusion_matrix
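calculate_confusion_matrix returns both the raw and the row-normalized matrix. One possible way to inspect them, assuming a recent scikit-learn (ConfusionMatrixDisplay) and matplotlib are available:

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

cf_matrix, normalized_cf_matrix = calculate_confusion_matrix()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ConfusionMatrixDisplay(confusion_matrix=cf_matrix).plot(ax=axes[0], colorbar=False)
ConfusionMatrixDisplay(confusion_matrix=normalized_cf_matrix).plot(ax=axes[1], colorbar=False)
axes[0].set_title('counts')
axes[1].set_title('row-normalized')
plt.tight_layout()
plt.show()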
Example #8
def load_data(dataset_path, batch_size=1):
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset_train = VidorDataset(dataset_path, 'training', test_transforms)
    dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=5,
                                                   pin_memory=True)
    dataset_val = VidorDataset(dataset_path, 'validation', test_transforms)
    dataloader_val = torch.utils.data.DataLoader(dataset_val,
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=5,
                                                 pin_memory=True)
    dataloaders = {'train': dataloader_train, 'val': dataloader_val}
    datasets = {'train': dataset_train, 'val': dataset_val}
    return datasets, dataloaders
Example #9
def predict():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes

    print('Start to load data')
    test_transforms = transforms.Compose(
        [videotransforms.CenterCrop(224),
         videotransforms.ToTensor()])
    test_dataset = IVBSSDataset(face_video_dir, cabin_video_dir,
                                test_data_path, test_transforms)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)

    model = TemporalActionLocalization(num_classes, pretrained_I3D_model)  # pretrained_I3D_model is assumed to be defined or imported elsewhere in this script
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)

    model.cuda()
    model.eval()

    print('Start to test')
    test_loss = 0.0
    test_steps = 0
    for i, (face_imgs, cabin_imgs, labels) in enumerate(test_dataloader):
        face_imgs = face_imgs.cuda()
        cabin_imgs = cabin_imgs.cuda()
        for k, v in labels.items():
            labels[k] = v.cuda()
        loss = model(face_imgs, cabin_imgs, labels)
        test_loss += loss.item()
        test_steps += 1
    avg_test_loss = test_loss / test_steps
    return avg_test_loss
Example #10
def load_data(dataset_path, batch_size=5, num_workers=10):
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset_train = VidorDataset(dataset_path, 'training', train_transforms)
    cls_weights = dataset_train.get_weights()
    dataloader_train = torch.utils.data.DataLoader(dataset_train,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=num_workers,
                                                   pin_memory=True)
    dataset_val = VidorDataset(dataset_path, 'validation', test_transforms)
    dataloader_val = torch.utils.data.DataLoader(dataset_val,
                                                 batch_size=1,
                                                 shuffle=True,
                                                 num_workers=num_workers,
                                                 pin_memory=True)
    dataloaders = {'train': dataloader_train, 'val': dataloader_val}
    datasets = {'train': dataset_train, 'val': dataset_val}
    return datasets, dataloaders, np.asarray(1 - cls_weights, dtype=np.float32)
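load_data here additionally returns 1 - cls_weights as a float32 array. The example does not show how it is consumed; one plausible use (an assumption, not taken from the source) is as per-class weights for the classification loss:

import torch
import torch.nn as nn

datasets, dataloaders, cls_weights = load_data('/path/to/vidor')  # illustrative dataset path

# Assumed usage: down-weight frequent classes in the loss.
weight = torch.from_numpy(cls_weights)
if torch.cuda.is_available():
    weight = weight.cuda()
criterion = nn.CrossEntropyLoss(weight=weight)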
Example #11
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/new_data.json',
        batch_size=8,
        save_model=''):
    # setup dataset
    print("Inside Run")
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    print("Train Dataset")
    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    print("Train Dataset DataLoader")
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True)

    print("Test Dataset")
    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    print("Test Dataset DataLoader")
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=8,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    if mode == 'flow':
        print("Running flow")
        i3d = InceptionI3d(400, in_channels=2)
        print("loading flow dict")
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        print("Running rgb")
        i3d = InceptionI3d(400, in_channels=3)
        print("loading dict")
        # i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(2)
    print("Replaced logits")
    #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    print("Parallel Data")

    lr = init_lr
    print("Initializing SGD")
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    print("Scheduling some multistep")
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    print("Starting Training")
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print("Training")
                i3d.train(True)
            else:
                print("Validation")
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.data

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.data

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.data
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase, tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update),
                            tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(),
                                   save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
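Example #11 only steps the optimizer every num_steps_per_update mini-batches, i.e. plain gradient accumulation. The pattern distilled into a minimal sketch; model, dataloader, optimizer, lr_sched and compute_loss are assumed to exist:

num_steps_per_update = 4  # accumulate gradients over 4 mini-batches
num_iter = 0
optimizer.zero_grad()
for inputs, labels in dataloader:
    num_iter += 1
    # Scale the loss so the accumulated gradient matches one large batch.
    loss = compute_loss(model(inputs), labels) / num_steps_per_update
    loss.backward()
    if num_iter == num_steps_per_update:
        num_iter = 0
        optimizer.step()
        optimizer.zero_grad()
        lr_sched.step()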
Example #12
def predict_events(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length,
                                                                      clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)
    all_start_scores = np.concatenate(all_start_scores)
    all_end_scores = np.concatenate(all_end_scores)
    
    print(all_predict_classes)
    # refined chunk aggregation
    cabin_frames = os.listdir(cabin_video_path)
    cabin_frame_length  = len(cabin_frames)
    cabin_indices = np.arange(start=0, stop=cabin_frame_length - clip_stride + 1, step=clip_stride)
    indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) for idx in cabin_indices]
#     remainder = cabin_frame_length % clip_stride
#     if remainder != 0:
#         indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length)))
    print(len(indices_in_shorter_clips))
    print(len(indices_in_cabin_clips))
    shorter_clip_predict_classes = []
    for i in range(len(indices_in_shorter_clips)):
        if i == 0:
            shorter_clip_predict_classes.append(all_predict_classes[0])
        elif i == 1:
            l = [all_predict_classes[0], all_predict_classes[1]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == 2:
            l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i < len(indices_in_cabin_clips):
            l = [all_predict_classes[j] for j in range(i-3, i+1)]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips):
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 1:
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 2:
            index = len(indices_in_cabin_clips) - 1
            shorter_clip_predict_classes.append(all_predict_classes[index])
    print(shorter_clip_predict_classes)
    
    # extract start and end peaks
    start_peak_indices = []
    end_peak_indices = []
    if all_start_scores[0] > all_start_scores[1]:
        start_peak_indices.append(0)
    for i in range(1, len(all_start_scores) - 1):
        if all_start_scores[i] > all_start_scores[i - 1]:
            if all_start_scores[i] > all_start_scores[i + 1]:
                start_peak_indices.append(i)
        if all_end_scores[i] > all_end_scores[i - 1]:
            if all_end_scores[i] > all_end_scores[i + 1]:
                end_peak_indices.append(i)
    if all_end_scores[-1] > all_end_scores[-2]:
        end_peak_indices.append(len(cabin_clips) - 1)

    j = 0
    copy_start_peak_indices = start_peak_indices.copy()
    while j < len(start_peak_indices) - 1:
        index1 = copy_start_peak_indices[j]
        index2 = copy_start_peak_indices[j + 1]
        if index1 + 4 < index2:
            j += 1
        else:
            if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j + 1]]:
                copy_start_peak_indices[j] = index2
                copy_start_peak_indices.pop(j + 1)
                start_peak_indices.pop(j + 1)

            else:
                copy_start_peak_indices.pop(j)
                start_peak_indices.pop(j)

    k = 0
    copy_end_peak_indices = end_peak_indices.copy()
    while k < len(end_peak_indices) - 1:
        index1 = copy_end_peak_indices[k]
        index2 = copy_end_peak_indices[k + 1]
        if index1 + 4 < index2:
            k += 1
        else:
            if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k + 1]]:
                copy_end_peak_indices[k] = index2
                copy_end_peak_indices.pop(k + 1)
                end_peak_indices.pop(k + 1)
            else:
                copy_end_peak_indices.pop(k)
                end_peak_indices.pop(k)

    selected_starts = []
    selected_ends = []
    for start_indice in start_peak_indices:
        if all_start_scores[start_indice] > threshold:
            selected_starts.append(start_indice)
    for end_indice in end_peak_indices:
        if all_end_scores[end_indice] > threshold:
            selected_ends.append(end_indice+3)
    print(selected_starts)
    print(selected_ends)
    
       
    rough_clip_groups = defaultdict(list)
    for i in range(len(shorter_clip_predict_classes)):
        if shorter_clip_predict_classes[i] != 0:
            rough_clip_groups[shorter_clip_predict_classes[i]].append(i)
    print(rough_clip_groups)
    
#     all_refined_clip_groups = dict()
#     for key in rough_clip_groups.keys():
#         clip_group = rough_clip_groups[key]
#         refined_groups = []
        
#         previous = 0
#         i = 0
#         while i < len(clip_group) - 1:
#             if clip_group[i] in selected_starts:
#                 previous = i
#             elif clip_group[i] in selected_ends:
#                 refined_groups.append(clip_group[previous:(index+1)])
#                 j = i + 1
#                 while j < len(clip_group) - 1:
#                     if clip_group[j] - clip_group[j-1] == 1:
#                         j += 1
#                     else:
#                         previous = j 
#                         i = j
#                         break
#             elif clip_group[i] + 2 < clip_group[i+1]:
#                 refined_groups.append(clip_group[previous:(i+1)])
#                 previous = i+1
#             i += 1
#             print(previous, i)
#         if previous < len(clip_group) - 1:
#             refined_groups.append(clip_group[previous:])
#             all_refined_clip_groups[key] = refined_groups
#     print(all_refined_clip_groups)
    
    all_refined_clip_groups = dict()
    for key in rough_clip_groups.keys():
        clip_group = rough_clip_groups[key]
        refined_groups = []
        
        previous = 0
        i = 0
        while i < len(clip_group) - 1:
            if clip_group[i] + 2 < clip_group[i+1]:
                refined_groups.append(clip_group[previous:(i+1)])
                previous = i+1
            i += 1
        
        refined_groups.append(clip_group[previous:])
        all_refined_clip_groups[key] = refined_groups
    print(all_refined_clip_groups)

    
    keys = list(all_refined_clip_groups)
    if len(keys) == 2:
        k1 = keys[0]
        k2 = keys[1]
        groups1 = all_refined_clip_groups[k1]
        groups2 = all_refined_clip_groups[k2]

        i = 0
        j = 0
        while i < len(groups1):
            while j < len(groups2):
                min_index1 = min(groups1[i])
                max_index1 = max(groups1[i])
                min_index2 = min(groups2[j])
                max_index2 = max(groups2[j])
                set1 = set(range(min_index1, max_index1+1))
                set2 = set(range(min_index2, max_index2+1))
                if set1.issubset(set2):
                    groups1.remove(groups1[i])
                    if i >= len(groups1):
                        break
                elif set2.issubset(set1):
                    groups2.remove(groups2[j])
                else:
                    if max_index1 > max_index2:
                        j += 1
                    else:
                        break
            i += 1
        filtered_all_clip_groups = {
            k1:groups1,
            k2:groups2
        }
    else:
        filtered_all_clip_groups = all_refined_clip_groups
    print(filtered_all_clip_groups)
    
    # add start and end information
    final_all_clip_groups = {}
    for key in filtered_all_clip_groups.keys():
        clip_groups = filtered_all_clip_groups[key]
        all_clip_groups = []
        for clip_group in clip_groups: 
            if len(clip_group) > 6:
                start_clip = min(clip_group)
                end_clip = max(clip_group)
                for selected_start in selected_starts:
                    if selected_start > start_clip and selected_start < start_clip + 3:
                        start_clip = selected_start
                for selected_end in selected_ends:
                    if selected_end > end_clip - 3 and selected_end < end_clip:
                        end_clip = selected_end
                clip_group = list(range(start_clip, end_clip+1))
            all_clip_groups.append(clip_group)
        final_all_clip_groups[key] = all_clip_groups
    
    all_clip_frame_groups = {} 
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index]))
                start_frame = min(clip_frame_group) + 1
                end_frame = max(clip_frame_group) + 1            
            clip_frame_groups.append([start_frame, end_frame])
            
        
        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
def run(init_lr=0.01, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\
        train_split='train.txt', test_split='test.txt', batch_size=1, save_model=''):
    print(train_split, test_split)
    writer = tensorboardX.SummaryWriter()
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, test_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=0,
                                             pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    sm = InceptionI3d(400, in_channels=3)
    sm.replace_logits(1)
    #add your network here
    fusedNet = FusionNet(sm)
    if torch.cuda.is_available():
        fusedNet.cuda()
    fusedNet = nn.DataParallel(fusedNet)

    lr = init_lr
    optimizer = optim.SGD(fusedNet.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])
    if torch.cuda.is_available():
        data = torch.load(save_model)
    else:
        data = torch.load(save_model,
                          map_location=lambda storage, loc: storage)
    fusedNet.load_state_dict(data['model_state'])
    optimizer.load_state_dict(data['optimizer_state'])
    lr_sched.load_state_dict(data['scheduler_state'])

    steps = 0
    with open('inference_V.txt', 'w') as file:
        file.write("train and validation loss file\n")
    # train it
    # Each epoch has a training and validation phase

    fusedNet.train(False)  # Set model to evaluate mode
    for phase in ['train', 'val']:
        print('phase : {}'.format(phase))

        tot_cls_loss = 0.0
        num_iter = 0
        count = 0
        #         optimizer.zero_grad()

        with open('inference_V.txt', 'a') as file:
            file.write("---------------\n")
        # Iterate over data.
        for data in dataloaders[phase]:
            num_iter += 1
            # get the inputs
            f_vid, l_vid, tactile, pos, labels = data

            if torch.cuda.is_available():
                rgb_inputs = Variable(f_vid.cuda())
                t = rgb_inputs.size(2)
                labels = Variable(labels.cuda())
            else:
                rgb_inputs = Variable(f_vid)
                t = rgb_inputs.size(2)
                labels = Variable(labels)

            out = fusedNet(rgb_inputs.float())
            #print('prediction output = ', per_frame_logits.shape)
            #print('labels = ',labels.shape)
            # compute classification loss (with max-pooling along time B x C x T)
            out = out.squeeze(1)
            cls_loss = F.binary_cross_entropy_with_logits(
                out.double(), labels.double())
            tot_cls_loss += cls_loss.item()
            #             cls_loss.backward()
            print('{} Loss: {:.4f} and lr: {}'.format(phase,
                                                      tot_cls_loss / num_iter,
                                                      init_lr))
            with open('inference_V.txt', 'a') as file:
                file.write("%f\n" % (tot_cls_loss / num_iter))


#             optimizer.step()
#             optimizer.zero_grad()
            if phase == 'val':
                writer.add_scalar('inference_error/' + phase,
                                  (tot_cls_loss / num_iter), num_iter)
            else:
                writer.add_scalar('inference_error/' + phase,
                                  (tot_cls_loss / num_iter), num_iter)
def run(init_lr=0.01, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\
        train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''):
    writer = tensorboardX.SummaryWriter()
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=3,
                                             pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=3,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    sm = InceptionI3d(400, in_channels=3)
    sm.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    #tm = InceptionI3d(400, in_channels=2)
    #tm.load_state_dict(torch.load('models/flow_imagenet.pt'))
    sm.replace_logits(1)
    sm = freeze_network_layer(sm)
    #add your network here
    fusedNet = FusionNet(sm)
    if torch.cuda.is_available():
        fusedNet.cuda()
        fusedNet = nn.DataParallel(fusedNet)

    lr = init_lr
    optimizer = optim.SGD(fusedNet.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])

    steps = 0
    with open('i3d_video.txt', 'w') as file:
        file.write("train and validation loss file\n")
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print('phase : {}'.format(phase))
            if phase == 'train':
                fusedNet.train(True)
            else:
                fusedNet.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            count = 0
            optimizer.zero_grad()
            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                f_vid, l_vid, tactile, pos, labels = data

                if torch.cuda.is_available():
                    inputs = Variable(f_vid.cuda())
                    t = inputs.size(2)
                    labels = Variable(labels.cuda())
                else:
                    inputs = Variable(f_vid)
                    t = inputs.size(2)
                    labels = Variable(labels)

                per_frame_logits = fusedNet(inputs.float())
                #print('prediction output = ', per_frame_logits.shape)
                #print('labels = ',labels.shape)
                # compute classification loss (with max-pooling along time B x C x T)
                per_frame_logits = per_frame_logits.squeeze(1)
                cls_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits.double(), labels.double())
                tot_cls_loss += cls_loss.item()
                cls_loss.backward()
                print('{} Loss: {:.4f} and lr: {}'.format(
                    phase, tot_cls_loss / num_iter, init_lr))
                with open('i3d_video.txt', 'a') as file:
                    file.write("%f\n" % (tot_cls_loss / num_iter))
                optimizer.step()
                optimizer.zero_grad()
                if phase == 'val':
                    writer.add_scalar('error/' + phase,
                                      (tot_cls_loss / num_iter), num_iter)
                else:
                    writer.add_scalar('error/' + phase,
                                      (tot_cls_loss / num_iter), num_iter)
                    if (steps % 50 == 0):
                        torch.save(
                            fusedNet.module.state_dict(),
                            save_model + phase + str(steps).zfill(6) + '.pt')
                        save_checkpoint(fusedNet, optimizer, lr_sched, steps)
            #save error at every epoch
            writer.add_scalar('errorAtEpoch/' + phase,
                              (tot_cls_loss / num_iter), steps)
            tot_cls_loss = 0.
        #if(steps%50 == 0):
        #    torch.save(fusedNet.module.state_dict(), save_model+phase+str(steps).zfill(6)+'.pt')
        #    save_checkpoint(fusedNet, optimizer, lr_sched, steps)
        steps += 1
        lr_sched.step()
def ensemble(mode, root, train_split, weights, num_classes):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    # test_transforms = transforms.Compose([])

    val_dataset = Dataset(train_split, 'test', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                                 shuffle=False, num_workers=2,
                                                 pin_memory=False)

    dataloaders = {'test': val_dataloader}
    datasets = {'test': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(weights))  # nslt_2000_000700.pt nslt_1000_010800 nslt_300_005100.pt(best_results)  nslt_300_005500.pt(results_reported) nslt_2000_011400
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    i3d.eval()

    correct = 0
    correct_5 = 0
    correct_10 = 0
    # confusion_matrix = np.zeros((num_classes,num_classes), dtype=np.int)

    top1_fp = np.zeros(num_classes, dtype=int)
    top1_tp = np.zeros(num_classes, dtype=int)

    top5_fp = np.zeros(num_classes, dtype=int)
    top5_tp = np.zeros(num_classes, dtype=int)

    top10_fp = np.zeros(num_classes, dtype=int)
    top10_tp = np.zeros(num_classes, dtype=int)

    for data in dataloaders["test"]:
        inputs, labels, video_id = data  # inputs: b, c, t, h, w

        t = inputs.size(2)
        num = 64
        if t > num:
            num_segments = math.floor(t / num)

            segments = []
            for k in range(num_segments):
                segments.append(inputs[:, :, k*num: (k+1)*num, :, :])

            segments = torch.cat(segments, dim=0)
            per_frame_logits = i3d(segments)

            predictions = torch.mean(per_frame_logits, dim=2)

            if predictions.shape[0] > 1:
                predictions = torch.mean(predictions, dim=0)

        else:
            per_frame_logits = i3d(inputs)
            predictions = torch.mean(per_frame_logits, dim=2)[0]

        out_labels = np.argsort(predictions.cpu().detach().numpy())

        if labels[0].item() in out_labels[-5:]:
            correct_5 += 1
            top5_tp[labels[0].item()] += 1
        else:
            top5_fp[labels[0].item()] += 1
        if labels[0].item() in out_labels[-10:]:
            correct_10 += 1
            top10_tp[labels[0].item()] += 1
        else:
            top10_fp[labels[0].item()] += 1
        if torch.argmax(predictions).item() == labels[0].item():
            correct += 1
            top1_tp[labels[0].item()] += 1
        else:
            top1_fp[labels[0].item()] += 1
        print(video_id, float(correct) / len(dataloaders["test"]), float(correct_5) / len(dataloaders["test"]),
              float(correct_10) / len(dataloaders["test"]))

    top1_per_class = np.mean(top1_tp / (top1_tp + top1_fp))
    top5_per_class = np.mean(top5_tp / (top5_tp + top5_fp))
    top10_per_class = np.mean(top10_tp / (top10_tp + top10_fp))
    print('top-k average per class acc: {}, {}, {}'.format(top1_per_class, top5_per_class, top10_per_class))
Example #16
File: train_i3d.py Project: jnwestra/WLASL
def run(configs,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        save_model='',
        num_classes=None,
        weights=None):
    print(configs)

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split,
                      'train',
                      root,
                      mode,
                      num_classes=num_classes,
                      transforms=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=configs.batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split,
                          'test',
                          root,
                          mode,
                          num_classes=num_classes,
                          transforms=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=configs.batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}
    datasets = {'train': dataset, 'test': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0

    best_val_score = 0
    # train it
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'min',
                                                           patience=5,
                                                           factor=0.3)
    while steps < configs.max_steps and epoch < 400:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            collected_vids = []

            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            confusion_matrix = np.zeros((num_classes, num_classes),
                                        dtype=int)
            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                if data == -1:  # bracewell does not compile opencv with ffmpeg, strange errors occur resulting in no video loaded
                    continue

                # inputs, labels, vid, src = data
                inputs, labels, vid = data

                # wrap them in Variable
                inputs = inputs.cuda()
                t = inputs.size(2)
                labels = labels.cuda()

                per_frame_logits = i3d(inputs, pretrained=False)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.data.item()

                predictions = torch.max(per_frame_logits, dim=2)[0]
                gt = torch.max(labels, dim=2)[0]

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.data.item()

                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[torch.argmax(gt[i]).item(),
                                     torch.argmax(predictions[i]).item()] += 1

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.data.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.data.item())
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    # lr_sched.step()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(
                            confusion_matrix)
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10, acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'test':
                val_score = float(
                    np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                if val_score > best_val_score or epoch % 2 == 0:
                    best_val_score = val_score
                    model_name = save_model + "nslt_" + str(
                        num_classes) + "_" + str(steps).zfill(
                            6) + '_%3f.pt' % val_score

                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
Example #17
def run(cfg):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(root=cfg['data_dir'],
                      mode=cfg['mode'],
                      transforms=test_transforms,
                      num=-1,
                      save_dir=cfg['save_dir'])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=cfg['batch_size'],
                                             shuffle=True,
                                             num_workers=16,
                                             pin_memory=True)

    # setup the model
    if cfg['mode'] == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    # i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(cfg['load_model']))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    tot_loss = 0.0
    tot_loc_loss = 0.0
    tot_cls_loss = 0.0

    map_dir = cfg['save_dir'] + '_map'

    if not os.path.exists(cfg['save_dir']):
        os.mkdir(cfg['save_dir'])
    if not os.path.exists(map_dir):
        os.mkdir(map_dir)

    # Iterate over data.
    for data in tqdm(dataloader):
        # get the inputs
        inputs, name = data
        mov = '_'.join(name[0].split('_')[:-1])

        if not os.path.exists(os.path.join(cfg['save_dir'], mov)):
            os.mkdir(os.path.join(cfg['save_dir'], mov))
        elif os.path.exists(
                os.path.join(cfg['save_dir'], mov, name[0] + '.npy')):
            continue

        if not os.path.exists(os.path.join(map_dir, mov)):
            os.mkdir(os.path.join(map_dir, mov))

        b, c, t, h, w = inputs.shape
        #print('LOG: {} shape: {}'.format(name[0], inputs.shape))
        if t > 1600:
            features = []
            maps = []
            for start in range(1, t - 56, 1600):
                end = min(t - 1, start + 1600 + 56)
                do_end_crop = True if end == start + 1600 + 56 else False
                start = max(1, start - 48)
                do_start_crop = True if start != 1 else False
                ip = Variable(torch.from_numpy(
                    inputs.numpy()[:, :, start:end]).cuda(),
                              volatile=True)
                map_pool, avg_pool = i3d.extract_features(ip)
                map_pool = map_pool.squeeze(0).permute(1, 2, 3,
                                                       0).data.cpu().numpy()
                avg_pool = avg_pool.squeeze(0).squeeze(-1).squeeze(-1).permute(
                    -1, 0).data.cpu().numpy()
                if do_end_crop:
                    #print('LOG: do end crop')
                    map_pool = map_pool[:-6, :, :, :]
                    avg_pool = avg_pool[:-6, :]
                if do_start_crop:
                    #print('LOG: do start crop')
                    map_pool = map_pool[6:, :, :, :]
                    avg_pool = avg_pool[6:, :]
                maps.append(map_pool)
                features.append(avg_pool)
                #print('LOG: maps: {}, features: {}'.format(map_pool.shape, avg_pool.shape))
            np.save(os.path.join(cfg['save_dir'], mov, name[0]),
                    np.concatenate(features, axis=0))
            np.save(os.path.join(map_dir, mov, name[0]),
                    np.concatenate(maps, axis=0))
        else:
            inputs = Variable(inputs.cuda(), volatile=True)
            map_pool, avg_pool = i3d.extract_features(inputs)
            #print('LOG: maps: {}, features: {}'.format(map_pool.shape, avg_pool.shape))
            np.save(
                os.path.join(cfg['save_dir'], mov, name[0]),
                avg_pool.squeeze(0).squeeze(-1).squeeze(-1).permute(
                    -1, 0).data.cpu().numpy())
            np.save(os.path.join(map_dir, mov, name[0]),
                    map_pool.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
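When a clip is longer than 1600 frames, example #17 extracts features chunk by chunk, adding extra frames of context on both sides and then cropping the corresponding feature steps before concatenating, so the stitched sequence contains each time step exactly once. A toy sketch of that stitch-with-overlap idea, with a fake extractor that simply strides by 8 (the chunk size, context size and stride below are illustrative, not the example's exact margins):

import numpy as np

STRIDE = 8                                    # assumed temporal stride of the extractor

def fake_extract(frames):                     # stand-in for i3d.extract_features
    return frames[::STRIDE]

frames = np.arange(4000)                      # toy "video": 4000 frame indices
chunk, ctx = 1600, 48                         # chunk length and extra context per side
pieces = []
for start in range(0, len(frames), chunk):
    lo = max(0, start - ctx)                              # add left context
    hi = min(len(frames), start + chunk + ctx)            # add right context
    feats = fake_extract(frames[lo:hi])
    left = (start - lo) // STRIDE                         # feature steps owed to the left context
    right = max(0, hi - min(len(frames), start + chunk)) // STRIDE
    feats = feats[left:len(feats) - right]                # keep only this chunk's steps
    pieces.append(feats)
stitched = np.concatenate(pieces)
assert np.array_equal(stitched, fake_extract(frames))     # no duplicated or missing steps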
예제 #18
0
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='../data/BL_and_PL',
        train_split='../ms-tcn/data/BL_and_PL/groundTruth/annotations.json',
        batch_size=1,
        save_model=''):

    # Transforms
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip()
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    # Dataset and dataloader
    dataset = Dataset(train_split,
                      'training',
                      root,
                      mode,
                      train_transforms,
                      save_dir='saveds')
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=0,
                                             pin_memory=True)
    dataloaders = {'train': dataloader}
    datasets = {'train': dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))

    # We have 13 different classes.
    i3d.replace_logits(13)  # 13 classes here (the Charades head used 157)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    # Optimization stuff
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
    num_steps_per_update = 4  # accum gradient
    steps = 0

    # train:
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:  #, 'val']:

            # Train model if in train phase
            i3d.train(phase == 'train')

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels, vid = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()  #.data[0]

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()  #.data[0]

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()  #.data[0]
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print(
                            '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                            .format(phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(),
                                   save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.
                      format(phase, tot_loc_loss / num_iter,
                             tot_cls_loss / num_iter,
                             (tot_loss * num_steps_per_update) / num_iter))
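These training snippets share the same two-part objective: the per-frame logits come out of I3D temporally downsampled, get stretched back to the input length with a linear interpolation, and then contribute a per-frame localization BCE plus a video-level classification BCE computed after max-pooling logits and labels over time. A small self-contained sketch of that loss (shapes are made up; F.interpolate is the current name for the deprecated F.upsample used above):

import torch
import torch.nn.functional as F

B, C, T_in, T_out = 2, 13, 64, 8                       # toy batch, classes, frames, logit steps
per_frame_logits = torch.randn(B, C, T_out, requires_grad=True)
labels = torch.randint(0, 2, (B, C, T_in)).float()     # per-frame multi-hot labels

# stretch the logits along time so they line up with the per-frame labels
upsampled = F.interpolate(per_frame_logits, size=T_in, mode='linear', align_corners=False)

# localization loss: every frame, every class
loc_loss = F.binary_cross_entropy_with_logits(upsampled, labels)

# classification loss: max-pool over time first, so it is a video-level term
cls_loss = F.binary_cross_entropy_with_logits(
    torch.max(upsampled, dim=2)[0], torch.max(labels, dim=2)[0])

loss = 0.5 * loc_loss + 0.5 * cls_loss
loss.backward()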
예제 #19
0
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        batch_size=3 * 15,
        save_model='',
        weights=None):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    val_dataset = Dataset(train_split, 'test', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                                 shuffle=False, num_workers=2,
                                                 pin_memory=False)

    dataloaders = {'test': val_dataloader}
    datasets = {'test': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(weights))  # nslt_2000_000700.pt nslt_1000_010800 nslt_300_005100.pt(best_results)  nslt_300_005500.pt(results_reported) nslt_2000_011400
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    i3d.eval()

    correct = 0
    correct_5 = 0
    correct_10 = 0

    top1_fp = np.zeros(num_classes, dtype=int)
    top1_tp = np.zeros(num_classes, dtype=int)

    top5_fp = np.zeros(num_classes, dtype=int)
    top5_tp = np.zeros(num_classes, dtype=int)

    top10_fp = np.zeros(num_classes, dtype=int)
    top10_tp = np.zeros(num_classes, dtype=int)

    for data in dataloaders["test"]:
        inputs, labels, video_id = data  # inputs: b, c, t, h, w

        per_frame_logits = i3d(inputs)

        predictions = torch.max(per_frame_logits, dim=2)[0]
        out_labels = np.argsort(predictions.cpu().detach().numpy()[0])
        out_probs = np.sort(predictions.cpu().detach().numpy()[0])

        if labels[0].item() in out_labels[-5:]:
            correct_5 += 1
            top5_tp[labels[0].item()] += 1
        else:
            top5_fp[labels[0].item()] += 1
        if labels[0].item() in out_labels[-10:]:
            correct_10 += 1
            top10_tp[labels[0].item()] += 1
        else:
            top10_fp[labels[0].item()] += 1
        if torch.argmax(predictions[0]).item() == labels[0].item():
            correct += 1
            top1_tp[labels[0].item()] += 1
        else:
            top1_fp[labels[0].item()] += 1
        print(video_id, float(correct) / len(dataloaders["test"]), float(correct_5) / len(dataloaders["test"]),
              float(correct_10) / len(dataloaders["test"]))

        # per-class accuracy
    top1_per_class = np.mean(top1_tp / (top1_tp + top1_fp))
    top5_per_class = np.mean(top5_tp / (top5_tp + top5_fp))
    top10_per_class = np.mean(top10_tp / (top10_tp + top10_fp))
    print('top-k average per class acc: {}, {}, {}'.format(top1_per_class, top5_per_class, top10_per_class))
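The evaluation loop above turns the per-frame logits into one score per class by max-pooling over time, ranks the classes with np.argsort, and counts a top-k hit whenever the ground-truth class id appears among the k highest-scoring indices; the same hit/miss counters per class give the per-class averages at the end. A compact sketch of that top-k check (the helper below is illustrative, not part of the example):

import numpy as np

def topk_hits(scores, label, ks=(1, 5, 10)):
    """scores: 1-D per-class scores for one video; label: ground-truth class id."""
    order = np.argsort(scores)                # ascending, so the top-k live at the end
    return {k: int(label in order[-k:]) for k in ks}

rng = np.random.default_rng(0)
scores = rng.normal(size=400)                 # toy 400-class score vector
scores[123] += 10.0                           # make class 123 the clear winner
print(topk_hits(scores, 123))                 # {1: 1, 5: 1, 10: 1}
print(topk_hits(scores, 7))                   # almost certainly {1: 0, 5: 0, 10: 0}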
예제 #20
0
def run(max_steps=64e3, mode='rgb', root='/ssd2/charades/Charades_v1_rgb', batch_size=1, load_model='', save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(root, mode, test_transforms, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    #val_dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir)
    #val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader}
    datasets = {'train': dataset}


    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()


    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for phase in ['train']:
        i3d.train(False)  # Set model to evaluate mode

        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0

        # Iterate over data.
        for data in tqdm(dataloaders[phase]):
            # get the inputs
            inputs, labels, name = data
            name = name[0].split('frames_hq/')[1:]

            parent_dir = os.path.join(save_dir, os.path.dirname(name[0]))
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)

            #if os.path.exists(os.path.join(save_dir, name[0]+'.npy')):
            #    continue

            b,c,t,h,w = inputs.shape
            if t > 128:
                features = []
                for start in range(0, t, 128):
                    end = min(t, start + 128)
                    if (end - start < 8):
                        start = start - 8
                    with torch.no_grad():
                        ip = torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda()
                        features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).cpu().numpy())
                np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
            else:
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), volatile=True)
                features = i3d.extract_features(inputs)
                np.save(os.path.join(save_dir, name[0]), features.squeeze(0).permute(1,2,3,0).data.cpu().numpy())
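Several of these extraction scripts still wrap inputs in Variable(..., volatile=True), the pre-0.4 way of turning autograd off; on current PyTorch the torch.no_grad() context does the same job with plain tensors. A hedged sketch of the equivalent modern pattern (the extract_features interface is assumed to match the I3D wrappers used in these examples):

import numpy as np
import torch

def extract_and_save(model, clip, out_path):
    """Run the extractor on one (1, C, T, H, W) clip and dump its features to .npy."""
    model.eval()
    with torch.no_grad():                          # replaces Variable(..., volatile=True)
        feats = model.extract_features(clip)       # assumed to return (1, C', T', 1, 1)
    feats = feats.squeeze(0).permute(1, 2, 3, 0)   # -> (T', 1, 1, C'), like the saves above
    np.save(out_path, feats.cpu().numpy())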

if not os.path.exists(args.images_dir):
    os.makedirs(args.images_dir)

if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir)

# create I3D model and load pre-trained model
i3d_model = InceptionI3d(400, in_channels=3)
if args.use_finetuned:
    i3d_model.replace_logits(157)  # charades has 157 activity types
i3d_model.load_state_dict(torch.load(args.load_model))
i3d_model.cuda()
i3d_model.train(False)
video_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# load video ids
video_ids = []
for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]:
    with open(os.path.join(args.dataset_dir, filename),
              mode="r",
              encoding="utf-8") as f:
        for line in f:
            line = line.lstrip().rstrip()
            if len(line) == 0:
                continue
            vid = line.split("##")[0].split(" ")[0]
            video_ids.append(vid)
video_ids = list(set(video_ids))
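Each line in the charades_sta split files couples a video id and a time span with a sentence after the '##' separator, which is why the parse above keeps only the first token before '##'. A one-line illustration with a made-up annotation line:

line = "AO8RW 0.0 6.9##a person is putting a book on a shelf."   # illustrative line
vid = line.split("##")[0].split(" ")[0]
print(vid)                                                       # AO8RW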
예제 #22
0
def run(init_lr=0.1,
        max_steps=1e8,
        mode='rgb',
        dataset='thumos',
        root_train='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/',
        root_eval='/mnt/data_a/alex/PyTorch_I3D/thumos/test/',
        train_split='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/validation_thumos.json',
        eval_split='/mnt/data_a/alex/PyTorch_I3D/thumos/test/test_thumos.json',
        batch_size=4,
        batch_size_eval=4,
        save_model='',
        snippets=64,
        saving_steps=5000,
        num_steps_per_update=1,
        num_classes=65,
        crf=False,
        num_updates_crf=1,
        reg_crf=-1,
        use_cls=False,
        pairwise_cond_crf=False,
        reg_type='l2'):

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root_train, mode, snippets,
                      train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True,
                                             drop_last=True)

    val_dataset = Dataset(eval_split, 'testing', root_eval, mode, snippets,
                          test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size_eval,
                                                 shuffle=True,
                                                 num_workers=8,
                                                 pin_memory=True,
                                                 drop_last=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup model
    steps = 0
    epoch = 0
    if not os.path.exists(args.save_model):
        subprocess.call('mkdir ' + args.save_model, shell=True)
    configure(args.save_model + "tensorboard_logger", flush_secs=5)

    # resume the training or load the pre-trained I3D
    checkpoint = -1
    try:
        checkpoint = last_checkpoint(args.save_model)
    except:
        print("Loading the pre-trained I3D")
        if mode == 'flow':
            i3d = InceptionI3d(400,
                               in_channels=2,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/flow_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)

        else:
            i3d = InceptionI3d(400,
                               in_channels=3,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/rgb_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)

        i3d.replace_logits(num_classes)

    if (checkpoint != -1):
        if mode == 'flow':
            i3d = InceptionI3d(num_classes,
                               in_channels=2,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)

        else:
            i3d = InceptionI3d(num_classes,
                               in_channels=3,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)

        i3d.load_state_dict(torch.load(args.save_model + checkpoint))
        steps = int(checkpoint[:-3])
        if dataset == 'thumos':
            epoch = int(steps * snippets * batch_size * num_steps_per_update /
                        1214016)
        else:
            epoch = int(steps * snippets * batch_size * num_steps_per_update /
                        5482688)

    # push the pipeline on multiple gpus if possible
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    # setup optimizer
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                              milestones=[1000],
                                              gamma=0.1)
    if steps > 0:
        for i in range(steps):
            lr_sched.step()

    # train the model
    while steps < max_steps:
        epoch += 1
        print('-' * 10)
        print('Epoch {}'.format(epoch))
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print('Entering training loop...')
                i3d.train()
            else:
                print('Entering validation loop...')
                i3d.eval()
                time_init_eval = time.time()

            cumul_pred = Cumulator(num_classes)
            cumul_labels = Cumulator(num_classes)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            tot_loss_updt = 0.0
            tot_loc_loss_updt = 0.0
            tot_cls_loss_updt = 0.0
            tot_reg_loss_updt = 0.0
            num_iter = 0
            optimizer.zero_grad()
            count_batch = 0
            gap_train = 0

            print("Losses initialized to 0")

            # Iterate over data.
            for data in dataloaders[phase]:
                time_init_batch = time.time()
                count_batch += 1
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                # forward
                if crf:
                    per_frame_logits_ante_crf, per_frame_logits = i3d(inputs)
                else:
                    per_frame_logits = i3d(inputs)

                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')
                if crf:
                    per_frame_logits_ante_crf = F.upsample(
                        per_frame_logits_ante_crf, t, mode='linear')

                # accumulate predictions and ground truths
                pred_np = pt_var_to_numpy(nn.Sigmoid()(per_frame_logits))
                cumul_pred.append(pred_np)
                labels_np = pt_var_to_numpy(labels)
                cumul_labels.append(labels_np)

                # compute localization loss
                if crf:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits,
                        labels) + F.binary_cross_entropy_with_logits(
                            per_frame_logits_ante_crf, labels)
                else:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()
                tot_loc_loss_updt += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                if crf:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(
                            labels,
                            dim=2)[0]) + F.binary_cross_entropy_with_logits(
                                torch.max(per_frame_logits_ante_crf, dim=2)[0],
                                torch.max(labels, dim=2)[0])
                else:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()
                tot_cls_loss_updt += cls_loss.item()

                # compute regularization loss for the crf module
                if crf and (reg_crf > 0 and not pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'crf', reg_type)
                    tot_reg_loss_updt += reg_loss.item()
                elif crf and (reg_crf > 0 and pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'psi_0',
                                            reg_type) + get_reg_loss(
                                                i3d, 'psi_1', reg_type)
                    tot_reg_loss_updt += reg_crf * reg_loss.item()
                else:
                    reg_loss = 0

                # put all the losses together
                if use_cls:
                    loss = (0.5 * loc_loss + 0.5 * cls_loss +
                            reg_crf * reg_loss) / num_steps_per_update
                else:
                    loss = (loc_loss +
                            reg_crf * reg_loss) / num_steps_per_update

                tot_loss += loss.item()
                tot_loss_updt += loss.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    examples_processed_updt = num_steps_per_update * batch_size * snippets
                    examples_processed_tot = count_batch * batch_size * snippets
                    map_train = map_calculator(cumul_pred.accumuled[1:],
                                               cumul_labels.accumuled[1:])
                    gap_train = ap_calculator(
                        cumul_pred.accumuled[1:].flatten(),
                        cumul_labels.accumuled[1:].flatten())
                    print(
                        'TRAINING - Epoch: {} Step: {} Examples processed {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} Reg Loss: {:.6f} mAP: {:.6f} GAP: {:.6f}'
                        .format(
                            epoch, steps, examples_processed_tot,
                            tot_loc_loss_updt / examples_processed_updt,
                            tot_cls_loss_updt / examples_processed_updt,
                            tot_loss_updt / (batch_size * snippets), reg_crf *
                            tot_reg_loss_updt / examples_processed_updt,
                            map_train, gap_train))
                    log_value('Training_loc_loss',
                              tot_loc_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_cls_loss',
                              tot_cls_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_reg_loss',
                              tot_reg_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_tot_loss',
                              tot_loss_updt / (batch_size * snippets), steps)
                    log_value('Training_mAP', map_train, steps)
                    log_value('Training_GAP', gap_train, steps)
                    tot_loss_updt, tot_loc_loss_updt, tot_cls_loss_updt, tot_reg_loss_updt = 0.0, 0.0, 0.0, 0.0
                    cumul_pred.clear()
                    cumul_labels.clear()
                    cumul_pred = Cumulator(num_classes)
                    cumul_labels = Cumulator(num_classes)

                if ((steps % saving_steps)
                        == 0) & (phase == 'train') & (num_iter == 0):
                    # save model
                    print("EPOCH: {} Step: {} - Saving model...".format(
                        epoch, steps))
                    torch.save(i3d.module.state_dict(),
                               save_model + str(steps).zfill(6) + '.pt')
                    tot_loss = tot_loc_loss = tot_cls_loss = 0.

                if phase == 'val':
                    time_end_batch = time.time()
                    examples_processed_tot = count_batch * batch_size_eval * snippets
                    print(
                        'EVAL - Epoch: {} Step: {} Examples processed {} - Time for batch: {}'
                        .format(epoch, steps, examples_processed_tot,
                                time_end_batch - time_init_batch))
                    log_value('Evaluation time',
                              time_end_batch - time_init_batch,
                              examples_processed_tot)

            if phase == 'val':
                examples_processed_tot = count_batch * batch_size_eval * snippets
                map_val = map_calculator(cumul_pred.accumuled[1:],
                                         cumul_labels.accumuled[1:])
                gap_val = ap_calculator(cumul_pred.accumuled[1:].flatten(),
                                        cumul_labels.accumuled[1:].flatten())
                time_end_eval = time.time()
                print(
                    'EVAL - Epoch: {} Step: {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} mAP: {:.4f} GAP: {:.4f} Total time: {}'
                    .format(
                        epoch, steps, tot_loc_loss / examples_processed_tot,
                        tot_cls_loss / examples_processed_tot, tot_loss_updt *
                        num_steps_per_update / examples_processed_tot, map_val,
                        gap_val, time_end_eval - time_init_eval))
                log_value('Validation_subset_loc_loss',
                          tot_loc_loss / examples_processed_tot, steps)
                log_value('Validation_subset_cls_loss',
                          tot_cls_loss / examples_processed_tot, steps)
                log_value(
                    'Validation_subset_tot_loss', tot_loss_updt *
                    num_steps_per_update / examples_processed_tot, steps)
                log_value('Validation_subset_mAP', map_val, steps)
                log_value('Validation_subset_GAP', gap_val, steps)
                cumul_pred.clear()
                cumul_labels.clear()
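Resuming in example #22 leans on the checkpoint filename: last_checkpoint (a project helper not shown here) returns the newest 'NNNNNN.pt' file, the step count is parsed out of the name, and the LR scheduler is stepped forward that many times so its milestones line up with where training stopped. A minimal sketch of that resume pattern under those assumptions:

import os
import re
import torch

def resume_from_dir(model, lr_sched, ckpt_dir):
    """Pick the newest zero-padded 'NNNNNN.pt' checkpoint, load it, and fast-forward
    the scheduler by the step count encoded in the filename. Returns that step count."""
    ckpts = sorted(f for f in os.listdir(ckpt_dir) if re.fullmatch(r"\d{6}\.pt", f))
    if not ckpts:
        return 0                               # nothing to resume from
    latest = ckpts[-1]
    model.load_state_dict(torch.load(os.path.join(ckpt_dir, latest)))
    steps = int(latest[:-3])                   # '012345.pt' -> 12345
    for _ in range(steps):                     # replay the scheduler steps already taken
        lr_sched.step()
    return steps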
예제 #23
0
def run(max_steps=64e3, mode='rgb', root='../data/BL_and_PL', split='../data/BL_and_PL/new_annotations.json', batch_size=1, load_model='', save_dir='args.save_dir'):
    # setup dataset
    
    #test_transforms = transforms.Compose([videotransforms.RandomCrop(224),
    #                                      videotransforms.RandomHorizontalFlip(),
    #])

    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split, 'training', root, mode, test_transforms, num=-1, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                            shuffle=True, num_workers=0, pin_memory=True)
    dataloaders = {'train': dataloader}
    datasets = {'train': dataset}
    
    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2, temp_window = args.temporal_window)
        input_padder = nn.ReplicationPad3d(padding=(0,0,0,0,
                                    args.temporal_window//2,args.temporal_window//2))
    #, final_endpoint= 'Mixed_5c'
    else:
        i3d = InceptionI3d(400,in_channels=3, temp_window = args.temporal_window) 
        input_padder = nn.ReplicationPad3d(padding=(0,0,0,0,
                                    args.temporal_window//2,args.temporal_window//2))

    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()

    for phase in ['train']:#, 'val']:
        i3d.train(False)  # Set model to evaluate mode
                
        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0
                    
        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data

            print('extracting {} features for {} and tw {}'.format(mode, name[0],args.temporal_window))

            #if os.path.exists(os.path.join(save_dir, name[0]+'.npy')):
            #    continue

            b,c,t,h,w = inputs.shape
            if t > 1600:
                features = []
                for start in range(1, t-56, 1600):
                    end = min(t-1, start+1600+56)
                    start = max(1, start-48)

                    ip = Variable(torch.from_numpy(inputs.numpy()[:,:,start:end]).cuda(), volatile=True)

                    features.append(i3d.extract_features(ip).squeeze(0).permute(1,2,3,0).data.cpu().numpy())
                np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
            else:
                # Temporally pad inputs so that the output temporal dimension is preserved:
                no_frames = inputs.shape[2]
                inputs = input_padder(inputs)

                per_frame_features = []#torch.zeros((1,1024,1,1,1))

                # We want per-frame features: the MS-TCN authors slid a temporal window
                # over each frame and fed that window to the network.

                for w in range(no_frames):

                    windowed_inputs = inputs[:,:, w:(w+(args.temporal_window)), :,:].cuda()
                    features = i3d.extract_features(windowed_inputs)

                    per_frame_features.append(features.cpu().data)
                    if w % 10 == 0:
                        print('         {}'.format(w) )

                np.save(os.path.join(save_dir, name[0]), 
                        np.concatenate(per_frame_features,axis=2)[0,:,:,0,0])
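Example #23 relies on ReplicationPad3d to pad only the temporal dimension by temporal_window // 2 on each side, so sliding a window of that length across the padded clip yields exactly one feature per original frame, as the MS-TCN-style comment above describes. A toy check of that padding/window arithmetic (shapes are illustrative and the window length is assumed odd):

import torch
import torch.nn as nn

T, W = 12, 9                                   # toy clip length and (odd) temporal window
clip = torch.randn(1, 3, T, 32, 32)            # (B, C, T, H, W), small spatial size for speed

# padding tuple is (W_left, W_right, H_top, H_bottom, T_front, T_back):
padder = nn.ReplicationPad3d((0, 0, 0, 0, W // 2, W // 2))
padded = padder(clip)                          # time grows to T + 2 * (W // 2)

windows = [padded[:, :, t:t + W] for t in range(T)]
assert len(windows) == T and all(w.shape[2] == W for w in windows)
# each window would then go through i3d.extract_features, giving one feature per frame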
예제 #24
0
def predict_video(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
        
    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length, clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)
    all_start_scores = np.concatenate(all_start_scores)
    all_end_scores = np.concatenate(all_end_scores)
#     print(all_start_scores)
#     print(all_end_scores)
#     start_peak_indices = []
#     end_peak_indices = []
    
#     if all_start_scores[0] > all_start_scores[1]:
#         start_peak_indices.append(0)
#     for i in range(1, len(cabin_clips) - 1):
#         if all_start_scores[i] > all_start_scores[i - 1]:
#             if all_start_scores[i] > all_start_scores[i + 1]:
#                 start_peak_indices.append(i)
#         if all_end_scores[i] > all_end_scores[i - 1]:
#             if all_end_scores[i] > all_end_scores[i + 1]:
#                 end_peak_indices.append(i)
#     if all_end_scores[-1] > all_end_scores[-2]:
#         end_peak_indices.append(len(cabin_clips) - 1)

#     j = 0
#     copy_start_peak_indices = start_peak_indices.copy()
#     while j < len(start_peak_indices) - 1:
#         index1 = copy_start_peak_indices[j]
#         index2 = copy_start_peak_indices[j + 1]
#         if index1 + 4 < index2:
#             j += 1
#         else:
#             if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j+1]]:
#                 copy_start_peak_indices[j] = index2
#                 copy_start_peak_indices.pop(j + 1)
#                 start_peak_indices.pop(j + 1)

#             else:
#                 copy_start_peak_indices.pop(j)
#                 start_peak_indices.pop(j)

#     k = 0
#     copy_end_peak_indices = end_peak_indices.copy()
#     while k < len(end_peak_indices) - 1:
#         index1 = copy_end_peak_indices[k]
#         index2 = copy_end_peak_indices[k + 1]
#         if index1 + 4 < index2:
#             k += 1
#         else:
#             if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k+1]]:
#                 copy_end_peak_indices[k] = index2
#                 copy_end_peak_indices.pop(k + 1)
#                 end_peak_indices.pop(k + 1)
#             else:
#                 copy_end_peak_indices.pop(k)
#                 end_peak_indices.pop(k)
                
    selected_starts = []
    selected_ends = []
    for i in range(len(all_start_scores)):
        if all_start_scores[i] > threshold:
            selected_starts.append(i)
    for j in range(len(all_end_scores)):
        if all_end_scores[j] > threshold:
            selected_ends.append(j)        
    return selected_starts, selected_ends, all_start_scores, indices_in_cabin_clips
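After scoring every clip, predict_video simply thresholds the start and end scores to nominate candidate boundary clips (the commented-out block above sketches an alternative peak-picking scheme). A self-contained illustration of that final thresholding step (scores and threshold are made up):

import numpy as np

def select_boundaries(start_scores, end_scores, threshold=0.5):
    """Any clip whose start (or end) score clears the threshold becomes a candidate."""
    starts = [i for i, s in enumerate(start_scores) if s > threshold]
    ends = [j for j, e in enumerate(end_scores) if e > threshold]
    return starts, ends

start_scores = np.array([0.1, 0.7, 0.2, 0.9, 0.3])
end_scores = np.array([0.2, 0.1, 0.8, 0.2, 0.6])
print(select_boundaries(start_scores, end_scores))    # ([1, 3], [2, 4])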
예제 #25
0
def run(max_steps=64e3,
        mode='rgb',
        root='',
        split='',
        batch_size=1,
        save_dir=''):
    #tf.logging.set_verbosity(tf.logging.INFO)
    eval_type = mode

    imagenet_pretrained = False

    NUM_CLASSES = 400
    if eval_type == 'rgb600':
        NUM_CLASSES = 600

    if eval_type not in ['rgb', 'rgb600', 'flow', 'joint']:
        raise ValueError(
            'Bad `eval_type`, must be one of rgb, rgb600, flow, joint')

    if eval_type == 'rgb600':
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH_600)]
    else:
        kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

    if eval_type in ['rgb', 'rgb600', 'joint']:
        # RGB input has 3 channels.
        rgb_input = tf.placeholder(tf.float32,
                                   shape=(1, _SAMPLE_VIDEO_FRAMES, _IMAGE_SIZE,
                                          _IMAGE_SIZE, 3))

        with tf.variable_scope('RGB'):
            rgb_model = i3d.InceptionI3d(NUM_CLASSES,
                                         spatial_squeeze=True,
                                         final_endpoint='Mixed_5c')
            rgb_logits, _ = rgb_model(rgb_input,
                                      is_training=False,
                                      dropout_keep_prob=1.0)

        rgb_variable_map = {}
        for variable in tf.global_variables():

            if variable.name.split('/')[0] == 'RGB':
                if eval_type == 'rgb600':
                    rgb_variable_map[variable.name.replace(
                        ':0', '')[len('RGB/inception_i3d/'):]] = variable
                else:
                    rgb_variable_map[variable.name.replace(':0',
                                                           '')] = variable

        rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True)

    if eval_type in ['flow', 'joint']:
        # Flow input has only 2 channels.
        flow_input = tf.placeholder(tf.float32,
                                    shape=(None, _SAMPLE_VIDEO_FRAMES,
                                           _IMAGE_SIZE, _IMAGE_SIZE, 2))
        with tf.variable_scope('Flow'):
            flow_model = i3d.InceptionI3d(NUM_CLASSES,
                                          spatial_squeeze=True,
                                          final_endpoint='Mixed_5c')
            flow_logits, _ = flow_model(flow_input,
                                        is_training=False,
                                        dropout_keep_prob=1.0)

        flow_variable_map = {}
        for variable in tf.global_variables():
            if variable.name.split('/')[0] == 'Flow':
                flow_variable_map[variable.name.replace(':0', '')] = variable
        flow_saver = tf.train.Saver(var_list=flow_variable_map, reshape=True)

    if eval_type == 'rgb' or eval_type == 'rgb600':
        model_logits = rgb_logits
    elif eval_type == 'flow':
        model_logits = flow_logits
    else:
        model_logits = rgb_logits + flow_logits
    #model_predictions = tf.nn.softmax(model_logits)

    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(split,
                      'training',
                      root,
                      mode,
                      test_transforms,
                      save_dir=save_dir)

    with tf.Session() as sess:
        feed_dict = {}

        while True:
            inputs, labels, name = dataset.next_batch()
            if name == '0': break
            i = 0
            for input in inputs:
                i += 1
                c, t, h, w = input.shape

                if eval_type in ['rgb', 'rgb600', 'joint']:
                    if imagenet_pretrained:
                        rgb_saver.restore(sess,
                                          _CHECKPOINT_PATHS['rgb_imagenet'])
                    else:
                        rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type])
                    #tf.logging.info('RGB checkpoint restored')
                    rgb_sample = input[np.newaxis, :]
                    #tf.logging.info('RGB data loaded, shape=%s', str(rgb_sample.shape))
                    feed_dict[rgb_input] = rgb_sample

                if eval_type in ['flow', 'joint']:
                    if imagenet_pretrained:
                        flow_saver.restore(sess,
                                           _CHECKPOINT_PATHS['flow_imagenet'])
                    else:
                        flow_saver.restore(sess, _CHECKPOINT_PATHS['flow'])
                    #tf.logging.info('Flow checkpoint restored')
                    flow_sample = input[np.newaxis, :]
                    # tf.logging.info('Flow data loaded, shape=%s', str(flow_sample.shape))
                    feed_dict[flow_input] = flow_sample

                out_logits = sess.run([model_logits], feed_dict=feed_dict)

                out_logits = out_logits[0]

                new_path = os.path.join(save_dir, name, mode)
                if not os.path.exists(new_path):
                    os.makedirs(new_path)
                np.save(os.path.join(new_path, str(i)),
                        out_logits.reshape(1024))
예제 #26
0
def run(max_steps=64e3,
        mode='flow',
        root='./frames',
        split='gt.json',
        batch_size=1,
        load_model='',
        save_dir=''):
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split,
                      'training',
                      root,
                      mode,
                      test_transforms,
                      num=-1,
                      save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True)

    val_dataset = Dataset(split,
                          'test',
                          root,
                          mode,
                          test_transforms,
                          num=-1,
                          save_dir=save_dir)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=8,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(20, in_channels=2)
    else:
        i3d = InceptionI3d(20, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()

    for phase in ['train', 'val']:
        i3d.train(False)  # Set model to evaluate mode

        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0

        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data
            # if os.path.exists(os.path.join(save_dir, name[0] + '.npy')):
            #     continue

            b, c, t, h, w = inputs.shape
            if t > 16:
                features = []
                for start in range(0, t, 16):
                    end = min(t - 1, start + 16)
                    if end < start + 16:
                        break
                    # start = max(1, start - 48)
                    ip = Variable(torch.from_numpy(
                        inputs.numpy()[:, :, start:end]).cuda(),
                                  volatile=True)
                    feature = i3d.extract_features(ip)
                    feature = torch.squeeze(feature)
                    features.append(feature.data.cpu().numpy())
                np.save(os.path.join(save_dir, name[0]), np.asarray(features))
            else:
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), volatile=True)
                features = i3d.extract_features(inputs)
                np.save(
                    os.path.join(save_dir, name[0]),
                    features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
예제 #27
0
def main():
    best_prec1 = 0
    with open(
            'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
            '_validation.txt', 'a') as f:
        f.write("=============================================")
        f.write('\n')
        f.write("lr: ")
        f.write(str(args.lr))
        f.write(" lr_step: ")
        f.write(str(args.lr_steps))
        f.write(" dataset: ")
        f.write(str(args.dataset))
        f.write(" modality: ")
        f.write(str(args.mode))
        f.write(" dropout: ")
        f.write(str(args.dropout))
        f.write(" batch size: ")
        f.write(str(args.batch_size))
        f.write('\n')
    if args.dataset == 'ucf101':
        num_class = 101
        data_length = 64
        image_tmpl = "frame{:06d}.jpg"
    elif args.dataset == 'hmdb51':
        num_class = 51
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    elif args.dataset == 'kinetics':
        num_class = 400
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    else:
        raise ValueError('Unknown dataset ' + args.dataset)

    val_logger = Logger(
        'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
        '_val.log', ['epoch', 'acc'])
    # define loss function (criterion) and optimizer
    #======================data transform=============

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip()
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    #=======================design the dataset==============
    train_dataset = I3dDataSet("",
                               args.train_list,
                               num_segments=1,
                               new_length=data_length,
                               modality=args.mode,
                               dataset=args.dataset,
                               image_tmpl=image_tmpl if args.mode
                               in ["rgb", "RGBDiff"] else args.flow_prefix +
                               "{}_{:05d}.jpg",
                               transform=train_transforms,
                               test_mode=False)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=8,
                                               pin_memory=True)

    val_dataset = I3dDataSet("",
                             args.val_list,
                             num_segments=1,
                             new_length=data_length,
                             modality=args.mode,
                             dataset=args.dataset,
                             image_tmpl=image_tmpl if args.mode
                             in ["rgb", "RGBDiff"] else args.flow_prefix +
                             "{}_{:05d}.jpg",
                             random_shift=False,
                             transform=test_transforms,
                             test_mode=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=8,
                                             pin_memory=True)

    dataloaders = {'train': train_loader, 'val': val_loader}
    datasets = {'train': train_dataset, 'val': val_dataset}

    #=============================set the model ==================
    # setup the model
    if args.mode == 'flow':
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        else:
            Exception("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_flow.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)
    else:
        #i3d = InceptionI3d(400, in_channels=3)
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        else:
            Exception("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_rgb.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)

    i3d.cuda()
    #print(i3d)
    #============================set SGD, critization and lr ==================
    optimizer = torch.optim.SGD(i3d.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                dampening=0,
                                nesterov=False)
    model = nn.DataParallel(i3d)
    criterion = torch.nn.NLLLoss().cuda()
    disturb = DisturbLabel(alpha=10, C=51)
    # criterion = FocalLoss(gamma = 0).cuda()
    #print(model)

    writer = SummaryWriter()  #create log folders for plot
    timer = Timer()
    for epoch in range(1, args.epochs):
        timer.tic()
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train_prec1, train_loss = train(train_loader, model, criterion,
                                        optimizer, epoch, disturb)
        writer.add_scalar('Train/Accu', train_prec1, epoch)
        writer.add_scalar('Train/Loss', train_loss, epoch)
        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, val_loss = validate(val_loader, model, criterion,
                                       (epoch + 1) * len(train_loader))
            writer.add_scalar('Val/Accu', prec1, epoch)
            writer.add_scalar('Val/Loss', val_loss, epoch)
            writer.add_scalars('data/Acc', {
                'train_prec1': train_prec1,
                'val_prec1': prec1
            }, epoch)
            writer.add_scalars('data/Loss', {
                'train_loss': train_loss,
                'val_loss': val_loss
            }, epoch)
            #scheduler.step(val_loss)
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, best_prec1)
            val_logger.log({'epoch': epoch, 'acc': prec1})
        timer.toc()
        left_time = timer.average_time * (args.epochs - epoch)
        print("best_prec1 is: {}".format(best_prec1))
        print("left time is: {}".format(timer.format(left_time)))
        with open(
                'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
                '_validation.txt', 'a') as f:
            f.write(str(epoch))
            f.write(" ")
            f.write(str(train_prec1))
            f.write(" ")
            f.write(str(prec1))
            f.write(" ")
            f.write(timer.format(timer.diff))
            f.write('\n')
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
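The epoch loop above calls adjust_learning_rate(optimizer, epoch, args.lr_steps), which is not defined in this snippet; a plausible step-decay version is sketched below (the base LR, decay factor and the exact formula are assumptions, not the example's actual helper):

import numpy as np

def adjust_learning_rate(optimizer, epoch, lr_steps, base_lr=0.001, decay=0.1):
    """Hypothetical step-decay helper: divide the LR by 10 after each milestone epoch."""
    passed = int(np.sum(epoch >= np.array(lr_steps)))   # milestones already reached
    lr = base_lr * (decay ** passed)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr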
예제 #28
0
def run(dataset_path, db_filename, model_path, output_path, frames_per_clip=16,
        testset_filename='test_cross_env.txt', trainset_filename='train_cross_env.txt', frame_skip=1,
        batch_size=8, device='dev3', arch='HCN', pose_path='predictions/pose2d/openpose'):

    pred_output_filename = os.path.join(output_path, 'pred.npy')
    json_output_filename = os.path.join(output_path, 'action_segments.json')

    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    test_dataset = Dataset(dataset_path, db_filename=db_filename, test_filename=testset_filename,
                           train_filename=trainset_filename, transform=test_transforms, set='test', camera=device,
                           frame_skip=frame_skip, frames_per_clip=frames_per_clip, mode='img', pose_path=pose_path,
                           arch=arch)

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=6,
                                                  pin_memory=True)

    # setup the model
    num_classes = test_dataset.num_classes

    if arch == 'HCN':
        model = HCN.HCN(in_channel=2, num_joint=19, num_person=1, out_channel=64, window_size=frames_per_clip,
                        num_class=num_classes)
    elif arch == 'ST_GCN':
        graph_args = {'layout': 'openpose', 'strategy': 'spatial'}  # layout:'ntu-rgb+d'
        model = st_gcn.Model(in_channels=2, num_class=num_classes, graph_args=graph_args,
                             edge_importance_weighting=True, dropout=0.5)
    else:
        raise ValueError("Unsupported architecture: please select HCN | ST_GCN")

    checkpoints = torch.load(model_path)
    model.load_state_dict(checkpoints["model_state_dict"]) # load trained model
    model.cuda()
    # model = nn.DataParallel(model)

    n_examples = 0

    # Iterate over data.
    avg_acc = []
    pred_labels_per_video = [[] for i in range(len(test_dataset.video_list))]
    logits_per_video = [[] for i in range(len(test_dataset.video_list))]

    for test_batchind, data in enumerate(test_dataloader):
        model.train(False)
        # get the inputs
        inputs, labels, vid_idx, frame_pad = data

        # wrap them in Variable
        inputs = Variable(inputs.cuda(), requires_grad=True)
        labels = Variable(labels.cuda())

        t = inputs.size(2)
        logits = model(inputs)
        logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear', align_corners=True)
        # logits = F.interpolate(logits, t, mode='linear', align_corners=True)  # b x classes x frames

        acc = i3d_utils.accuracy_v2(torch.argmax(logits, dim=1), torch.argmax(labels, dim=1))

        avg_acc.append(acc.item())
        n_examples += batch_size
        print('batch Acc: {}, [{} / {}]'.format(acc.item(), test_batchind, len(test_dataloader)))
        logits = logits.permute(0, 2, 1)  # [ batch, classes, frames] -> [ batch, frames, classes]
        logits = logits.reshape(inputs.shape[0] * frames_per_clip, -1)
        pred_labels = torch.argmax(logits, 1).detach().cpu().numpy().tolist()
        logits = torch.nn.functional.softmax(logits, dim=1).detach().cpu().numpy().tolist()

        pred_labels_per_video, logits_per_video = \
            utils.accume_per_video_predictions(vid_idx, frame_pad, pred_labels_per_video, logits_per_video,
                                               pred_labels, logits, frames_per_clip)

    pred_labels_per_video = [np.array(pred_video_labels) for pred_video_labels in pred_labels_per_video]
    logits_per_video = [np.array(pred_video_logits) for pred_video_logits in logits_per_video]

    np.save(pred_output_filename, {'pred_labels': pred_labels_per_video,
                                   'logits': logits_per_video})
    utils.convert_frame_logits_to_segment_json(logits_per_video, json_output_filename, test_dataset.video_list,
                                               test_dataset.action_list)
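# i3d_utils.accuracy_v2 is not defined in this snippet. Judging from its use above
# (accuracy_v2(torch.argmax(logits, dim=1), torch.argmax(labels, dim=1)) followed by .item()),
# a plausible minimal version simply returns the fraction of matching positions as a tensor:
def accuracy_v2(pred, target):
    # pred and target are integer class tensors of the same shape
    return pred.eq(target).float().mean()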
def predict_events(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length,
                                                                      clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)

    print(all_predict_classes)
    # rough chunk aggregation
    cabin_frames = os.listdir(cabin_video_path)
    cabin_frame_length  = len(cabin_frames)
    cabin_indices = np.arange(start=0, stop=cabin_frame_length - clip_stride + 1, step=clip_stride)
    indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) for idx in cabin_indices]
#     remainder = cabin_frame_length % clip_stride
#     if remainder != 0:
#         indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length)))
#     print(len(indices_in_shorter_clips))
#     print(len(indices_in_cabin_clips))
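    # For every shorter window (clip_stride frames) take the majority vote of the longer clips
    # that overlap it. The branches below assume each window is covered by up to 4 clips,
    # i.e. clip_length == 4 * clip_stride; windows near the boundaries fall back to fewer votes.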
    shorter_clip_predict_classes = [] 
    for i in range(len(indices_in_shorter_clips)):
        if i == 0:
            shorter_clip_predict_classes.append(all_predict_classes[0])
        elif i == 1:
            l = [all_predict_classes[0], all_predict_classes[1]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == 2:
            l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 3:
#             l = [all_predict_classes[i], all_predict_classes[i+1], all_predict_classes[i+2]]
#             shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 2:
#             l = [all_predict_classes[i], all_predict_classes[i+1]]
#             shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 1:
#             shorter_clip_predict_classes.append(all_predict_classes[i])
        elif i < len(indices_in_cabin_clips):
            l = [all_predict_classes[j] for j in range(i-3, i+1)]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips):
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 1:
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 2:
            index = len(indices_in_cabin_clips) - 1
            shorter_clip_predict_classes.append(all_predict_classes[index])
     
    print(shorter_clip_predict_classes)
    rough_clip_groups = defaultdict(list)
    for i in range(len(shorter_clip_predict_classes)):
        if shorter_clip_predict_classes[i] != 0:
            rough_clip_groups[shorter_clip_predict_classes[i]].append(i)
    print(rough_clip_groups)
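    # Split each class's window indices into separate event segments: a gap of 4 or more
    # windows between consecutive detections starts a new segment.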
    all_refined_clip_groups = dict()
    for key in rough_clip_groups.keys():
        clip_group = rough_clip_groups[key]
        refined_groups = []
        
        previous = 0
        i = 0
        while i < len(clip_group) - 1:
            if clip_group[i+1] - clip_group[i] >= 4:
                refined_groups.append(clip_group[previous:(i+1)])
                previous = i+1
            i += 1
        
        refined_groups.append(clip_group[previous:])
        all_refined_clip_groups[key] = refined_groups
    print(all_refined_clip_groups)
#     all_classes = all_clip_frame_groups.keys()
    keys = list(all_refined_clip_groups)
    if len(keys) == 2:
        k1 = keys[0]
        k2 = keys[1]
        groups1 = all_refined_clip_groups[k1]
        groups2 = all_refined_clip_groups[k2]

        i = 0
        j = 0
        while i < len(groups1):
            while j < len(groups2):
                min_index1 = min(groups1[i])
                max_index1 = max(groups1[i])
                min_index2 = min(groups2[j])
                max_index2 = max(groups2[j])
                set1 = set(range(min_index1, max_index1+1))
                set2 = set(range(min_index2, max_index2+1))
                if set1.issubset(set2):
                    groups1.remove(groups1[i])
                    break
                elif set2.issubset(set1):
                    groups2.remove(groups2[j])
                else:
                    intersec = set1.intersection(set2)
                    for item in intersec:
                        set1.discard(item)
                        set2.discard(item)
                    groups1[i] = list(set1)
                    groups2[j] = list(set2)
                    if max_index1 > max_index2:
                        j += 1
                    else:
                        i += 1
                        break
            if j == len(groups2):
                break
       
        final_all_clip_groups = {
            k1:groups1,
            k2:groups2
        }
    else:
        final_all_clip_groups = all_refined_clip_groups
    print(final_all_clip_groups)
    all_clip_frame_groups = {} 
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index]))
            # report 1-based frame numbers for the merged window indices
            start_frame = min(clip_frame_group) + 1
            end_frame = max(clip_frame_group) + 1
            clip_frame_groups.append([start_frame, end_frame])
        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
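# The two batching passes inside predict_events (the full batches and the remainder) duplicate the
# same load/transform/stack logic. A small helper along these lines is one possible refactor
# (a sketch only; it reuses load_rgb_frames and the clip_transforms defined in that function):
def build_clip_batch(video_path, clips, indices, clip_transforms, device):
    # load each selected clip, apply the test-time transforms, and stack into one batch tensor
    frames = [clip_transforms(load_rgb_frames(video_path, clips[j])) for j in indices]
    return torch.stack(frames).to(device)
# e.g. cabin_batch = build_clip_batch(cabin_video_path, cabin_clips,
#                                     range(i * batch_size, (i + 1) * batch_size), clip_transforms, device)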
Example #30
0
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='../../SSBD/ssbd_clip_segment/data/',
        train_split='../../SSBD/Annotations/annotations_charades.json',
        batch_size=1,
        save_model=''):
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # dataloaders = {'train': dataloader}
    # datasets = {'train': dataset}

    # setup the model

    xdc = torch.hub.load('HumamAlwassel/XDC',
                         'xdc_video_encoder',
                         pretraining='r2plus1d_18_xdc_ig65m_kinetics',
                         num_classes=3)
    # if mode == 'flow':
    #     i3d = InceptionI3d(400, in_channels=2)
    #     i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    # else:
    #     i3d = InceptionI3d(400, in_channels=3)
    #     i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    # i3d.replace_logits(8)
    # #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    # i3d.cuda()
    # i3d = nn.DataParallel(i3d)
    xdc.cuda()
    xdc = nn.DataParallel(xdc).cuda()

    for name, param in xdc.named_parameters():
        if 'fc' not in name and '4.1' not in name:
            param.requires_grad = False
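    # After this loop only the parameters whose names contain 'fc' or '4.1' stay trainable,
    # presumably the final classifier and the last residual block of the R(2+1)D-18 backbone;
    # everything else in the pretrained XDC encoder is frozen for fine-tuning.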

    lr = init_lr
    optimizer = optim.SGD(xdc.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    best_val = 0
    # new_flag = 0
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # new_state_dict = OrderedDict()
        # state_dict = torch.load(save_model+'.pt')
        # for k, v in state_dict.items():
        #     name = "module."+k # add module.
        #     new_state_dict[name] = v
        # xdc.load_state_dict(new_state_dict)
        # new_flag = 0
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                xdc.train(True)
            else:
                xdc.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            # tot_loc_loss = 0.0
            # tot_cls_loss = 0.0
            num_iter = 0
            total = 0
            n = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = xdc(inputs)
                # print(per_frame_logits.shape)
                # print(labels.shape)
                # upsample to input size
                # per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                # loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # tot_loc_loss += loc_loss.data.item()

                # compute classification loss (with max-pooling along time B x C x T)
                # cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                # print(torch.max(per_frame_logits, dim=2)[0])
                # print(torch.max(labels, dim=2)[0])
                correct = per_frame_logits.argmax(1).eq(labels.argmax(1))
                total += correct.float().sum().item()
                n += batch_size
                # tot_cls_loss += cls_loss.data.item()

                loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels) / num_steps_per_update
                tot_loss += loss.data.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                            phase, tot_loss / 10, total / n))
                        # save model
                        # if(steps % 10000 == 0):
                        # torch.save(xdc.module.state_dict(), save_model+str(steps).zfill(6)+'.pt')
                        # tot_loss = tot_loc_loss = tot_cls_loss = 0.
                        tot_loss = 0
                        total = 0
                        n = 0
            if phase == 'val':
                print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                    phase, (tot_loss * num_steps_per_update) / num_iter,
                    total / n))
                if (total / n > best_val):
                    best_val = total / n
                    torch.save(xdc.module.state_dict(), save_model + '.pt')