def calculate_confusion_matrix():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes
    weight = args.weight
    print('Start to load data')
    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    test_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                test_data_path, test_transforms)
    print('Total number of test samples is {0}'.format(len(test_dataset)))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)
    model = TAL_Net(num_classes)
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)
    model.cuda()
    model.eval()

    print('Start to calculate confusion matrix')
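    # run inference over the whole test set and collect per-sample class predictions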
    all_predicts = []
    all_labels = []
    for i, (cabin_imgs, face_imgs, labels, start_labels,
            end_labels) in enumerate(test_dataloader):
        cabin_imgs = cabin_imgs.cuda()
        face_imgs = face_imgs.cuda()
        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_imgs, face_imgs)
            class_preds = torch.argmax(class_scores, dim=1)
            class_preds = class_preds.cpu().numpy()
            labels = labels.numpy()
            all_predicts.append(class_preds)
            all_labels.append(labels)
    all_predicts = np.concatenate(all_predicts)
    all_labels = np.concatenate(all_labels)
    cf_matrix = confusion_matrix(all_labels, all_predicts)
    normalized_confusion_matrix = confusion_matrix(all_labels,
                                                   all_predicts,
                                                   normalize='true')
    return cf_matrix, normalized_confusion_matrix
Example #2
def predict():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes
    # assumed to come from the argument parser, as in train(); used by TemporalActionLocalization below
    pretrained_I3D_model = args.pretrained_I3D_model

    print('Start to load data')
    test_transforms = transforms.Compose(
        [videotransforms.CenterCrop(224),
         videotransforms.ToTensor()])
    test_dataset = IVBSSDataset(face_video_dir, cabin_video_dir,
                                test_data_path, test_transforms)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)

    model = TemporalActionLocalization(num_classes, pretrained_I3D_model)
    print('Load checkpoint')
    model = load_ckp(args.ckp_path, model)

    model.cuda()
    model.eval()

    print('Start to test')
    test_loss = 0.0
    test_steps = 0
    for i, (face_imgs, cabin_imgs, labels) in enumerate(test_dataloader):
        face_imgs = face_imgs.cuda()
        cabin_imgs = cabin_imgs.cuda()
        for k, v in labels.items():
            labels[k] = v.cuda()
        with torch.no_grad():
            loss = model(face_imgs, cabin_imgs, labels)
        test_loss += loss.item()
        test_steps += 1
    avg_test_loss = test_loss / test_steps
    return avg_test_loss
Example #3
def predict_events(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length,
                                                                      clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
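    # process the clips in full batches; leftover clips are handled after this loop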
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
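        # run the remaining clips that do not fill a complete batch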
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)
    all_start_scores = np.concatenate(all_start_scores)
    all_end_scores = np.concatenate(all_end_scores)
    
    print(all_predict_classes)
    # refined chunk aggregation
    cabin_frames = os.listdir(cabin_video_path)
    cabin_frame_length  = len(cabin_frames)
    cabin_indices = np.arange(start=0, stop=cabin_frame_length - clip_stride + 1, step=clip_stride)
    indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) for idx in cabin_indices]
#     remainder = cabin_frame_length % clip_stride
#     if remainder != 0:
#         indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length)))
    print(len(indices_in_shorter_clips))
    print(len(indices_in_cabin_clips))
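    # smooth predictions: each stride-length segment takes the majority class of the (up to four) longer clips that cover it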
    shorter_clip_predict_classes = []
    for i in range(len(indices_in_shorter_clips)):
        if i == 0:
            shorter_clip_predict_classes.append(all_predict_classes[0])
        elif i == 1:
            l = [all_predict_classes[0], all_predict_classes[1]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == 2:
            l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i < len(indices_in_cabin_clips):
            l = [all_predict_classes[j] for j in range(i-3, i+1)]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips):
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 1:
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 2:
            index = len(indices_in_cabin_clips) - 1
            shorter_clip_predict_classes.append(all_predict_classes[index])
    print(shorter_clip_predict_classes)
    
    # extract start and end peaks
    start_peak_indices = []
    end_peak_indices = []
    if all_start_scores[0] > all_start_scores[1]:
        start_peak_indices.append(0)
    for i in range(1, len(all_start_scores) - 1):
        if all_start_scores[i] > all_start_scores[i - 1]:
            if all_start_scores[i] > all_start_scores[i + 1]:
                start_peak_indices.append(i)
        if all_end_scores[i] > all_end_scores[i - 1]:
            if all_end_scores[i] > all_end_scores[i + 1]:
                end_peak_indices.append(i)
    if all_end_scores[-1] > all_end_scores[-2]:
        end_peak_indices.append(len(cabin_clips) - 1)
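    # merge start peaks (and, below, end peaks) that lie within 4 clips of each other, keeping the higher-scoring one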

    j = 0
    copy_start_peak_indices = start_peak_indices.copy()
    while j < len(start_peak_indices) - 1:
        index1 = copy_start_peak_indices[j]
        index2 = copy_start_peak_indices[j + 1]
        if index1 + 4 < index2:
            j += 1
        else:
            if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j + 1]]:
                copy_start_peak_indices[j] = index2
                copy_start_peak_indices.pop(j + 1)
                start_peak_indices.pop(j + 1)

            else:
                copy_start_peak_indices.pop(j)
                start_peak_indices.pop(j)

    k = 0
    copy_end_peak_indices = end_peak_indices.copy()
    while k < len(end_peak_indices) - 1:
        index1 = copy_end_peak_indices[k]
        index2 = copy_end_peak_indices[k + 1]
        if index1 + 4 < index2:
            k += 1
        else:
            if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k + 1]]:
                copy_end_peak_indices[k] = index2
                copy_end_peak_indices.pop(k + 1)
                end_peak_indices.pop(k + 1)
            else:
                copy_end_peak_indices.pop(k)
                end_peak_indices.pop(k)

    selected_starts = []
    selected_ends = []
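    # keep only peaks whose scores exceed the threshold; selected end indices are shifted forward by 3 positions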
    for start_indice in start_peak_indices:
        if all_start_scores[start_indice] > threshold:
            selected_starts.append(start_indice)
    for end_indice in end_peak_indices:
        if all_end_scores[end_indice] > threshold:
            selected_ends.append(end_indice+3)
    print(selected_starts)
    print(selected_ends)
    
       
    rough_clip_groups = defaultdict(list)
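    # group shorter-clip indices by their predicted (non-background) class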
    for i in range(len(shorter_clip_predict_classes)):
        if shorter_clip_predict_classes[i] != 0:
            rough_clip_groups[shorter_clip_predict_classes[i]].append(i)
    print(rough_clip_groups)
    
#     all_refined_clip_groups = dict()
#     for key in rough_clip_groups.keys():
#         clip_group = rough_clip_groups[key]
#         refined_groups = []
        
#         previous = 0
#         i = 0
#         while i < len(clip_group) - 1:
#             if clip_group[i] in selected_starts:
#                 previous = i
#             elif clip_group[i] in selected_ends:
#                 refined_groups.append(clip_group[previous:(index+1)])
#                 j = i + 1
#                 while j < len(clip_group) - 1:
#                     if clip_group[j] - clip_group[j-1] == 1:
#                         j += 1
#                     else:
#                         previous = j 
#                         i = j
#                         break
#             elif clip_group[i] + 2 < clip_group[i+1]:
#                 refined_groups.append(clip_group[previous:(i+1)])
#                 previous = i+1
#             i += 1
#             print(previous, i)
#         if previous < len(clip_group) - 1:
#             refined_groups.append(clip_group[previous:])
#             all_refined_clip_groups[key] = refined_groups
#     print(all_refined_clip_groups)
    
    all_refined_clip_groups = dict()
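    # split each class group wherever consecutive clip indices are more than 2 apart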
    for key in rough_clip_groups.keys():
        clip_group = rough_clip_groups[key]
        refined_groups = []
        
        previous = 0
        i = 0
        while i < len(clip_group) - 1:
            if clip_group[i] + 2 < clip_group[i+1]:
                refined_groups.append(clip_group[previous:(i+1)])
                previous = i+1
            i += 1
        
        refined_groups.append(clip_group[previous:])
        all_refined_clip_groups[key] = refined_groups
    print(all_refined_clip_groups)

    
    keys = list(all_refined_clip_groups)
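    # when exactly two classes are present, drop any group fully contained in a group of the other class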
    if len(keys) == 2:
        k1 = keys[0]
        k2 = keys[1]
        groups1 = all_refined_clip_groups[k1]
        groups2 = all_refined_clip_groups[k2]

        i = 0
        j = 0
        while i < len(groups1):
            while j < len(groups2):
                min_index1 = min(groups1[i])
                max_index1 = max(groups1[i])
                min_index2 = min(groups2[j])
                max_index2 = max(groups2[j])
                set1 = set(range(min_index1, max_index1+1))
                set2 = set(range(min_index2, max_index2+1))
                if set1.issubset(set2) == True:
                    groups1.remove(groups1[i])
                    if i >= len(groups1):
                        break
                elif set2.issubset(set1) == True:
                    groups2.remove(groups2[j])
                else:
                    if max_index1 > max_index2:
                        j += 1
                    else:
                        break
            i += 1
        filtered_all_clip_groups = {
            k1:groups1,
            k2:groups2
        }
    else:
        filtered_all_clip_groups = all_refined_clip_groups
    print(filtered_all_clip_groups)
    
    # add start and end information
    final_all_clip_groups = {}
    for key in filtered_all_clip_groups.keys():
        clip_groups = filtered_all_clip_groups[key]
        all_clip_groups = []
        for clip_group in clip_groups: 
            if len(clip_group) > 6:
                start_clip = min(clip_group)
                end_clip = max(clip_group)
                for selected_start in selected_starts:
                    if selected_start > start_clip and selected_start < start_clip + 3:
                        start_clip = selected_start
                for selected_end in selected_ends:
                    if selected_end > end_clip - 3 and selected_end < end_clip:
                        end_clip = selected_end
                clip_group = list(range(start_clip, end_clip+1))
            all_clip_groups.append(clip_group)
        final_all_clip_groups[key] = all_clip_groups
    
    all_clip_frame_groups = {} 
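    # map each clip-index group back to a 1-based [start_frame, end_frame] range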
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index]))
            start_frame = min(clip_frame_group) + 1
            end_frame = max(clip_frame_group) + 1
            clip_frame_groups.append([start_frame, end_frame])

        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
def predict_events(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length,
                                                                      clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)

    print(all_predict_classes)
    # rough chunk aggregation
    cabin_frames = os.listdir(cabin_video_path)
    cabin_frame_length  = len(cabin_frames)
    cabin_indices = np.arange(start=0, stop=cabin_frame_length - clip_stride + 1, step=clip_stride)
    indices_in_shorter_clips = [list(range(idx, idx + clip_stride)) for idx in cabin_indices]
#     remainder = cabin_frame_length % clip_stride
#     if remainder != 0:
#         indices_in_shorter_clips.append(list(range(cabin_frame_length-remainder, cabin_frame_length)))
#     print(len(indices_in_shorter_clips))
#     print(len(indices_in_cabin_clips))
    shorter_clip_predict_classes = [] 
    for i in range(len(indices_in_shorter_clips)):
        if i == 0:
            shorter_clip_predict_classes.append(all_predict_classes[0])
        elif i == 1:
            l = [all_predict_classes[0], all_predict_classes[1]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == 2:
            l = [all_predict_classes[0], all_predict_classes[1], all_predict_classes[2]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 3:
#             l = [all_predict_classes[i], all_predict_classes[i+1], all_predict_classes[i+2]]
#             shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 2:
#             l = [all_predict_classes[i], all_predict_classes[i+1]]
#             shorter_clip_predict_classes.append(max(set(l), key = l.count))
#         elif i == len(indices_in_shorter_clips) - 1:
#             shorter_clip_predict_classes.append(all_predict_classes[i])
        elif i < len(indices_in_cabin_clips):
            l = [all_predict_classes[j] for j in range(i-3, i+1)]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips):
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-2], all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 1:
            index = len(indices_in_cabin_clips) - 1
            l = [all_predict_classes[index-1], all_predict_classes[index]]
            shorter_clip_predict_classes.append(max(set(l), key = l.count))
        elif i == len(indices_in_cabin_clips) + 2:
            index = len(indices_in_cabin_clips) - 1
            shorter_clip_predict_classes.append(all_predict_classes[index])
     
    print(shorter_clip_predict_classes)
    rough_clip_groups = defaultdict(list)
    for i in range(len(shorter_clip_predict_classes)):
        if shorter_clip_predict_classes[i] != 0:
            rough_clip_groups[shorter_clip_predict_classes[i]].append(i)
    print(rough_clip_groups)
    all_refined_clip_groups = dict()
    for key in rough_clip_groups.keys():
        clip_group = rough_clip_groups[key]
        refined_groups = []
        
        previous = 0
        i = 0
        while i < len(clip_group) - 1:
            if clip_group[i+1] - clip_group[i] >= 4:
                refined_groups.append(clip_group[previous:(i+1)])
                previous = i+1
            i += 1
        
        refined_groups.append(clip_group[previous:])
        all_refined_clip_groups[key] = refined_groups
    print(all_refined_clip_groups)
#     all_classes = all_clip_frame_groups.keys()
    keys = list(all_refined_clip_groups)
    if len(keys) == 2:
        k1 = keys[0]
        k2 = keys[1]
        groups1 = all_refined_clip_groups[k1]
        groups2 = all_refined_clip_groups[k2]

        i = 0
        j = 0
        while i < len(groups1):
            while j < len(groups2):
                min_index1 = min(groups1[i])
                max_index1 = max(groups1[i])
                min_index2 = min(groups2[j])
                max_index2 = max(groups2[j])
                set1 = set(range(min_index1, max_index1+1))
                set2 = set(range(min_index2, max_index2+1))
                if set1.issubset(set2) == True:
                    groups1.remove(groups1[i])
                    break
                elif set2.issubset(set1) == True:
                    groups2.remove(groups2[j])
                else:
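                    # partial overlap: remove the shared clip indices from both groups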
                    intersec = set1.intersection(set2)
                    for item in intersec:
                        set1.discard(item)
                        set2.discard(item)
                    groups1[i] = list(set1)
                    groups2[j] = list(set2)
                    if max_index1 > max_index2:
                        j += 1
                    else:
                        i += 1
                        break
            if j == len(groups2):
                break
       
        final_all_clip_groups = {
            k1:groups1,
            k2:groups2
        }
    else:
        final_all_clip_groups = all_refined_clip_groups
    print(final_all_clip_groups)
    all_clip_frame_groups = {} 
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(set(indices_in_shorter_clips[index]))
            start_frame = min(clip_frame_group) + 1
            end_frame = max(clip_frame_group) + 1
            clip_frame_groups.append([start_frame, end_frame])
        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
def predict_video(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
        
    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(cabin_video_path, face_video_path, clip_length, clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([videotransforms.CenterCrop(224),
                                          videotransforms.ToTensor(),
                                          videotransforms.ClipNormalize()
                                          ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)
    all_start_scores = np.concatenate(all_start_scores)
    all_end_scores = np.concatenate(all_end_scores)
#     print(all_start_scores)
#     print(all_end_scores)
#     start_peak_indices = []
#     end_peak_indices = []
    
#     if all_start_scores[0] > all_start_scores[1]:
#         start_peak_indices.append(0)
#     for i in range(1, len(cabin_clips) - 1):
#         if all_start_scores[i] > all_start_scores[i - 1]:
#             if all_start_scores[i] > all_start_scores[i + 1]:
#                 start_peak_indices.append(i)
#         if all_end_scores[i] > all_end_scores[i - 1]:
#             if all_end_scores[i] > all_end_scores[i + 1]:
#                 end_peak_indices.append(i)
#     if all_end_scores[-1] > all_end_scores[-2]:
#         end_peak_indices.append(len(cabin_clips) - 1)

#     j = 0
#     copy_start_peak_indices = start_peak_indices.copy()
#     while j < len(start_peak_indices) - 1:
#         index1 = copy_start_peak_indices[j]
#         index2 = copy_start_peak_indices[j + 1]
#         if index1 + 4 < index2:
#             j += 1
#         else:
#             if all_start_scores[start_peak_indices[j]] > all_start_scores[start_peak_indices[j+1]]:
#                 copy_start_peak_indices[j] = index2
#                 copy_start_peak_indices.pop(j + 1)
#                 start_peak_indices.pop(j + 1)

#             else:
#                 copy_start_peak_indices.pop(j)
#                 start_peak_indices.pop(j)

#     k = 0
#     copy_end_peak_indices = end_peak_indices.copy()
#     while k < len(end_peak_indices) - 1:
#         index1 = copy_end_peak_indices[k]
#         index2 = copy_end_peak_indices[k + 1]
#         if index1 + 4 < index2:
#             k += 1
#         else:
#             if all_end_scores[end_peak_indices[k]] > all_end_scores[end_peak_indices[k+1]]:
#                 copy_end_peak_indices[k] = index2
#                 copy_end_peak_indices.pop(k + 1)
#                 end_peak_indices.pop(k + 1)
#             else:
#                 copy_end_peak_indices.pop(k)
#                 end_peak_indices.pop(k)
                
    selected_starts = []
    selected_ends = []
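    # unlike predict_events, keep every clip whose boundary score exceeds the threshold (no peak merging here)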
    for i in range(len(all_start_scores)):
        if all_start_scores[i] > threshold:
            selected_starts.append(i)
    for j in range(len(all_end_scores)):
        if all_end_scores[j] > threshold:
            selected_ends.append(j)        
    return selected_starts, selected_ends, all_start_scores, indices_in_cabin_clips
def train():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    train_data_path = args.train_data_path
    val_data_path = args.val_data_path
    train_batch_size = args.train_batch_size
    val_batch_size = args.val_batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    weight_decay = args.weight_decay
    display_steps = args.display_steps
    ckp_dir = args.ckp_dir
    save_path = args.save_path
    num_classes = args.num_classes
    weight = args.weight

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    if not os.path.exists(ckp_dir):
        os.makedirs(ckp_dir)

    print('Start to load data')
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    val_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    train_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                 train_data_path, train_transforms)
    val_dataset = IVBSSDataset(cabin_video_dir, face_video_dir, val_data_path,
                               val_transforms)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=RandomSampler(train_dataset,
                                                        replacement=True),
                                  collate_fn=collate_fn,
                                  drop_last=True)
    total_steps = num_epochs * len(train_dataloader)
    print('Total number of training samples is {0}'.format(len(train_dataset)))
    print('Total number of validation samples is {0}'.format(len(val_dataset)))
    print('Total number of training steps is {0}'.format(total_steps))

    model = TAL_Net(num_classes)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=weight_decay)
    #     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.1)
    start_epoch = 0

    if args.pretrained_I3D_model is not None:
        print('Load pretrained I3D model')
        pretrained_I3D_model = torch.load(args.pretrained_I3D_model)
        model.I3D_1.load_state_dict(pretrained_I3D_model)
        model.I3D_2.load_state_dict(pretrained_I3D_model)

    if args.ckp_path is not None:
        print('Load checkpoint')
        start_epoch, model, optimizer, scheduler = load_ckp(
            args.ckp_path, model, optimizer, scheduler)

    model.to(device)
    model.train()

    print('Start to train')
    num_step = 0
    best_acc = 0.0
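    # the model returns the total loss together with its classification and chunk-inclusion components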
    for epoch in range(start_epoch, num_epochs):
        running_loss = 0.0
        class_running_loss = 0.0
        chunk_inclusion_running_loss = 0.0
        for i, (cabin_imgs, face_imgs, labels, start_labels,
                end_labels) in enumerate(train_dataloader):
            cabin_imgs = cabin_imgs.to(device)
            face_imgs = face_imgs.to(device)
            labels = labels.to(device)
            start_labels = start_labels.to(device)
            end_labels = end_labels.to(device)
            optimizer.zero_grad()
            loss, class_loss, chunk_inclusion_loss = model(
                cabin_imgs, face_imgs, labels, start_labels, end_labels,
                weight)[:3]
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            class_running_loss += class_loss.item()
            chunk_inclusion_running_loss += chunk_inclusion_loss.item()
            if (i + 1) % display_steps == 0:
                print(
                    'epoch:{0}/{1}, step:{2}/{3}, loss:{4:.4f}, class_loss:{5:.4f}, chunk_inclusion_loss:{6:.4f}'
                    .format(epoch + 1, num_epochs, i + 1,
                            len(train_dataloader),
                            running_loss / display_steps,
                            class_running_loss / display_steps,
                            chunk_inclusion_running_loss / display_steps))
                running_loss = 0.0
                class_running_loss = 0.0
                chunk_inclusion_running_loss = 0.0
            num_step += 1
            writer.add_scalars(
                'Loss/train', {
                    'total_loss': loss,
                    'class_loss': class_loss,
                    'chunk_inclusion_loss': chunk_inclusion_loss
                }, num_step)

        scheduler.step()

        print('Start to validate')
        #         eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(train_dataset, train_batch_size, model, weight, device)
        eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(
            val_dataset, val_batch_size, model, weight, device)
        writer.add_scalars(
            'Loss/valid', {
                'total_loss': eval_loss,
                'class_loss': eval_class_loss,
                'chunk_inclusion_loss': eval_chunk_inclusion_loss
            }, epoch)
        writer.add_scalar('Accuracy/valid', class_accuracy, epoch)

        print(
            'Total loss on validation dataset: {0:.4f}, class loss on validation dataset: {1:.4f}, chunk inclusion loss on validation dataset: {2:.4f}, class accuracy on validation dataset: {3:.4f}'
            .format(eval_loss, eval_class_loss, eval_chunk_inclusion_loss,
                    class_accuracy))

        is_best = class_accuracy > best_acc
        best_acc = max(class_accuracy, best_acc)
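        # save a checkpoint every epoch; is_best flags the highest validation accuracy so far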

        checkpoint = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }

        ckp_name = 'epoch_' + str(epoch + 1) + '.pt'
        save_ckp(checkpoint, ckp_dir, ckp_name, is_best, save_path)
        print('Save the checkpoint after {} epochs'.format(epoch + 1))

    writer.close()
Example #7
def main():
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    args = get_parse()
    cabin_video_path = args.cabin_video_path
    face_video_path = args.face_video_path
    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(
        cabin_video_path, face_video_path, clip_length, clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)
    all_start_scores = np.concatenate(all_start_scores)
    all_end_scores = np.concatenate(all_end_scores)

    # extract start and end peaks (local maxima of the boundary scores)
    start_peak_indices = []
    end_peak_indices = []
    if all_start_scores[0] > all_start_scores[1]:
        start_peak_indices.append(0)
    for i in range(1, len(all_start_scores) - 1):
        if all_start_scores[i] > all_start_scores[i - 1]:
            if all_start_scores[i] > all_start_scores[i + 1]:
                start_peak_indices.append(i)
        if all_end_scores[i] > all_end_scores[i - 1]:
            if all_end_scores[i] > all_end_scores[i + 1]:
                end_peak_indices.append(i)
    if all_end_scores[-1] > all_end_scores[-2]:
        end_peak_indices.append(len(cabin_clips) - 1)

    j = 0
    copy_start_peak_indices = start_peak_indices.copy()
    while j < len(start_peak_indices) - 1:
        index1 = copy_start_peak_indices[j]
        index2 = copy_start_peak_indices[j + 1]
        if index1 + 4 < index2:
            j += 1
        else:
            if all_start_scores[start_peak_indices[j]] > all_start_scores[
                    start_peak_indices[j + 1]]:
                copy_start_peak_indices[j] = index2
                copy_start_peak_indices.pop(j + 1)
                start_peak_indices.pop(j + 1)

            else:
                copy_start_peak_indices.pop(j)
                start_peak_indices.pop(j)

    k = 0
    copy_end_peak_indices = end_peak_indices.copy()
    while k < len(end_peak_indices) - 1:
        index1 = copy_end_peak_indices[k]
        index2 = copy_end_peak_indices[k + 1]
        if index1 + 4 < index2:
            k += 1
        else:
            if all_end_scores[end_peak_indices[k]] > all_end_scores[
                    end_peak_indices[k + 1]]:
                copy_end_peak_indices[k] = index2
                copy_end_peak_indices.pop(k + 1)
                end_peak_indices.pop(k + 1)
            else:
                copy_end_peak_indices.pop(k)
                end_peak_indices.pop(k)

    selected_starts = []
    selected_ends = []
    for start_indice in start_peak_indices:
        if all_start_scores[start_indice] > threshold:
            selected_starts.append(start_indice)
    for end_indice in end_peak_indices:
        if all_end_scores[end_indice] > threshold:
            selected_ends.append(end_indice)
    print(selected_starts)
    print(selected_ends)
    selected_start_scores = []
    selected_end_scores = []
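    # collect the scores of the selected peaks so they can be highlighted in the plot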
    if selected_starts != []:
        for start in selected_starts:
            selected_start_scores.append(all_start_scores[start])
    if selected_ends != []:
        for end in selected_ends:
            selected_end_scores.append(all_end_scores[end])

    # plot
    all_clips = range(len(all_start_scores))
    fig = plt.figure()
    plt.plot(all_clips, all_start_scores, "b.-", label="start scores")
    plt.plot(all_clips, all_end_scores, "r.-", label="end scores")
    if selected_starts != []:
        plt.scatter(selected_starts,
                    selected_start_scores,
                    c='b',
                    marker='*',
                    linewidths=3,
                    label="selected clips including starts")
    if selected_ends != []:
        plt.scatter(selected_ends,
                    selected_end_scores,
                    c='r',
                    marker='*',
                    linewidths=3,
                    label="selected clips including ends")
    plt.legend(loc='upper right')
    plt.ylim(0, 1)
    plt.xlabel("Clip Index")
    plt.ylabel("predicted score")
    plt.show()
    cabin_video_name = os.path.basename(cabin_video_path)
    fig.savefig('figures/plot_{}.png'.format(cabin_video_name))
def predict_events(cabin_video_path, face_video_path, args):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    checkpoint = args.checkpoint
    clip_length = args.clip_length
    clip_stride = args.clip_stride
    batch_size = args.batch_size
    num_classes = args.num_classes
    threshold = args.threshold

    cabin_clips, face_clips, indices_in_cabin_clips = clip_generation(
        cabin_video_path, face_video_path, clip_length, clip_stride)
    model = TAL_Net(num_classes)
    ckp = torch.load(checkpoint)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    clip_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    all_clips = []
    all_predict_classes = []
    all_start_scores = []
    all_end_scores = []

    n = len(cabin_clips) // batch_size
    for i in range(n):
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for j in range(i * batch_size, (i + 1) * batch_size):
            cabin_clip = cabin_clips[j]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[j]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)
        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_video_frames_batch, face_video_frames_batch)

        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    if len(cabin_clips) % batch_size != 0:
        cabin_video_frames_batch = []
        face_video_frames_batch = []
        for k in range(n * batch_size, len(cabin_clips)):
            cabin_clip = cabin_clips[k]
            cabin_video_frames = load_rgb_frames(cabin_video_path, cabin_clip)
            cabin_video_frames = clip_transforms(cabin_video_frames)
            cabin_video_frames_batch.append(cabin_video_frames)
            face_clip = face_clips[k]
            face_video_frames = load_rgb_frames(face_video_path, face_clip)
            face_video_frames = clip_transforms(face_video_frames)
            face_video_frames_batch.append(face_video_frames)

        cabin_video_frames_batch = torch.stack(cabin_video_frames_batch)
        face_video_frames_batch = torch.stack(face_video_frames_batch)

        cabin_video_frames_batch = cabin_video_frames_batch.to(device)
        face_video_frames_batch = face_video_frames_batch.to(device)

        with torch.no_grad():
            class_scores, start_scores, end_scores = model(
                cabin_video_frames_batch, face_video_frames_batch)
        pred_classes = torch.argmax(class_scores, dim=1)
        pred_classes = pred_classes.cpu().numpy()
        start_scores = start_scores.cpu().numpy()
        end_scores = end_scores.cpu().numpy()

        all_predict_classes.append(pred_classes)
        all_start_scores.append(start_scores)
        all_end_scores.append(end_scores)

    all_predict_classes = np.concatenate(all_predict_classes)

    print(all_predict_classes)
    # rough chunk aggregation

    rough_clip_groups = defaultdict(list)
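    # this variant groups the sliding-window clips directly by predicted class, without majority-vote smoothing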
    for i in range(len(all_predict_classes)):
        if all_predict_classes[i] != 0:
            rough_clip_groups[all_predict_classes[i]].append(i)
    print(rough_clip_groups)
    all_refined_clip_groups = dict()
    for key in rough_clip_groups.keys():
        clip_group = rough_clip_groups[key]
        refined_groups = []

        previous = 0
        i = 0
        while i < len(clip_group) - 1:
            if clip_group[i] + 2 < clip_group[i + 1]:
                refined_groups.append(clip_group[previous:(i + 1)])
                previous = i + 1
            i += 1

        refined_groups.append(clip_group[previous:])
        all_refined_clip_groups[key] = refined_groups
    print(all_refined_clip_groups)
    #     all_classes = all_clip_frame_groups.keys()
    keys = list(all_refined_clip_groups)
    if len(keys) == 2:
        k1 = keys[0]
        k2 = keys[1]
        groups1 = all_refined_clip_groups[k1]
        groups2 = all_refined_clip_groups[k2]

        i = 0
        j = 0
        while i < len(groups1):
            while j < len(groups2):
                min_index1 = min(groups1[i])
                max_index1 = max(groups1[i])
                min_index2 = min(groups2[j])
                max_index2 = max(groups2[j])
                set1 = set(range(min_index1, max_index1 + 1))
                set2 = set(range(min_index2, max_index2 + 1))
                if set1.issubset(set2) == True:
                    groups1.remove(groups1[i])
                    if i >= len(groups1):
                        break
                elif set2.issubset(set1) == True:
                    groups2.remove(groups2[j])
                else:
                    if max_index1 > max_index2:
                        j += 1
                    else:
                        break
            i += 1
        final_all_clip_groups = {k1: groups1, k2: groups2}
    else:
        final_all_clip_groups = all_refined_clip_groups
    print(final_all_clip_groups)
    all_clip_frame_groups = {}
    for key in final_all_clip_groups.keys():
        final_groups = final_all_clip_groups[key]
        clip_frame_groups = []
        for group in final_groups:
            clip_frame_group = set()
            for index in group:
                clip_frame_group = clip_frame_group.union(
                    set(indices_in_cabin_clips[index]))
            start_frame = min(clip_frame_group) + 1
            end_frame = max(clip_frame_group) + 1
            clip_frame_groups.append([start_frame, end_frame])
        all_clip_frame_groups[key] = clip_frame_groups
    return all_clip_frame_groups
Example #9
def run(max_steps=64e3,
        mode='rgb',
        root='/ssd2/charades/Charades_v1_rgb',
        split='charades/charades.json',
        batch_size=1,
        load_model='',
        save_dir=''):
    # setup dataset
    # test_transforms = T.Compose([videotransforms.CenterCrop(224)])
    test_transforms = T.Compose([
        T.Resize(min_size=(240, ), max_size=320),
        T.ToTensor(),
        T.Normalize(mean=None, std=None, to_bgr255=False)
    ])

    dataset = Dataset(split,
                      'train',
                      root,
                      mode,
                      test_transforms,
                      save_dir=save_dir,
                      overlap=15)
    distributed = True
    shuffle = False
    images_per_batch = 4
    if distributed:
        sampler = DistributedSampler(dataset, shuffle=shuffle)
    elif shuffle:
        sampler = torch.utils.data.sampler.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.sampler.SequentialSampler(dataset)

    batch_sampler = torch.utils.data.sampler.BatchSampler(sampler,
                                                          images_per_batch,
                                                          drop_last=False)
    dataloader = DataLoader(
        dataset,
        # batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4,
        # pin_memory=True,
        batch_sampler=batch_sampler)

    dataloaders = {'train': dataloader}
    datasets = {'train': dataset}

    # val_dataset = Dataset(split,
    #                       'testing',
    #                       root,
    #                       mode,
    #                       test_transforms,
    #                       save_dir=save_dir)
    # val_dataloader = DataLoader(val_dataset,
    #                             batch_size=batch_size,
    #                             shuffle=True,
    #                             num_workers=8,
    #                             pin_memory=True)

    # dataloaders = {'train': dataloader, 'train': val_dataloader}
    # datasets = {'train': dataset, 'train': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(dataset.num_classes, in_channels=2)
    else:
        i3d = InceptionI3d(dataset.num_classes, in_channels=3)
    i3d.replace_logits(dataset.num_classes)
    load_state_dict(i3d, torch.load(load_model), ignored_prefix='logits')
    i3d.cuda()

    # for phase in ['train', 'train']:
    for phase in ['train']:
        i3d.eval()  # Set model to evaluate mode

        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0

        # Iterate over data.
        for data in tqdm(dataloaders[phase]):
            # get the inputs
            inputs, labels, name, start, end = data
            feature_save_dir = os.path.join(save_dir, name[0])
            if not os.path.exists(feature_save_dir):
                os.makedirs(feature_save_dir)

            b, c, t, h, w = inputs.shape
            if t > 1600:
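                # split very long videos (t > 1600 frames) into overlapping chunks before extracting features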
                features = []
                for start in range(1, t - 56, 1600):
                    end = min(t - 1, start + 1600 + 56)
                    start = max(1, start - 48)
                    ip = Variable(torch.from_numpy(
                        inputs.numpy()[:, :, start:end]).cuda(),
                                  volatile=True)
                    features.append(
                        i3d.extract_features(ip).squeeze(0).permute(
                            1, 2, 3, 0).data.cpu().numpy())
                np.save(os.path.join(save_dir, name[0]),
                        np.concatenate(features, axis=0))
            else:
                # wrap them in Variable
                inputs = Variable(inputs.cuda(), volatile=True)
                features = i3d.extract_features(inputs)
                for feature, s, e in zip(features, start, end):

                    np.save(
                        os.path.join(feature_save_dir,
                                     str(int(s)) + '_' + str(int(e)) + '.npy'),
                        feature.squeeze().data.cpu().numpy())
Example #10
def test():
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    test_data_path = args.test_data_path
    batch_size = args.batch_size
    num_classes = args.num_classes
    weight = args.weight

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print('Start to load data')

    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    test_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                test_data_path, test_transforms)
    print('Total number of test samples is {0}'.format(len(test_dataset)))
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 sampler=SequentialSampler(test_dataset),
                                 collate_fn=collate_fn)
    model = TAL_Net(num_classes)

    print('Load checkpoint')
    ckp = torch.load(args.ckp_path)
    model.load_state_dict(ckp['model'])
    model.to(device)
    model.eval()

    print('Start to test')
    test_loss = 0.0
    test_class_loss = 0.0
    test_chunk_inclusion_loss = 0.0
    class_accuracy = 0.0
    test_steps = 0
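    # accumulate losses and top-1 class accuracy over the whole test set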

    #     start_time = time.time()
    for i, (cabin_imgs, face_imgs, labels, start_labels,
            end_labels) in enumerate(test_dataloader):
        cabin_imgs = cabin_imgs.to(device)
        face_imgs = face_imgs.to(device)
        labels = labels.to(device)
        start_labels = start_labels.to(device)
        end_labels = end_labels.to(device)
        with torch.no_grad():
            loss, class_loss, chunk_inclusion_loss, class_scores, start_scores, end_scores = model(
                cabin_imgs, face_imgs, labels, start_labels, end_labels,
                weight)
        test_loss += loss.item()
        test_class_loss += class_loss.item()
        test_chunk_inclusion_loss += chunk_inclusion_loss.item()
        class_pred = torch.argmax(class_scores, dim=1)
        class_accuracy += torch.sum(
            (class_pred == labels).float()).item() / labels.shape[0]
        test_steps += 1

    avg_test_loss = test_loss / test_steps
    avg_test_class_loss = test_class_loss / test_steps
    avg_test_chunk_inclusion_loss = test_chunk_inclusion_loss / test_steps
    avg_class_accuracy = class_accuracy / test_steps

    #     end_time = time.time()
    #     total_time = end_time-start_time
    #     avg_time = total_time/(test_steps*batch_size)

    print(
        'avg_test_loss:{0}, avg_test_class_loss:{1}, avg_test_chunk_inclusion_loss:{2}, avg_class_accuracy:{3}'
        .format(avg_test_loss, avg_test_class_loss,
                avg_test_chunk_inclusion_loss, avg_class_accuracy))