def __init__(self, config, attention=False):
        super().__init__()

        # abaghel added:
        self.flag = False

        # Obj-level image embedding layer
        self.visn_fc = VisualFeatEncoder(config)

        # Number of layers
        self.num_l_layers = VISUAL_CONFIG.l_layers
        self.num_x_layers = VISUAL_CONFIG.x_layers
        self.num_r_layers = VISUAL_CONFIG.r_layers
        print("LXRT encoder with %d l_layers, %d x_layers, and %d r_layers." %
              (self.num_l_layers, self.num_x_layers, self.num_r_layers))

        # Layers
        # Using self.layer instead of self.l_layer to support loading BERT weights.
        self.layer = nn.ModuleList(
            [BertLayer(config) for _ in range(self.num_l_layers)])
        self.x_layers = nn.ModuleList(
            [LXRTXLayer(config) for _ in range(self.num_x_layers)])

        # SWAP: Include for running our Model
        self.r_layers = model_builder.build_model(cfg, attention)
Пример #2
0
def build_and_switch_demo_model(cfg):
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    return model
Пример #3
0
    def __init__(self, cfg, backbone):
        if SlowFast.__instance != None:
            raise Exception('This class is a singleton!')
        else:
            SlowFast.__instance = self

        self.cfg = cfg
        self.backbone = backbone
        print(self.cfg.MODEL.ARCH)
        self.model = model_builder.build_model(self.cfg)
        self.model.eval()
        misc.log_model_info(self.model)

        self.checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH

        cu.load_checkpoint(
            self.checkpoint,
            self.model,
            self.cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2="caffe2"
            in [self.cfg.TEST.CHECKPOINT_TYPE, self.cfg.TRAIN.CHECKPOINT_TYPE],
        )

        if self.backbone == 'yolo':
            self.object_predictor = YOLO.get_instance(
                self.cfg.DEMO.YOLO_LIB, self.cfg.DEMO.YOLO_CFG,
                self.cfg.DEMO.YOLO_META, self.cfg.DEMO.YOLO_CLASS,
                self.cfg.DEMO.YOLO_WEIGHTS)
        else:
            dtron2_cfg_file = self.cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
            dtron2_cfg = get_cfg()
            dtron2_cfg.merge_from_file(
                model_zoo.get_config_file(dtron2_cfg_file))
            dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
            dtron2_cfg.MODEL.WEIGHTS = self.cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
            self.object_predictor = DefaultPredictor(dtron2_cfg)

        with open(self.cfg.DEMO.LABEL_FILE_PATH) as f:
            self.labels = f.read().split('\n')[:-1]
        self.palette = np.random.randint(64, 128,
                                         (len(self.labels), 3)).tolist()

        self.images_queue = queue.Queue()
        self.write_queue = queue.Queue()

        self.cap = cv2.VideoCapture(self.cfg.DEMO.DATA_SOURCE)
        self.seq_len = self.cfg.DATA.NUM_FRAMES * self.cfg.DATA.SAMPLING_RATE
Пример #4
0
def build_model(cfg):
    """
    Build slowfast model
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        print("Testing with random initialization. Only for debugging.")

    return model
Пример #5
0
def test(cfg):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Test with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        logger.info("Testing with random initialization. Only for debugging.")

    # Create video testing loaders.
    test_loader = loader.construct_loader(cfg, "test")
    logger.info("Testing model for {} iterations".format(len(test_loader)))

    if cfg.DETECTION.ENABLE:
        assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE
        test_meter = AVAMeter(len(test_loader), cfg, mode="test")
    else:
        assert (
            len(test_loader.dataset)
            % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
            == 0
        )
        # Create meters for multi-view testing.
        test_meter = TestMeter(
            len(test_loader.dataset)
            // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
            cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
            cfg.MODEL.NUM_CLASSES,
            len(test_loader),
        )

    # # Perform multi-view test on the entire dataset.
    perform_test(test_loader, model, test_meter, cfg)
Пример #6
0
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                              cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(train_loader, model, optimizer, train_meter, cur_epoch,
                    cfg)

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(train_loader, model,
                                            cfg.BN.NUM_BATCHES_PRECISE)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch,
                               cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg)
def demo(cfg):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split('\n')[:-1]
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
    else:
        # Load the labels of Kinectics-400 dataset
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df['name'].values

    frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.
    for able_to_read, frame in frame_provider:
        if not able_to_read:
            # when reaches the end frame, clear the buffer and continue to the next one.
            frames = []
            continue

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                mid_frame = frame

        if len(frames) == seq_len:
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                scores = fields["scores"][selection_mask]
                boxes = cv2_transform.scale_boxes(
                    cfg.DATA.TEST_CROP_SIZE, pred_boxes,
                    frame_provider.display_height,
                    frame_provider.display_width)
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1)

            inputs = torch.as_tensor(frames).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1,
                                   cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            # logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))

            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2] //
                                   cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            # logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
            """
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            """
            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                #   use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            if cfg.DETECTION.ENABLE:
                # This post processing was intendedly assigned to the cpu since my laptop GPU
                #   RTX 2080 runs out of its memory, if your GPU is more powerful, I'd recommend
                #   to change this section to make CUDA does the processing.
                preds = preds.cpu().detach().numpy()
                pred_masks = preds > .1
                label_ids = [
                    np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                ]
                pred_labels = [[
                    labels[label_id] for label_id in perbox_label_ids
                ] for perbox_label_ids in label_ids]
                # I'm unsure how to detectron2 rescales boxes to image original size, so I use
                #   input boxes of slowfast and rescale back it instead, it's safer and even if boxes
                #   was not rescaled by cv2_transform.rescale_boxes, it still works.
                boxes = boxes.cpu().detach().numpy()
                ratio = np.min([
                    frame_provider.display_height, frame_provider.display_width
                ]) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio
            else:
                ## Option 1: single label inference selected from the highest probability entry.
                # label_id = preds.argmax(-1).cpu()
                # pred_label = labels[label_id]
                # Option 2: multi-label inferencing selected from probability entries > threshold
                label_ids = torch.nonzero(
                    preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ['Unknown']

            # # option 1: remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            # option 2: empty the buffer
            frames = []
            s = time() - start

        if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
            for box, box_labels in zip(boxes.astype(int), pred_labels):
                cv2.rectangle(frame,
                              tuple(box[:2]),
                              tuple(box[2:]), (0, 255, 0),
                              thickness=2)
                label_origin = box[:2]
                for label in box_labels:
                    label_origin[-1] -= 5
                    (label_width, label_height), _ = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                    cv2.rectangle(frame,
                                  (label_origin[0], label_origin[1] + 5),
                                  (label_origin[0] + label_width,
                                   label_origin[1] - label_height - 5),
                                  palette[labels.index(label)], -1)
                    cv2.putText(frame, label, tuple(label_origin),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255),
                                1)
                    label_origin[-1] -= label_height + 5
        if not cfg.DETECTION.ENABLE:
            # Display predicted labels to frame.
            y_offset = 50
            cv2.putText(frame,
                        'Action:', (10, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=.65,
                        color=(0, 235, 0),
                        thickness=2)
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(frame,
                            '{}'.format(pred_label), (20, y_offset),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=.65,
                            color=(0, 235, 0),
                            thickness=2)

        # Display prediction speed
        cv2.putText(frame,
                    'Speed: {:.2f}s'.format(s), (10, 25),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=.65,
                    color=(0, 235, 0),
                    thickness=2)
        # Display the frame
        cv2.imshow('SlowFast', frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    frame_provider.clean()
Пример #8
0
def demo(cfg, backbone):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

   # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2= "caffe2" in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )
    
    darknetlib_path = '/home/ubuntu/hanhbd/SlowFast/detector/libdarknet.so'
    config_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.cfg'
    meta_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.data'
    classes_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.names'
    weight_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.weights'
    
    if backbone == 'yolo':
        object_predictor =  YOLO.get_instance(darknetlib_path, config_path, meta_path, classes_path, weight_path)
    else:
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)

    with open(cfg.DEMO.LABEL_FILE_PATH) as f:
        labels = f.read().split('\n')[:-1]
    palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
    count_xxx = 0
    seq_len = cfg.DATA.NUM_FRAMES*cfg.DATA.SAMPLING_RATE

    frames = []
    org_frames = []
    mid_frame = None
    pred_labels = []
    draw_imgs = []

    cap  = cv2.VideoCapture(cfg.DEMO.DATA_SOURCE)
    was_read, frame = cap.read()
    display_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    display_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'DIVX')
    videowriter = cv2.VideoWriter('./result/testset_fighting_05.avi',fourcc, fps, (display_width,display_height))

    while was_read :
        was_read, frame = cap.read()
        if not was_read:
            videowriter.release()
            break

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            org_frames.append(frame)

        else:
            #predict all person box in all frame 
            start = time()
            mid_frame = org_frames[seq_len//2 - 2]
            # just draw half of number frame because we will use slide = 1/2 length of sequence
            if cfg.DETECTION.ENABLE and len(draw_imgs) == 0:
                for idx in range(seq_len//2 - 1):
                    image = org_frames[idx]
                    boxes = detector(object_predictor , image, backbone, cfg , display_height, display_width )
                    # boxes = object_predictor.detect_image(img)
                    # boxes = torch.as_tensor(boxes).float().cuda()

                    boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)
                    boxes = boxes.cpu().detach().numpy()
                    if backbone == 'yolo':
                        boxes = boxes[:, 1:]
                    else:
                        ratio = np.min(
                            [display_height, display_width]
                        ) / cfg.DATA.TEST_CROP_SIZE
                        boxes = boxes[:, 1:] * ratio

                    for box in boxes:
                        xmin, ymin, xmax, ymax = box
                        cv2.rectangle(image, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)

                    draw_imgs.append(image)

            # detect box in mid frame
            if cfg.DETECTION.ENABLE:
                boxes = detector(object_predictor , mid_frame, backbone, cfg , display_height, display_width )
                boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)

            inputs = torch.from_numpy(np.array(frames)).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            

            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1, 
                                    fast_pathway.shape[2]//cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            inputs = [slow_pathway, fast_pathway]

            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # use a dummy variable to disable all computations below.
            if not len(boxes):
                preds = torch.tensor([])
            else:
                preds = model(inputs, boxes)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]
                
            # post processing
            preds = preds.cpu().detach().numpy()
            pred_masks = preds > .1
            label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
            pred_labels = [
                [labels[label_id] for label_id in perbox_label_ids]
                for perbox_label_ids in label_ids
            ]
            print(pred_labels)
            boxes = boxes.cpu().detach().numpy()
            if backbone == 'yolo':
                boxes = boxes[:, 1:]
            else:
                ratio = np.min(
                    [display_height, display_width]
                ) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio


            # draw result on mid frame
            if pred_labels and boxes.any():
                for box, box_labels in zip(boxes.astype(int), pred_labels):
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(mid_frame, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)
                
                    label_origin = box[:2]
                    for label in box_labels:
                        label_origin[-1] -= 5
                        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                        cv2.rectangle(
                            mid_frame, 
                            (label_origin[0], label_origin[1] + 5), 
                            (label_origin[0] + label_width, label_origin[1] - label_height - 5),
                            palette[labels.index(label)], -1
                        )
                        cv2.putText(
                            mid_frame, label, tuple(label_origin), 
                            cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
                        )
                        label_origin[-1] -= label_height + 5

            # append mid frame to the draw array
            draw_imgs.append(mid_frame)

            # write image to videos
            for img_ in draw_imgs:
                videowriter.write(img_)
            print("time process", (time() - start) /64 )
            # clean the buffer of frames and org_frames with slide 1/2 seq_len
            # frames = frames[seq_len//2 - 1:]
            # org_frames = org_frames[seq_len//2 - 1:]

            frames = frames[1:]
            org_frames = org_frames[1:]
            draw_imgs = draw_imgs[-1:]

            count_xxx += 1
Пример #9
0
def demo(cfg):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    # Load the labels of Kinectics-400 dataset
    labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
    labels = labels_df['name'].values
    img_provider = VideoReader(cfg)
    frames = []
    # # Option 1
    # pred_label = ''
    # Option 2
    pred_labels = []
    s = 0.
    for able_to_read, frame in img_provider:
        if not able_to_read:
            # when reaches the end frame, clear the buffer and continue to the next one.
            frames = []
            continue

        if len(frames) != cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(256, frame_processed)
            frames.append(frame_processed)

        if len(frames) == cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE:
            start = time()
            # Perform color normalization.
            inputs = torch.tensor(frames).float()
            inputs = inputs / 255.0
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)
            # 1 C T H W.
            inputs = inputs[None, :, :, :, :]
            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1,
                                   cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))
            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2] //
                                   cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # Perform the forward pass.
            preds = model(inputs)
            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            ## Option 1: single label inference selected from the highest probability entry.
            # label_id = preds.argmax(-1).cpu()
            # pred_label = labels[label_id]
            # Option 2: multi-label inferencing selected from probability entries > threshold
            label_ids = torch.nonzero(
                preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
            pred_labels = labels[label_ids]
            logger.info(pred_labels)
            if not list(pred_labels):
                pred_labels = ['Unknown']

            # remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            frames = []
            s = time() - start

        # #************************************************************
        # # Option 1
        # #************************************************************
        # # Display prediction speed to frame
        # cv2.putText(frame, 'Speed: {:.2f}s'.format(s), (20, 30),
        #             fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        #             fontScale=1, color=(0, 235, 0), thickness=3)
        # # Display predicted label to frame.
        # cv2.putText(frame, 'Action: {}'.format(pred_label), (20, 60),
        #             fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        #             fontScale=1, color=(0, 235, 0), thickness=3)
        #************************************************************
        # Option 2
        #************************************************************
        # Display prediction speed to frame
        cv2.putText(frame,
                    'Speed: {:.2f}s'.format(s), (20, 30),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1,
                    color=(0, 235, 0),
                    thickness=3)
        # Display predicted labels to frame.
        y_offset = 60
        cv2.putText(frame,
                    'Action:', (20, y_offset),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1,
                    color=(0, 235, 0),
                    thickness=3)
        for pred_label in pred_labels:
            y_offset += 30
            cv2.putText(frame,
                        '{}'.format(pred_label), (20, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=1,
                        color=(0, 235, 0),
                        thickness=3)

        # Display the frame
        cv2.imshow('SlowFast', frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    img_provider.clean()
Пример #10
0
def feature_extract(kwargs):
    """
    Perform multi-view testing on the pretrained video model.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    cfg = kwargs.pop('cfg')
    path_to_video_list = kwargs.pop('path_to_video_list')
    path_to_video_dir = kwargs.pop('path_to_video_dir')
    path_to_feat_dir = kwargs.pop('path_to_feat_dir')
    num_features = kwargs.pop('num_features')

    if not os.path.isdir(path_to_feat_dir):
        os.makedirs(path_to_feat_dir)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Extract feature with config:")
    logger.info(cfg)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg, feature_extraction=True)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        cu.load_checkpoint(
            cfg.TEST.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TEST.CHECKPOINT_TYPE == "caffe2",
        )
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        cu.load_checkpoint(last_checkpoint, model, cfg.NUM_GPUS > 1)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            None,
            inflation=False,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
    else:
        # raise NotImplementedError("Unknown way to load checkpoint.")
        logger.info("Extracting features with random initialization. Only for debugging.")

    feature_extract_fn = perform_bbox_feature_extract if cfg.DETECTION.ENABLE else perform_feature_extract

    # Create video feature extraction loaders.
    if osp.isfile(path_to_video_list):
        path_to_videos = []
        with open(path_to_video_list, 'r') as f:
            video_list = json.load(f)

            for video_name in video_list:
                path_to_video = osp.join(path_to_video_dir, video_name)
                if osp.isfile(path_to_video):
                    path_to_videos.append(path_to_video)

    else:
        path_to_videos = glob.glob(osp.join(path_to_video_dir, '*'))

    for video_idx, path_to_video in enumerate(tqdm(path_to_videos)):
        path_to_feature = osp.join(path_to_feat_dir, osp.splitext(osp.basename(path_to_video))[0] + '.json')
        if osp.isfile(path_to_feature) and json.load(open(path_to_feature))['num_features'] == num_features:
            continue

        video_extraction_loader = loader.construct_loader(cfg, path_to_video)

        video_data = {
            'num_features': 0,
            'video_features': {}
        }

        if len(video_extraction_loader) > 0:
            video_features = feature_extract_fn(video_extraction_loader, model, cfg)
            assert all([feature is not None for feature in video_features]), 'Missing some features !'
            video_data['num_features'] = len(video_features)
            video_data['video_features'] = video_features

        if video_data['num_features'] != num_features:
            print('Warning! Video %s has %d features.' % (path_to_video, video_data['num_features']))

        with open(path_to_feature, 'w') as f:
            json.dump(video_data, f)
Пример #11
0
def main():
    """
    Main function to spawn the train and test process.
    """
    args = parse_args()
    cfg = load_config(args)
    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    cfg.NUM_GPUS = 1  # fix single gpu to demo
    model = model_builder.build_model(cfg)
    cu.load_checkpoint(last_checkpoint, model, False)

    path = args.path
    cat = {1: 'walking', 2: 'standing', 3: 'rising', 4: 'lying', 5: 'falling'}
    model.eval().cuda()
    with torch.no_grad():
        for inputs in load(path, cfg.DATA.TEST_CROP_SIZE, False, cfg.DATA.MEAN,
                           cfg.DATA.STD, cfg.SLOWFAST.ALPHA):
            if isinstance(inputs, (list, tuple)) and isinstance(
                    inputs[0], list) and torch.is_tensor(inputs[0][0]):
                data, frame = inputs
                startTime = time.time()
                for i in range(len(data)):
                    data[i] = data[i].cuda(non_blocking=True)

                results = model(data)
                endTime = time.time() - startTime
                print(endTime)
                scores = results[0].get_field('scores')
                index = scores > 0.3
                results = results[0][index]
                bbox = results.bbox.int()
                scores = results.get_field("scores").tolist()
                labels = results.get_field("labels").tolist()
                labels = [cat[i] for i in labels]
                # bbox = results[0].bbox.int()
                # print(data[0].shape)
                # print(data[1].shape)
                # frame = data[1][0, :, -1].permute(1, 2, 0).cpu().numpy()
                # print(frame.shape)
                template = "{}: {:.2f}"

                if bbox.shape[0] > 0:
                    for box, score, label in zip(bbox, scores, labels):
                        x, y = box[:2]
                        s = template.format(label, score)
                        top_left, bottom_right = box[:2].tolist(
                        ), box[2:].tolist()
                        frame = cv2.rectangle(frame, tuple(top_left),
                                              tuple(bottom_right), (255, 0, 0),
                                              2)
                        frame = cv2.putText(frame, s, (x, y),
                                            cv2.FONT_HERSHEY_SIMPLEX, .5,
                                            (255, 255, 255), 1)
                    cv2.imshow('show', frame)
                else:
                    cv2.imshow('show', frame)

                cv2.waitKey(5)
                # print(results[0].bbox)
                # print(results[0].get_field('scores'))
            else:
                break