def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    """Classify a video and localize its activities in time, printing results.

    The video is read at 112x112, cut into consecutive 16-frame clips, passed
    through the C3D feature network and then through the temporal
    localization network. The top-5 classification and every localized
    activity segment are printed to stdout; nothing is returned.

    Args:
        input_video: Video reference handed to ``video_to_array``,
            ``get_num_frames`` and ``get_duration``.
        smoothing_k: Forwarded as ``k`` to ``smoothing``.
        activity_threshold: Forwarded to ``activity_localization``.

    Raises:
        Exception: If ``video_to_array`` returns ``None``.
    """
    input_size = (112, 112)
    length = 16  # frames per clip

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Split the video into whole clips; frames that do not fill a complete
    # clip at the end are dropped.
    nb_clips = nb_frames // length
    # Swap the first two axes so frames come first and can be sliced/grouped
    # (the reshape below implies the array arrives as (3, frames, 112, 112)).
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    # Final layout: (clip, channel, frame, height, width).
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Load C3D model and mean
    print('Loading C3D network...')
    model = C3D_conv_features(True)
    model.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    # Collapse the stored mean over every axis except axis 1; keepdims keeps
    # it 5-D so it broadcasts against the clip batch.
    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Extract features
    print('Extracting features...')
    X = video_array - mean
    Y = model.predict(X, batch_size=1, verbose=1)

    # Load the temporal localization network
    print('Loading temporal localization network...')
    model_localization = temporal_localization_network(True)
    model_localization.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    # Predict with the temporal localization network
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    # Post-processing the predicted output
    print('Post-processing output...')
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold)

    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        # s and e are scaled by clip length / fps to get seconds
        # (presumably they are clip indices -- confirm in activity_localization).
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))
def run_all_pipeline(input_video, smoothing_k, activity_threshold):
    """Run C3D feature extraction and temporal localization on one video.

    Reads the video at 112x112, groups its frames into 16-frame clips,
    extracts C3D features per clip, feeds them to the temporal localization
    network and prints the top-5 classification followed by each localized
    activity with its start/end time. The function returns nothing.

    Args:
        input_video: Video reference handed to ``video_to_array``,
            ``get_num_frames`` and ``get_duration``.
        smoothing_k: Forwarded as ``k`` to ``smoothing``.
        activity_threshold: Forwarded to ``activity_localization``.

    Raises:
        Exception: If ``video_to_array`` returns ``None``.
    """
    input_size = (112, 112)
    length = 16  # frames per clip

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    video_array = video_to_array(input_video, resize=input_size)
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Only whole clips are kept; leftover trailing frames are discarded.
    nb_clips = nb_frames // length
    # Bring the frame axis to the front so it can be truncated and grouped.
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    # Reorder to (clip, channel, frame, height, width).
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Load C3D model and mean
    print('Loading C3D network...')
    model = C3D_conv_features(True)
    model.compile(optimizer='sgd', loss='mse')
    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
    # Average the stored mean over all axes but axis 1, keeping the reduced
    # axes so the result broadcasts when subtracted from the clips.
    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)

    # Extract features
    print('Extracting features...')
    X = video_array - mean
    Y = model.predict(X, batch_size=1, verbose=1)

    # Load the temporal localization network
    print('Loading temporal localization network...')
    model_localization = temporal_localization_network(True)

    # Predict with the temporal localization network
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)

    # Post-processing the predicted output
    print('Post-processing output...')
    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold)

    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        # Scale by frames-per-clip over fps to express the bounds in seconds.
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end,
                                                       label))
# Example no. 3
def run_runtime_tests(input_video, model_features, c3d_mean, model_localization):
    """Run the pipeline on a video with preloaded models and print stage timings.

    Same processing as the full pipeline (112x112 frames, 16-frame clips,
    feature extraction, temporal localization, post-processing), but the
    networks and the mean are supplied by the caller and the wall-clock time
    of each stage is printed. Post-processing uses a fixed ``smoothing_k`` of
    5 and an ``activity_threshold`` of 0.2. The function returns nothing.

    Args:
        input_video: Video reference handed to ``video_to_array``,
            ``get_num_frames`` and ``get_duration``.
        model_features: Object whose ``predict`` produces the clip features.
        c3d_mean: Value subtracted from the clip array before prediction.
        model_localization: Object whose ``predict`` produces the per-clip
            class scores.

    Raises:
        Exception: If ``video_to_array`` returns ``None``.
    """
    input_size = (112, 112)
    length = 16  # frames per clip

    # Setup post-processing variables
    smoothing_k = 5
    activity_threshold = .2

    # Load labels
    with open('dataset/labels.txt', 'r') as f:
        labels = import_labels(f)

    print('Reading Video...')
    t_s = time.time()
    video_array = video_to_array(input_video, resize=input_size)
    t_e = time.time()
    print('Loading Video: {:.2f}s'.format(t_e-t_s))
    if video_array is None:
        raise Exception('The video could not be read')
    nb_frames = get_num_frames(input_video)
    duration = get_duration(input_video)
    fps = nb_frames / duration
    print('Duration: {:.1f}s'.format(duration))
    print('FPS: {:.1f}'.format(fps))
    print('Number of frames: {}'.format(nb_frames))

    # Keep only whole clips; trailing frames that do not fill one are dropped.
    nb_clips = nb_frames // length
    # Move the frame axis first so it can be truncated and split into clips.
    video_array = video_array.transpose(1, 0, 2, 3)
    video_array = video_array[:nb_clips * length, :, :, :]
    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
    # Final layout: (clip, channel, frame, height, width).
    video_array = video_array.transpose(0, 2, 1, 3, 4)

    # Extract features
    print('Extracting features...')
    t_s = time.time()
    X = video_array - c3d_mean
    Y = model_features.predict(X, batch_size=1, verbose=1)
    t_e = time.time()
    print('Extracting C3D features: {:.2f}s'.format(t_e-t_s))

    # Predict with the temporal localization network
    t_s = time.time()
    Y = Y.reshape(nb_clips, 1, 4096)
    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
    prediction = prediction.reshape(nb_clips, 201)
    t_e = time.time()
    print('Prediction temporal activities: {:.2f}s'.format(t_e-t_s))

    # Post-processing the predicted output
    print('Post-processing output...')
    t_s = time.time()

    labels_idx, scores = get_classification(prediction, k=5)
    print('Video: {}\n'.format(input_video))
    for idx, score in zip(labels_idx, scores):
        label = labels[idx]
        print('{:.4f}\t{}'.format(score, label))

    prediction_smoothed = smoothing(prediction, k=smoothing_k)
    activities_idx, startings, endings, scores = activity_localization(
        prediction_smoothed, activity_threshold)
    # The timed region includes the classification printout above but not
    # the segment printout below.
    t_e = time.time()
    print('Post-processing runtime: {:.2f}s'.format(t_e-t_s))

    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
        # Scale by frames-per-clip over fps to express the bounds in seconds.
        start = s * float(length) / fps
        end = e * float(length) / fps
        label = labels[idx]
        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))