Exemplo n.º 1
0
def test_bulk_read_write(store):
    """Round-trip 100 generated entries through ``bulk_write``/``bulk_read``.

    Checks the full read-back, the keys reported by ``store.list()``, and a
    selective read restricted to the first half of the entries.
    """
    entries = dict(make_entry() for _ in range(100))

    bulk_write(store, entries)

    # Everything written must come back, and the store must list the same keys.
    assert bulk_read(store) == entries
    assert set(store.list()) == set(entries.keys())

    # Selective read: only the first half of the entries (insertion order).
    half_count = int(len(entries) / 2)
    first_half = dict(islice(entries.items(), 0, half_count))
    assert bulk_read(store, select=first_half.keys()) == first_half
def Convert(config):
    """Compute signatures for stored video-level features and push them to the DB.

    Reads every video-level representation from the storage rooted at
    ``config.repr.directory``, runs ``SimilarityModel.predict`` on them and,
    when ``config.database.use`` is set, saves the signatures through
    ``DBResultStorage`` and then removes the corresponding ``.npy`` files.

    Args:
        config: Resolved configuration object; ``repr.directory``,
            ``database.use`` and ``database.uri`` are read.
    """
    storage = ReprStorage(os.path.join(config.repr.directory))
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    video_count = len(video_level)
    print("Prepare to update database! vid_num :" +
          str(video_count))

    # Nothing to process.
    if video_count == 0:
        return

    # Get {ReprKey => signature} dict
    signatures = model.predict(video_level)

    if not config.database.use:
        return

    # Convert dict to list of (path, sha256, url, signature) tuples
    entries = []
    for key, sig in signatures.items():
        entries.append((key.path, key.hash, key.url, sig))

    # Connect to database
    database = Database(uri=config.database.uri)
    database.create_tables()

    try:
        # Save signatures
        result_storage = DBResultStorage(database)
        result_storage.add_signatures(entries)
        # Once written to the database, remove the source files.
        # NOTE(review): the directory below is hard-coded rather than derived
        # from config.repr.directory -- confirm this is intended.
        for key, _ in signatures.items():
            remove_file("/project/data/representations/video_level/" +
                        key.path + ".npy")
    except Exception as e:
        # NOTE(review): any failure (DB write or file removal) is reported
        # with the same message and then swallowed.
        print("save db ERROR!")
        print(e)
Exemplo n.º 3
0
def test_intermediate_cnn_extractor(intermediate_cnn_results, repr_keys):
    """Frame-level storage lists exactly ``repr_keys`` and every stored
    feature array has 4096 as its second dimension."""
    assert set(intermediate_cnn_results.frame_level.list()) == set(repr_keys)

    stored_features = bulk_read(intermediate_cnn_results.frame_level)

    # Count the arrays whose second dimension is 4096.
    correct_count = 0
    for features in list(stored_features.values()):
        correct_count += features.shape[1] == 4096

    assert correct_count == len(repr_keys)
Exemplo n.º 4
0
def signatures(frame_to_video_results):
    """Compute signatures from video-level features and persist them.

    Tests depending on this fixture are guaranteed to run AFTER the
    signatures have been calculated.

    Returns:
        Signatures dict (orig_path,hash) => signature.
    """
    storage = frame_to_video_results
    model = SimilarityModel()
    video_level_features = bulk_read(storage.video_level)
    computed = model.predict(video_level_features)

    # Write each signature to the signature storage under its repr key.
    for key, value in computed.items():
        storage.signature.write(key, value)

    return computed
Exemplo n.º 5
0
def test_saved_signatures(reprs, repr_keys):
    """Stored signatures cover exactly ``repr_keys`` and stack into an array
    of shape ``(NUMBER_OF_TEST_VIDEOS, 500)``."""
    stored = bulk_read(reprs.signature)
    assert set(stored.keys()) == set(repr_keys)

    stacked = np.array(list(stored.values()))
    expected_shape = (NUMBER_OF_TEST_VIDEOS, 500)
    assert stacked.shape == expected_shape
Exemplo n.º 6
0
def test_frame_to_video_converter(frame_to_video_results, repr_keys):
    """Video-level storage lists exactly ``repr_keys`` and its features stack
    into an array of shape ``(len(repr_keys), 1, 4096)``."""
    assert set(frame_to_video_results.video_level.list()) == set(repr_keys)

    stored = bulk_read(frame_to_video_results.video_level)
    stacked = np.array(list(stored.values()))

    expected_shape = (len(repr_keys), 1, 4096)
    assert stacked.shape == expected_shape
def main(config, list_of_files, frame_sampling, save_frames):
    """Run extraction end to end: frame features, video features, signatures.

    Args:
        config: Passed to ``resolve_config`` as ``config_path``.
        list_of_files: When its length is 0, videos are discovered with
            ``scan_videos`` under ``config.sources.root``; otherwise it is
            handed to ``scan_videos_from_txt``.
        frame_sampling: Forwarded to ``resolve_config``.
        save_frames: Forwarded to ``resolve_config``.

    Raises:
        AssertionError: If no video-level representations are available
            after the conversion step.
    """
    config = resolve_config(
        config_path=config,
        frame_sampling=frame_sampling,
        save_frames=save_frames,
    )

    reps = ReprStorage(os.path.join(config.repr.directory))
    reprkey = reprkey_resolver(config)

    print('Searching for Dataset Video Files')

    if len(list_of_files) != 0:
        videos = scan_videos_from_txt(list_of_files,
                                      extensions=config.sources.extensions)
    else:
        videos = scan_videos(config.sources.root,
                             '**',
                             extensions=config.sources.extensions)

    print('Number of files found: {}'.format(len(videos)))

    # Keep only videos that have no frame-level representation yet.
    pending_paths = []
    for video_path in videos:
        if not reps.frame_level.exists(reprkey(video_path)):
            pending_paths.append(video_path)

    print('There are {} videos left'.format(len(pending_paths)))

    video_list_file = create_video_list(pending_paths,
                                        config.proc.video_list_filename)

    print('Processed video List saved on :{}'.format(video_list_file))

    if pending_paths:
        # Build the extractor and extract frame-level features.
        model_path = default_model_path(
            config.proc.pretrained_model_local_path)
        frame_extractor = IntermediateCnnExtractor(
            video_src=video_list_file,
            reprs=reps,
            reprkey=reprkey,
            frame_sampling=config.proc.frame_sampling,
            save_frames=config.proc.save_frames,
            model=(load_featurizer(model_path)))
        frame_extractor.start(batch_size=16, cores=4)

    print('Converting Frame by Frame representations to Video Representations')

    FrameToVideoRepresentation(reps).start()

    print('Extracting Signatures from Video representations')

    similarity_model = SimilarityModel()

    video_level = bulk_read(reps.video_level)

    assert len(video_level) > 0, 'No Signatures left to be processed'

    # Get {ReprKey => signature} dict
    signatures = similarity_model.predict(video_level)

    print('Saving Video Signatures on :{}'.format(reps.signature.directory))

    if config.database.use:
        # Convert dict to list of (path, sha256, url, signature) tuples
        entries = []
        for key, sig in signatures.items():
            entries.append((key.path, key.hash, key.url, sig))

        # Connect to database
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Save signatures
        DBResultStorage(database).add_signatures(entries)

    if config.save_files:
        bulk_write(reps.signature, signatures)
def main(config):
    """Match videos by signature distance and write the resulting reports.

    Steps:
      1. Resolve the config and load stored signatures; when none are
         stored, compute them from the video-level features.
      2. Run a nearest-neighbour search (at most 20 neighbours) and filter
         the result with ``config.proc.match_distance``.
      3. Build the match table, drop self matches and duplicated pairs, and
         always save it as the unfiltered CSV report.
      4. When ``config.proc.filter_dark_videos`` is set, estimate brightness
         per video, flag videos below ``config.proc.filter_dark_videos_thr``
         and drop every match that involves a flagged video.
      5. When ``config.database.use`` is set, store metadata (if computed)
         and matches in the database.
      6. When ``config.save_files`` is set and filtering ran, save the
         metadata and filtered-matches CSV reports.

    Args:
        config: Passed to ``resolve_config`` as ``config_path``.

    Raises:
        AssertionError: If there are neither stored signatures nor
            video-level features to compute them from.
    """
    print('Loading config file')
    config = resolve_config(config_path=config)
    reps = ReprStorage(config.repr.directory)

    # All report locations are defined up front so that later steps never
    # depend on whether the optional filtering branch ran.
    REPORT_PATH = os.path.join(config.repr.directory,
                               f'matches_at_{config.proc.match_distance}_distance.csv')
    FILTERED_REPORT_PATH = os.path.join(config.repr.directory,
                                        f'matches_at_{config.proc.match_distance}_distance_filtered.csv')
    METADATA_REPORT_PATH = os.path.join(config.repr.directory, 'metadata_signatures.csv')

    # Get mapping (path,hash) => sig.
    print('Extracting Video Signatures')
    signature_iterator = bulk_read(reps.signature)

    if len(signature_iterator) == 0:
        # No stored signatures: compute them from video-level features.
        vid_level_iterator = bulk_read(reps.video_level)
        assert len(vid_level_iterator) > 0, "No video_level features were found"
        sm = SimilarityModel()
        # Reuse the features already loaded instead of reading them twice.
        signatures_dict = sm.predict(vid_level_iterator)
        # Unpack repr keys and signatures as separate sequences
        repr_keys, video_signatures = zip(*signatures_dict.items())
    else:
        repr_keys, video_signatures = zip(*signature_iterator.items())

    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    t0 = time.time()
    # Handles small tests for which number of videos <  number of neighbors
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors, metric='euclidean', algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches '.format(time.time() - t0))
    results, results_distances = filter_results(config.proc.match_distance, distances, indices)

    # Largest match groups first.
    ranked = sorted(zip(results, results_distances), key=lambda x: len(x[0]), reverse=True)

    q = []
    m = []
    distance = []

    print('Generating Report')
    for group, group_distances in ranked:
        for j, match in enumerate(group):
            # The first entry of each group is used as the query for the
            # whole group.
            if j == 0:
                query = match
            q.append(query)
            m.append(match)
            distance.append(group_distances[j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']
    # Remove self matches; copy so the column assignment below writes to an
    # independent frame rather than to a slice of the original.
    match_df = match_df.loc[~match_df['self_match'], :].copy()
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (eg if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    print('Saving unfiltered report to {}'.format(REPORT_PATH))

    match_df.to_csv(REPORT_PATH)

    # NOTE: scene detection is currently disabled; the previous implementation
    # is kept below for reference.
#    if config.proc.detect_scenes:
#
#        frame_features_dict = bulk_read(reps.frame_level, select=None)
#        assert len(frame_features_dict) > 0, 'No Frame Level features were found.'
#        scenes = extract_scenes(frame_features_dict)
#        scene_metadata = pd.DataFrame(asdict(scenes))
#
#        if config.database.use:
#            # Connect to database
#            database = Database(uri=config.database.uri)
#            database.create_tables()
#
#            # Save scenes
#            result_storage = DBResultStorage(database)
#            result_storage.add_scenes(zip(scenes.video_filename, scenes.video_sha256, scenes.scene_duration_seconds))
#
#        if config.save_files:
#
#            SCENE_METADATA_OUTPUT_PATH = os.path.join(config.repr.directory, 'scene_metadata.csv')
#            scene_metadata.to_csv(SCENE_METADATA_OUTPUT_PATH)
#            print('Scene Metadata saved in:'.format(SCENE_METADATA_OUTPUT_PATH))

    # Stays None unless the optional filtering step below runs; the database
    # and file-saving steps check for this.
    metadata_df = None

    if config.proc.filter_dark_videos:

        print('Filtering dark and/or short videos')

        # Keys of every video that has a video-level representation
        video_keys = list(set(reps.video_level.list()))
        video_paths = [key.path for key in video_keys]
        video_hashes = [key.hash for key in video_keys]

        print('Extracting additional information from video files')
        brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(video_keys)])
        print(brightness_estimation.shape)
        metadata_df = pd.DataFrame({"fn": video_paths,
                                    "sha256": video_hashes,
                                    "gray_max": brightness_estimation.reshape(brightness_estimation.shape[0])})

        # Flag videos to be discarded
        metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr

        print('Videos discarded because of darkness:{}'.format(metadata_df['video_dark_flag'].sum()))

        metadata_df['flagged'] = metadata_df['video_dark_flag']

        # Set of (path, hash) pairs identifying the discarded videos
        flagged_rows = metadata_df.loc[metadata_df['flagged'], ['fn', 'sha256']]
        discarded_videos = set(zip(flagged_rows['fn'], flagged_rows['sha256']))

        # Boolean masks built by set membership: avoids a row-wise
        # DataFrame.apply and also works when match_df has no rows.
        query_discarded = np.array(
            [pair in discarded_videos
             for pair in zip(match_df['query_video'], match_df['query_sha256'])],
            dtype=bool)
        match_discarded = np.array(
            [pair in discarded_videos
             for pair in zip(match_df['match_video'], match_df['match_sha256'])],
            dtype=bool)
        discard_msk = query_discarded | match_discarded

        match_df = match_df.loc[~discard_msk, :]

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        result_storage = DBResultStorage(database)

        # Save metadata (only available when the filtering step ran)
        if metadata_df is not None:
            # Copy so that adding the 'metadata' column does not write to a
            # slice of metadata_df.
            metadata_entries = metadata_df[['fn', 'sha256']].copy()
            metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
            result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = ['query_video', 'query_sha256', 'match_video', 'match_sha256', 'distance']

        result_storage.add_matches(match_df[match_columns].to_numpy())

    # The metadata and filtered reports only exist when filtering ran; the
    # unfiltered report has already been written above.
    if config.save_files and metadata_df is not None:

        print('Saving metadata to {}'.format(METADATA_REPORT_PATH))
        metadata_df.to_csv(METADATA_REPORT_PATH)
        print('Saving Filtered Matches report to {}'.format(FILTERED_REPORT_PATH))
        match_df.to_csv(FILTERED_REPORT_PATH)