Python SimilarityModel示例，winnow.feature_extraction.SimilarityModel Python示例

示例#1

0

显示文件

文件： ray_extract_features.py 项目： chenhai1030/VideoDeduplication

def Convert(config):
    """Turn stored video-level representations into signatures and persist them.

    Reads every video-level representation found under
    ``config.repr.directory`` and runs ``SimilarityModel.predict`` on them.
    When ``config.database.use`` is truthy the signatures are written to the
    database at ``config.database.uri``; once written, the matching ``.npy``
    files under the hard-coded video-level directory are removed.

    Any exception raised while saving or removing is printed, not propagated.

    Args:
        config: Configuration object exposing ``repr.directory``,
            ``database.use`` and ``database.uri``.
    """
    storage = ReprStorage(os.path.join(config.repr.directory))
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    print("Prepare to update database! vid_num :" +
          str(len(video_level)))

    # Without any representation there is nothing to predict or store.
    if not len(video_level) > 0:
        return

    # Mapping of ReprKey => signature.
    signatures = model.predict(video_level)

    if not config.database.use:
        return

    # Flatten the mapping into (path, sha256, url, signature) tuples.
    entries = []
    for repr_key, signature in signatures.items():
        entries.append(
            (repr_key.path, repr_key.hash, repr_key.url, signature))

    # Open the database and make sure the schema exists.
    database = Database(uri=config.database.uri)
    database.create_tables()

    try:
        DBResultStorage(database).add_signatures(entries)
        # The representations are no longer needed once they are in the DB.
        for repr_key, _ in signatures.items():
            stale_file = ("/project/data/representations/video_level/" +
                          repr_key.path + ".npy")
            remove_file(stale_file)
    except Exception as error:
        print("save db ERROR!")
        print(error)

示例#2

0

显示文件

def main(path, output, config, save_frames, save_features, save_signatures,
         save_db):
    """
    Application to extract features from a single video file

    Loads the pretrained featurizer, extracts frame-level features from the
    video at ``path``, reduces them to a video-level representation and
    computes its signature with ``SimilarityModel``. Depending on the flags,
    intermediate and final results are saved as ``.npy`` files under
    ``output`` and/or written to the database.

    Args:
        path: Path of the video file to process.
        output: Directory in which the ``.npy`` artifacts are saved.
        config: Path to the YAML configuration file. It is passed to
            ``download_pretrained`` and, when ``save_db`` is set, parsed to
            obtain the ``conninfo`` entry.
        save_frames: When truthy, save the loaded video tensor.
        save_features: When truthy, save the extracted frame-level features.
        save_signatures: When truthy, save the computed signature.
        save_db: When truthy, register the file and its signature in the
            database.
    """
    print(save_db)

    PRETRAINED_LOCAL_PATH = download_pretrained(config)
    video_name = os.path.basename(path)

    model = load_featurizer(PRETRAINED_LOCAL_PATH)
    video_tensor = load_video(path, model.desired_size)
    # NOTE(review): the second argument (10) is presumably a batch size --
    # confirm against the featurizer's ``extract`` signature.
    features = model.extract(video_tensor, 10)

    video_level_repres = global_vector_from_tensor(features)
    sm = SimilarityModel()
    sm.build_features_single(video_level_repres, video_name)
    video_signatures = sm.predict()

    # Replace NaNs so downstream consumers only see finite numbers.
    video_signatures = np.nan_to_num(video_signatures)

    if save_frames:
        frame_path = os.path.join(
            output, '{}_{}_frames'.format(video_name, model.net_name))
        np.save(frame_path, video_tensor)

    if save_features:
        features_path = os.path.join(
            output, '{}_{}_features'.format(video_name, model.net_name))
        np.save(features_path, features)

    if save_signatures:
        signatures_path = os.path.join(
            output, '{}_{}_signature'.format(video_name, model.net_name))
        np.save(signatures_path, video_signatures)

    if save_db:
        # ``yaml.load`` without an explicit Loader raises TypeError on
        # PyYAML >= 6 and may construct arbitrary objects on older versions;
        # ``safe_load`` is sufficient for a plain configuration mapping.
        with open(config, 'r') as ymlfile:
            cfg = yaml.safe_load(ymlfile)

        CONNINFO = cfg['conninfo']

        db_engine, session = create_engine_session(CONNINFO)
        create_tables(db_engine)
        #TODO Currently we have an automated incremental index set for the Signatures table (we might want to change it in the future so we don't add duplicated signatures)
        processed_paths = [os.path.relpath(path)]
        file_entries = add_files(session, processed_paths)

        # Extract ids from records in order to save signatures with the proper information
        processed_to_id = {x.file_path: x.id for x in file_entries}
        file_ids = [processed_to_id[x] for x in processed_paths]
        add_signatures(session, video_signatures, file_ids)

示例#3

0

显示文件

文件： general_tests.py 项目： giselilla/VideoDeduplication

def signatures(frame_to_video_results):
    """Compute signatures for all video-level representations and store them.

    The docstring of the original states that this is used as a test
    fixture: tests depending on it run only after signatures have been
    calculated and written to the representation storage.

    Args:
        frame_to_video_results: Representation storage exposing
            ``video_level`` (read) and ``signature`` (written).

    Returns:
        The dict produced by ``SimilarityModel.predict``, mapping each
        representation key to its signature.
    """
    storage = frame_to_video_results
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    computed = model.predict(video_level)

    # Persist each signature under its representation key.
    for key in computed:
        storage.signature.write(key, computed[key])

    return computed

示例#4

0

显示文件

文件： utils.py 项目： giselilla/VideoDeduplication

def get_frame_sampling_permutations(frame_samplings, frame_level_files):
    """Compute signatures for several frame-sampling rates.

    Every file in ``frame_level_files`` is loaded with ``np.load`` and
    sub-sampled by taking every ``frame_sampling``-th entry along the first
    axis. For each sampling rate, the sub-sampled arrays are reduced with
    ``global_vector``, stacked and passed to
    ``SimilarityModel.predict_from_features``.

    Args:
        frame_samplings: Iterable of step sizes used to slice the loaded data.
        frame_level_files: Iterable of paths loadable by ``np.load``.

    Returns:
        A ``defaultdict(list)`` mapping each sampling rate to a one-element
        list holding the prediction for that rate.
    """
    sampled_by_rate = defaultdict(list)
    for file_path in frame_level_files:
        frame_data = np.load(file_path)
        for step in frame_samplings:
            sampled_by_rate[step].append(frame_data[::step])

    model = SimilarityModel()

    signatures = defaultdict(list)
    for step, subsets in sampled_by_rate.items():
        stacked = np.array([global_vector(subset) for subset in subsets])
        # The stacked array is indexed as 3-D; the middle axis is dropped so
        # the model receives one row per file.
        n_rows = stacked.shape[0]
        n_cols = stacked.shape[2]
        flattened = stacked.reshape(n_rows, n_cols)
        signatures[step].append(model.predict_from_features(flattened))

    return signatures

示例#5

0

显示文件

文件： network_vis.py 项目： stepan-anokhin/VideoDeduplication

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors, KDTree, BallTree, LSHForest, NearestCentroid
from pyvis.network import Network
from winnow.feature_extraction import SimilarityModel
import yaml

# Script body: load the configuration, compute video signatures for every
# representation in the configured folder and derive a label for each one.
print('Loading config file')

# ``yaml.load`` without an explicit Loader raises TypeError on PyYAML >= 6
# and may construct arbitrary objects on older versions; ``safe_load`` is
# sufficient for a plain configuration mapping.
with open("config.yaml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

DISTANCE = float(cfg['match_distance'])
DST_FOLDER = cfg['destination_folder']
VIDEO_LEVEL_SAVE_FOLDER = cfg["video_level_folder"]

print('Extracting Video Signatures')
# The model is instantiated once, right before it is needed.
sm = SimilarityModel()
video_signatures = sm.predict(VIDEO_LEVEL_SAVE_FOLDER)
# Replace NaNs so the signatures contain only finite numbers.
video_signatures = np.nan_to_num(video_signatures)
# Label = file name of each indexed entry, cut at the first '_vgg'.
labels = np.array([x.split('_vgg')[0].split('/')[-1] for x in sm.index])


def filter_results(distances, indexes, thr):
    """Collect, row by row, the entries whose distance is below ``thr``.

    Args:
        distances: Distances, indexed as ``distances[i, mask]`` -- assumes a
            2-D NumPy array (TODO confirm against the caller).
        indexes: Indexes, indexed the same way as ``distances`` -- presumably
            the matching neighbor indexes; verify against the caller.
        thr: Threshold; only entries with ``distance < thr`` are kept.

    NOTE(review): this excerpt ends without a ``return`` statement, so as
    shown the function returns ``None``. Presumably the full source returns
    the two accumulated lists -- confirm against the original file.
    """
    results = []
    results_distances = []
    # Element-wise comparison: True where the distance is under the threshold.
    msk = distances < thr
    for i, r in enumerate(msk):
        # ``r`` is the mask for row ``i``; it selects the kept columns.
        results.append(indexes[i, r])
        results_distances.append(distances[i, r])

示例#6

0

显示文件

文件： extract_features.py 项目： chenhai1030/VideoDeduplication

def main(config, list_of_files, frame_sampling, save_frames):
    """Run the full feature-extraction pipeline over a set of videos.

    Stages, in order:
      1. Resolve the configuration and locate the video files, either by
         scanning ``config.sources.root`` or from ``list_of_files``.
      2. Extract frame-level features for videos that do not yet have them.
      3. Convert frame-level representations into video-level ones.
      4. Compute signatures and store them in the database and/or on disk,
         as selected by ``config.database.use`` and ``config.save_files``.

    Args:
        config: Path to the configuration file, passed to ``resolve_config``.
        list_of_files: When non-empty, passed to ``scan_videos_from_txt``
            instead of scanning the sources root.
        frame_sampling: Frame sampling override passed to ``resolve_config``.
        save_frames: Save-frames override passed to ``resolve_config``.

    Raises:
        AssertionError: If no video-level representations are available.
    """
    config = resolve_config(config_path=config,
                            frame_sampling=frame_sampling,
                            save_frames=save_frames)

    storage = ReprStorage(os.path.join(config.repr.directory))
    to_reprkey = reprkey_resolver(config)

    print('Searching for Dataset Video Files')

    if len(list_of_files) > 0:
        videos = scan_videos_from_txt(list_of_files,
                                      extensions=config.sources.extensions)
    else:
        videos = scan_videos(config.sources.root,
                             '**',
                             extensions=config.sources.extensions)

    print('Number of files found: {}'.format(len(videos)))

    # Keep only the videos that have no frame-level representation yet.
    pending_videos = []
    for video_path in videos:
        if not storage.frame_level.exists(to_reprkey(video_path)):
            pending_videos.append(video_path)

    print('There are {} videos left'.format(len(pending_videos)))

    VIDEOS_LIST = create_video_list(pending_videos,
                                    config.proc.video_list_filename)

    print('Processed video List saved on :{}'.format(VIDEOS_LIST))

    if len(pending_videos) > 0:
        # Build the extractor around the pretrained featurizer.
        model_path = default_model_path(
            config.proc.pretrained_model_local_path)
        featurizer = load_featurizer(model_path)
        extractor = IntermediateCnnExtractor(
            video_src=VIDEOS_LIST,
            reprs=storage,
            reprkey=to_reprkey,
            frame_sampling=config.proc.frame_sampling,
            save_frames=config.proc.save_frames,
            model=featurizer)
        # Run frame-level feature extraction.
        extractor.start(batch_size=16, cores=4)

    print('Converting Frame by Frame representations to Video Representations')

    FrameToVideoRepresentation(storage).start()

    print('Extracting Signatures from Video representations')

    similarity_model = SimilarityModel()
    video_level = bulk_read(storage.video_level)

    assert len(video_level) > 0, 'No Signatures left to be processed'

    # Mapping of ReprKey => signature.
    signatures = similarity_model.predict(video_level)

    print('Saving Video Signatures on :{}'.format(storage.signature.directory))

    if config.database.use:
        # Flatten the mapping into (path, sha256, url, signature) tuples.
        entries = []
        for repr_key, signature in signatures.items():
            entries.append(
                (repr_key.path, repr_key.hash, repr_key.url, signature))

        # Open the database and make sure the schema exists.
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Store the signatures.
        DBResultStorage(database).add_signatures(entries)

    if config.save_files:
        bulk_write(storage.signature, signatures)

示例#7

0

显示文件

文件： generate_matches.py 项目： chenhai1030/VideoDeduplication

def main(config):
    """Find near-duplicate videos from their signatures and report matches.

    Stages, in order:
      1. Load stored signatures; if none exist, compute them from the
         video-level representations with ``SimilarityModel``.
      2. Run a nearest-neighbor search over the signatures and keep the
         neighbors accepted by ``filter_results`` for
         ``config.proc.match_distance``.
      3. Build a match table, drop self matches and symmetric duplicates and
         write it as the unfiltered CSV report.
      4. When ``config.proc.filter_dark_videos`` is set, estimate brightness
         per video and drop matches involving videos flagged as dark.
      5. Store metadata and matches in the database
         (``config.database.use``) and/or write the metadata and filtered
         reports (``config.save_files``).

    Args:
        config: Path to the configuration file, passed to ``resolve_config``.

    Raises:
        AssertionError: If neither signatures nor video-level
            representations are available.
    """
    print('Loading config file')
    config = resolve_config(config_path=config)
    reps = ReprStorage(config.repr.directory)

    # Report locations are resolved up front so that every later stage can
    # use them regardless of which optional stages actually run.
    REPORT_PATH = os.path.join(config.repr.directory, f'matches_at_{config.proc.match_distance}_distance.csv')
    FILTERED_REPORT_PATH = os.path.join(config.repr.directory,
                                        f'matches_at_{config.proc.match_distance}_distance_filtered.csv')
    METADATA_REPORT_PATH = os.path.join(config.repr.directory, 'metadata_signatures.csv')

    # Stays None unless the dark-video filter runs; the persistence stages
    # below check it instead of assuming the filter was enabled.
    metadata_df = None

    # Get mapping (path,hash) => sig.
    print('Extracting Video Signatures')
    signature_iterator = bulk_read(reps.signature)

    if len(signature_iterator) == 0:
        vid_level_iterator = bulk_read(reps.video_level)
        assert len(vid_level_iterator) > 0, "No video_level features were found"
        sm = SimilarityModel()
        # Reuse the representations that were just loaded instead of reading
        # them from storage a second time.
        signatures_dict = sm.predict(vid_level_iterator)
        # Unpack keys and signatures as separate sequences
        repr_keys, video_signatures = zip(*signatures_dict.items())
    else:
        repr_keys, video_signatures = zip(*signature_iterator.items())

    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    t0 = time.time()
    # Handles small tests for which number of videos <  number of neighbors
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors, metric='euclidean', algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches '.format(time.time()-t0))
    results, results_distances = filter_results(config.proc.match_distance, distances, indices)

    # Order the rows so that queries with the most matches come first.
    ss = sorted(zip(results, results_distances), key=lambda x: len(x[0]), reverse=True)
    results_sorted = [x[0] for x in ss]
    results_sorted_distance = [x[1] for x in ss]

    q = []
    m = []
    distance = []

    print('Generating Report')
    for i, r in enumerate(results_sorted):
        for j, matches in enumerate(r):
            # The first entry of each row is used as the query for that row.
            if j == 0:
                qq = matches
            q.append(qq)
            m.append(matches)
            distance.append(results_sorted_distance[i][j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']
    # Remove self matches. The explicit copy makes the column assignment
    # below act on an independent frame (no SettingWithCopyWarning).
    match_df = match_df.loc[~match_df['self_match'], :].copy()
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (eg if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    print('Saving unfiltered report to {}'.format(REPORT_PATH))

    match_df.to_csv(REPORT_PATH)

#    if config.proc.detect_scenes:
#
#        frame_features_dict = bulk_read(reps.frame_level, select=None)
#        assert len(frame_features_dict) > 0, 'No Frame Level features were found.'
#        scenes = extract_scenes(frame_features_dict)
#        scene_metadata = pd.DataFrame(asdict(scenes))
#
#        if config.database.use:
#            # Connect to database
#            database = Database(uri=config.database.uri)
#            database.create_tables()
#
#            # Save scenes
#            result_storage = DBResultStorage(database)
#            result_storage.add_scenes(zip(scenes.video_filename, scenes.video_sha256, scenes.scene_duration_seconds))
#
#        if config.save_files:
#
#            SCENE_METADATA_OUTPUT_PATH = os.path.join(config.repr.directory, 'scene_metadata.csv')
#            scene_metadata.to_csv(SCENE_METADATA_OUTPUT_PATH)
#            print('Scene Metadata saved in:'.format(SCENE_METADATA_OUTPUT_PATH))

    if config.proc.filter_dark_videos:

        print('Filtering dark and/or short videos')

        # Keys of all stored video-level representations (de-duplicated)
        repr_keys = list(set(reps.video_level.list()))
        paths = [key.path for key in repr_keys]
        hashes = [key.hash for key in repr_keys]

        print('Extracting additional information from video files')
        brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(repr_keys)])
        print(brightness_estimation.shape)
        metadata_df = pd.DataFrame({"fn": paths,
                                    "sha256": hashes,
                                    "gray_max": brightness_estimation.reshape(brightness_estimation.shape[0])})

        # Flag videos to be discarded
        metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr

        print('Videos discarded because of darkness:{}'.format(metadata_df['video_dark_flag'].sum()))

        metadata_df['flagged'] = metadata_df['video_dark_flag']

        # Discard videos
        discarded_videos = metadata_df.loc[metadata_df['flagged'], :][['fn', 'sha256']]
        discarded_videos = set(tuple(row) for row in discarded_videos.to_numpy())

        # Function to check if the (path,hash) row is in the discarded set
        def is_discarded(row):
            return tuple(row) in discarded_videos

        msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded, axis=1)
        msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded, axis=1)
        discard_msk = msk_1 | msk_2

        match_df = match_df.loc[~discard_msk, :]

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        result_storage = DBResultStorage(database)

        # Save metadata (only available when the dark-video filter ran)
        if metadata_df is not None:
            # Copy so that adding the 'metadata' column does not write into
            # a slice of ``metadata_df``.
            metadata_entries = metadata_df[['fn', 'sha256']].copy()
            metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
            result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = ['query_video', 'query_sha256', 'match_video', 'match_sha256', 'distance']

        result_storage.add_matches(match_df[match_columns].to_numpy())

    # The metadata and filtered reports only exist when the filter ran; the
    # unfiltered report has already been written above in every case.
    if config.save_files and metadata_df is not None:

        print('Saving metadata to {}'.format(METADATA_REPORT_PATH))
        metadata_df.to_csv(METADATA_REPORT_PATH)
        print('Saving Filtered Matches report to {}'.format(FILTERED_REPORT_PATH))
        match_df.to_csv(FILTERED_REPORT_PATH)