Example #1
def reprkey(path):
    """Get intermediate representation storage key."""
    # Map the remote path to its local copy under the configured sources root.
    local_path = os.path.join(config.sources.root, path.split('/')[-1])
    return ReprKey(
        path=storepath(local_path),
        hash=get_hash(local_path),
        tag=config_tag,
        url=path)
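Across these examples, ReprKey is built from path, hash, and tag fields, sometimes with an optional url; Example #4 below uses keys as dictionary keys (so they must be hashable), and Example #6 passes one to dataclasses.asdict. A minimal sketch consistent with that usage, assuming a frozen dataclass (this is an inference, not the project's actual definition):

from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)  # frozen => hashable, so instances can serve as dict keys
class ReprKey:
    """Intermediate representation storage key (hypothetical sketch)."""
    path: str
    hash: str
    tag: str
    url: Optional[str] = None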
Example #2
def list(self):
    """Iterate over all storage keys."""
    path_pattern = join(self.directory, f"**/*{self.suffix}")
    with self._metadata_storage.begin(write=False) as txn:
        for repr_file_path in glob(path_pattern, recursive=True):
            original_path = self._reverse(repr_file_path)
            metadata = self._read_metadata(original_path, txn)
            yield ReprKey(path=original_path,
                          hash=metadata.hash,
                          tag=metadata.tag)

def list_lmdb(self):
    """Iterate over all storage keys."""
    with self._metadata_storage.begin(write=False) as txn:
        for key, _value in txn.cursor():
            # LMDB keys are raw bytes; decode them back into the original path.
            original_path = key.decode('utf-8')
            metadata = self._read_metadata(original_path, txn)
            if metadata is not None:
                yield ReprKey(path=original_path,
                              hash=metadata.hash,
                              tag=metadata.tag,
                              url=metadata.url)
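A minimal usage sketch for the generators above; the storage variable is hypothetical and stands for whatever object exposes these methods:

# Hypothetical usage: enumerate every stored key.
for key in storage.list():
    print(key.path, key.hash, key.tag)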
Example #4
def to_key(self):
    """Convert database record to ReprKey."""
    return ReprKey(path=self.source_path, hash=self.hash, tag=self.tag)

def find_matchs(config):
    # Assumed imports: numpy as np, pandas as pd, struct, time,
    # sqlalchemy.orm.joinedload, sklearn.neighbors.NearestNeighbors,
    # plus the project's Database, Files, and DBResultStorage.
    print('Reading Video Signatures')
    database = Database(uri=config.database.uri)
    with database.session_scope() as session:
        query = session.query(Files).options(joinedload(Files.signature))
        files = query.filter().all()

        signature_iterator = dict()
        for file in files:
            if file.signature is not None and check_is_signature_valid(file):
                with open("/tmp/test.txt", "wb+") as f:
                    f.write(file.signature.signature)
                    f.seek(0)
                    str = f.read()
                    len_s = len(str)
                    sig = struct.unpack(('%df' % (len_s / 4)), str)

                signature_iterator[ReprKey(path=file.file_path,
                                           hash=file.sha256,
                                           tag=file.meta,
                                           url=file.file_url)] = sig

        repr_keys, video_signatures = zip(*signature_iterator.items())
        paths = np.array([key.path for key in repr_keys])
        hashes = np.array([key.hash for key in repr_keys])
        video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    # Handles small tests for which the number of videos < the number of neighbors
    t0 = time.time()
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors,
                          metric='euclidean',
                          algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches'.format(time.time() - t0))
    results, results_distances = filter_results(config.proc.match_distance,
                                                distances, indices)

    ss = sorted(zip(results, results_distances),
                key=lambda x: len(x[0]),
                reverse=True)
    results_sorted = [x[0] for x in ss]
    results_sorted_distance = [x[1] for x in ss]

    q = []
    m = []
    distance = []

    print('Generating Report')
    for i, r in enumerate(results_sorted):
        for j, matches in enumerate(r):
            # The first entry in each result row is the query index itself.
            if j == 0:
                qq = matches
            q.append(qq)
            m.append(matches)
            distance.append(results_sorted_distance[i][j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']
    # Remove self matches
    match_df = match_df.loc[~match_df['self_match'], :]
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (e.g. if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    # if config.proc.filter_dark_videos:
    #
    #     print('Filtering dark and/or short videos')
    #
    #     # Get original files for which we have both frames and frame-level features
    #     repr_keys = list(set(reps.video_level.list()))
    #     paths = [key.path for key in repr_keys]
    #     hashes = [key.hash for key in repr_keys]
    #
    #     print('Extracting additional information from video files')
    #     brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(repr_keys)])
    #     print(brightness_estimation.shape)
    #     metadata_df = pd.DataFrame({"fn": paths,
    #                                 "sha256": hashes,
    #                                 "gray_max":brightness_estimation.reshape(brightness_estimation.shape[0])})
    #
    #     # Flag videos to be discarded
    #
    #     metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr
    #
    #     print('Videos discarded because of darkness:{}'.format(metadata_df['video_dark_flag'].sum()))
    #
    #     metadata_df['flagged'] = metadata_df['video_dark_flag']
    #
    #     # Discard videos
    #     discarded_videos = metadata_df.loc[metadata_df['flagged'], :][['fn', 'sha256']]
    #     discarded_videos = set(tuple(row) for row in discarded_videos.to_numpy())
    #
    #     # Function to check if the (path,hash) row is in the discarded set
    #     def is_discarded(row):
    #         return tuple(row) in discarded_videos
    #
    #     msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded, axis=1)
    #     msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded, axis=1)
    #     discard_msk = msk_1 | msk_2
    #
    #     match_df = match_df.loc[~discard_msk, :]
    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Save metadata
        result_storage = DBResultStorage(database)

        # if metadata_df is not None:
        #     metadata_entries = metadata_df[['fn', 'sha256']]
        #     metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
        #     result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = [
            'query_video', 'query_sha256', 'match_video', 'match_sha256',
            'distance'
        ]

        result_storage.add_matches(match_df[match_columns].to_numpy())
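find_matchs calls two helpers that are not shown in this listing: filter_results, which keeps only the neighbors closer than config.proc.match_distance, and uniq, which builds an order-independent index so that (A, B) and (B, A) collapse into one entry. The sketches below are assumptions inferred from the call sites, not the project's actual implementations:

def filter_results(threshold, distances, indices):
    """Keep only neighbors closer than `threshold` (hypothetical sketch)."""
    results, results_distances = [], []
    for dist_row, idx_row in zip(distances, indices):
        mask = dist_row < threshold
        results.append(idx_row[mask])
        results_distances.append(dist_row[mask])
    return results, results_distances

def uniq(row):
    """Order-independent id for a (query, match) pair (hypothetical sketch)."""
    return tuple(sorted((row['query'], row['match'])))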
Example #6
def copy(key, **kwargs):
    """Copy a ReprKey, overriding the given fields."""
    args = asdict(key)  # requires ReprKey to be a dataclass
    args.update(kwargs)
    return ReprKey(**args)
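For instance, deriving a new key with an updated tag (a hedged example built on the ReprKey sketch from the note under Example #1):

key = ReprKey(path="some/path", hash="some-hash", tag="old-tag")
retagged = copy(key, tag="new-tag")  # path and hash carry over unchanged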
Example #7
def make_key():
    """Make some repr storage key."""
    unique = uuid()  # helper producing a unique suffix (not the stdlib module)
    return ReprKey(path=f"some/path-{unique}",
                   hash=f"some-hash-{unique}",
                   tag=f"some-tag-{unique}")
Example #8
def reprkey(path):
    """Get intermediate representation storage key."""
    return ReprKey(path=storepath(path),
                   hash=get_hash(path),
                   tag=config_tag)
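Example #8 is the local-path variant of Example #1: it keys the file directly rather than first mapping a URL into config.sources.root. A hypothetical usage sketch, assuming storepath, get_hash, and config_tag are defined in the enclosing module:

# Hypothetical usage: build a storage key for a local video file.
key = reprkey("videos/sample.mp4")
print(key.path, key.hash, key.tag)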