def reprkey(path):
    """Get intermediate representation storage key."""
    # Resolve the file locally by joining the configured sources root
    # with the file name taken from the (possibly remote) path.
    local_path = os.path.join(config.sources.root, path.split('/')[-1])
    return ReprKey(
        path=storepath(local_path),
        hash=get_hash(local_path),
        tag=config_tag,
        url=path)
def list(self):
    """Iterate over all storage keys."""
    path_pattern = join(self.directory, f"**/*{self.suffix}")
    with self._metadata_storage.begin(write=False) as txn:
        for repr_file_path in glob(path_pattern, recursive=True):
            original_path = self._reverse(repr_file_path)
            metadata = self._read_metadata(original_path, txn)
            yield ReprKey(path=original_path, hash=metadata.hash, tag=metadata.tag)
def list_lmdb(self):
    """Iterate over all storage keys."""
    with self._metadata_storage.begin(write=False) as txn:
        for key, value in txn.cursor():
            # LMDB keys are raw bytes holding the original source path.
            original_path = key.decode('utf-8')
            metadata = self._read_metadata(original_path, txn)
            if metadata is not None:
                yield ReprKey(path=original_path, hash=metadata.hash,
                              tag=metadata.tag, url=metadata.url)
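# Usage sketch (hypothetical helper, not part of the original code): consume the
# list_lmdb() generator and group the yielded ReprKeys by tag. Assumes `storage`
# is an instance of the class that defines list_lmdb().
from collections import defaultdict

def keys_by_tag(storage):
    grouped = defaultdict(list)
    for key in storage.list_lmdb():
        grouped[key.tag].append(key)
    return grouped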
def to_key(self):
    """Convert database record to ReprKey."""
    return ReprKey(path=self.source_path, hash=self.hash, tag=self.tag)
def find_matchs(config):
    print('Reading Video Signatures')
    database = Database(uri=config.database.uri)
    with database.session_scope() as session:
        query = session.query(Files).options(joinedload(Files.signature))
        files = query.filter().all()

        signature_iterator = dict()
        for file in files:
            if file.signature is not None and check_is_signature_valid(file):
                # Unpack the raw signature bytes into a tuple of 32-bit floats.
                raw_signature = file.signature.signature
                sig = struct.unpack('%df' % (len(raw_signature) // 4), raw_signature)
                signature_iterator[ReprKey(path=file.file_path,
                                           hash=file.sha256,
                                           tag=file.meta,
                                           url=file.file_url)] = sig

    repr_keys, video_signatures = zip(*signature_iterator.items())
    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    # Handle small tests for which the number of videos < number of neighbors.
    t0 = time.time()
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors, metric='euclidean', algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches'.format(time.time() - t0))

    results, results_distances = filter_results(config.proc.match_distance, distances, indices)

    ss = sorted(zip(results, results_distances), key=lambda x: len(x[0]), reverse=True)
    results_sorted = [x[0] for x in ss]
    results_sorted_distance = [x[1] for x in ss]

    q = []
    m = []
    distance = []
    print('Generating Report')
    for i, r in enumerate(results_sorted):
        for j, matches in enumerate(r):
            if j == 0:
                qq = matches
            q.append(qq)
            m.append(matches)
            distance.append(results_sorted_distance[i][j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']

    # Remove self matches
    match_df = match_df.loc[~match_df['self_match'], :]

    # Create a unique index from (query, match)
    match_df['unique_index'] = match_df.apply(uniq, axis=1)

    # Remove duplicated entries (e.g. if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    # if config.proc.filter_dark_videos:
    #     print('Filtering dark and/or short videos')
    #
    #     # Get original files for which we have both frames and frame-level features
    #     repr_keys = list(set(reps.video_level.list()))
    #     paths = [key.path for key in repr_keys]
    #     hashes = [key.hash for key in repr_keys]
    #
    #     print('Extracting additional information from video files')
    #     brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(repr_keys)])
    #     print(brightness_estimation.shape)
    #     metadata_df = pd.DataFrame({"fn": paths,
    #                                 "sha256": hashes,
    #                                 "gray_max": brightness_estimation.reshape(brightness_estimation.shape[0])})
    #
    #     # Flag videos to be discarded
    #     metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr
    #     print('Videos discarded because of darkness: {}'.format(metadata_df['video_dark_flag'].sum()))
    #     metadata_df['flagged'] = metadata_df['video_dark_flag']
    #
    #     # Discard videos
    #     discarded_videos = metadata_df.loc[metadata_df['flagged'], :][['fn', 'sha256']]
    #     discarded_videos = set(tuple(row) for row in discarded_videos.to_numpy())
    #
    #     # Function to check if the (path, hash) row is in the discarded set
    #     def is_discarded(row):
    #         return tuple(row) in discarded_videos
    #
    #     msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded, axis=1)
    #     msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded, axis=1)
    #     discard_msk = msk_1 | msk_2
    #
    #     match_df = match_df.loc[~discard_msk, :]

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Save metadata
        result_storage = DBResultStorage(database)
        # if metadata_df is not None:
        #     metadata_entries = metadata_df[['fn', 'sha256']]
        #     metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
        #     result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = ['query_video', 'query_sha256', 'match_video', 'match_sha256', 'distance']
        result_storage.add_matches(match_df[match_columns].to_numpy())
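# Usage sketch for find_matchs(). This is an assumption-laden illustration: only
# the config fields that find_matchs actually reads (database.uri, database.use,
# proc.match_distance) are filled in, and the URI value is a placeholder.
from types import SimpleNamespace

example_config = SimpleNamespace(
    database=SimpleNamespace(uri="postgresql://user:password@localhost/videos", use=True),
    proc=SimpleNamespace(match_distance=0.75),
)
find_matchs(example_config)  # reads signatures, finds near-duplicates, stores matches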
def copy(key, **kwargs):
    """Return a copy of the key with the given fields replaced."""
    args = asdict(key)
    args.update(kwargs)
    return ReprKey(**args)
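# Usage sketch: copy() rebuilds a ReprKey with selected fields overridden. The
# sample key comes from the make_key() helper defined just below; the "new-tag"
# value is illustrative only.
key = make_key()
retagged = copy(key, tag="new-tag")
assert retagged.path == key.path and retagged.hash == key.hash
assert retagged.tag == "new-tag"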
def make_key():
    """Make some repr storage key."""
    unique = uuid()
    return ReprKey(path=f"some/path-{unique}",
                   hash=f"some-hash-{unique}",
                   tag=f"some-tag-{unique}")
def reprkey(path):
    """Get intermediate representation storage key."""
    return ReprKey(path=storepath(path), hash=get_hash(path), tag=config_tag)