def Convert(config):
    """Compute signatures for the stored video-level representations and push them to the database.

    Reads everything under ``reps.video_level``, runs ``SimilarityModel.predict``
    on it and, when ``config.database.use`` is set, stores the resulting
    ``(path, hash, url, signature)`` tuples through ``DBResultStorage``. After a
    successful write the corresponding ``.npy`` files are removed.

    Args:
        config: Resolved configuration object; ``config.repr.directory``,
            ``config.database.use`` and ``config.database.uri`` are read.

    Returns:
        None. Nothing is computed when no video-level representations exist,
        and nothing is stored when the database is disabled.
    """
    storage = ReprStorage(os.path.join(config.repr.directory))
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    print(f"Prepare to update database! vid_num :{len(video_level)}")

    if len(video_level) == 0:
        return

    # Mapping {ReprKey => signature}
    signatures = model.predict(video_level)

    if not config.database.use:
        return

    # Flatten the mapping into (path, sha256, url, signature) rows.
    entries = []
    for key, sig in signatures.items():
        entries.append((key.path, key.hash, key.url, sig))

    database = Database(uri=config.database.uri)
    database.create_tables()

    try:
        DBResultStorage(database).add_signatures(entries)
        # The signatures are in the database now, so drop the source files.
        for key, _ in signatures.items():
            remove_file("/project/data/representations/video_level/" + key.path + ".npy")
    except Exception as e:
        # Best effort: report the failure and carry on.
        print("save db ERROR!")
        print(e)
def main(path, output, config, save_frames, save_features, save_signatures, save_db):
    """Application to extract features from a single video file.

    Loads the video at ``path``, extracts features with the pretrained
    featurizer, reduces them to a video-level representation and computes the
    signature with ``SimilarityModel``. Each artifact is optionally saved.

    Args:
        path: Path of the video file to process.
        output: Directory that receives the ``np.save`` outputs.
        config: Passed to ``download_pretrained``; when ``save_db`` is set it
            is also opened as a YAML file that must contain a ``conninfo`` key.
        save_frames: If truthy, save the loaded video tensor as
            ``<video_name>_<net_name>_frames``.
        save_features: If truthy, save the extracted features as
            ``<video_name>_<net_name>_features``.
        save_signatures: If truthy, save the signature as
            ``<video_name>_<net_name>_signature``.
        save_db: If truthy, register the file and its signature in the
            database described by ``conninfo``.
    """
    print(save_db)
    PRETRAINED_LOCAL_PATH = download_pretrained(config)
    video_name = os.path.basename(path)

    model = load_featurizer(PRETRAINED_LOCAL_PATH)
    video_tensor = load_video(path, model.desired_size)
    features = model.extract(video_tensor, 10)
    video_level_repres = global_vector_from_tensor(features)

    sm = SimilarityModel()
    sm.build_features_single(video_level_repres, video_name)
    video_signatures = sm.predict()
    # Replace NaN/inf entries so the saved signature holds finite numbers only.
    video_signatures = np.nan_to_num(video_signatures)

    if save_frames:
        frame_path = os.path.join(
            output, '{}_{}_frames'.format(video_name, model.net_name))
        np.save(frame_path, video_tensor)

    if save_features:
        features_path = os.path.join(
            output, '{}_{}_features'.format(video_name, model.net_name))
        np.save(features_path, features)

    if save_signatures:
        signatures_path = os.path.join(
            output, '{}_{}_signature'.format(video_name, model.net_name))
        np.save(signatures_path, video_signatures)

    if save_db:
        with open(config, 'r') as ymlfile:
            # safe_load instead of yaml.load: a bare yaml.load() without a
            # Loader is rejected by PyYAML >= 6 and can build arbitrary
            # Python objects on older versions. The config only needs plain
            # mappings and scalars.
            cfg = yaml.safe_load(ymlfile)

        CONNINFO = cfg['conninfo']
        db_engine, session = create_engine_session(CONNINFO)
        create_tables(db_engine)

        # TODO Currently we have an automated incremental index set for the
        # Signatures table (we might want to change it in the future so we
        # don't add duplicated signatures)
        processed_paths = [os.path.relpath(path)]
        file_entries = add_files(session, processed_paths)

        # Extract ids from records in order to save signatures with the
        # proper information
        processed_to_id = {x.file_path: x.id for x in file_entries}
        file_ids = [processed_to_id[x] for x in processed_paths]
        add_signatures(session, video_signatures, file_ids)
def signatures(frame_to_video_results):
    """Calculate signatures, persist them, and hand them back as a dict.

    Any test depending on this fixture is guaranteed to run AFTER the
    signatures have been calculated and written.

    Args:
        frame_to_video_results: Representation storage exposing
            ``video_level`` (read) and ``signature`` (written).

    Returns:
        Signatures dict (orig_path,hash) => signature.
    """
    storage = frame_to_video_results
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    computed = model.predict(video_level)

    # Write every signature under the key it was computed for.
    for key, value in computed.items():
        storage.signature.write(key, value)

    return computed
def get_frame_sampling_permutations(frame_samplings, frame_level_files):
    """Compute signatures for every frame-sampling step over a set of frame-level files.

    Each file is loaded with ``np.load`` and sub-sampled as ``data[::step]``
    for every step in ``frame_samplings``. For each step the sub-sampled
    arrays are reduced with ``global_vector``, stacked, reshaped to
    ``(shape[0], shape[2])`` and passed to
    ``SimilarityModel.predict_from_features``.

    Args:
        frame_samplings: Sampling steps used as slice strides.
        frame_level_files: Paths of frame-level feature files.

    Returns:
        ``defaultdict(list)`` mapping each sampling step to a one-element
        list holding the prediction for that step.
    """
    sampled = defaultdict(list)
    for file_path in frame_level_files:
        frames = np.load(file_path)
        for step in frame_samplings:
            sampled[step].append(frames[::step])

    model = SimilarityModel()
    per_step = defaultdict(list)
    for step, clips in sampled.items():
        stacked = np.array([global_vector(clip) for clip in clips])
        # Drop the middle axis: (n, ?, d) -> (n, d).
        # NOTE(review): assumes that axis has length 1 - confirm against global_vector.
        n_rows = stacked.shape[0]
        n_cols = stacked.shape[2]
        flat = stacked.reshape(n_rows, n_cols)
        per_step[step].append(model.predict_from_features(flat))

    return per_step
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors, KDTree, BallTree, LSHForest, NearestCentroid
from pyvis.network import Network
from winnow.feature_extraction import SimilarityModel
import yaml

print('Loading config file')
with open("config.yaml", 'r') as ymlfile:
    # safe_load instead of yaml.load: a bare yaml.load() without a Loader is
    # rejected by PyYAML >= 6 and can build arbitrary Python objects on older
    # versions. Only plain keys and scalars are read from this file.
    cfg = yaml.safe_load(ymlfile)

DISTANCE = float(cfg['match_distance'])
DST_FOLDER = cfg['destination_folder']
VIDEO_LEVEL_SAVE_FOLDER = cfg["video_level_folder"]

print('Extracting Video Signatures')
sm = SimilarityModel()
video_signatures = sm.predict(VIDEO_LEVEL_SAVE_FOLDER)
# Replace NaN/inf entries so downstream distance computations stay finite.
video_signatures = np.nan_to_num(video_signatures)
# Label = last path component of each index entry, cut at the '_vgg' suffix.
labels = np.array([x.split('_vgg')[0].split('/')[-1] for x in sm.index])


def filter_results(distances, indexes, thr):
    """Keep, for every row, only the entries whose distance is below ``thr``.

    Args:
        distances: 2-D array of distances, one row per query.
        indexes: Array with the same shape as ``distances``; presumably the
            neighbour indices belonging to those distances.
        thr: Exclusive upper bound on the distance.

    Returns:
        Tuple ``(results, results_distances)``: two lists with one array per
        row, holding the surviving entries of ``indexes`` and ``distances``.
    """
    msk = distances < thr
    results = [indexes[i, row] for i, row in enumerate(msk)]
    results_distances = [distances[i, row] for i, row in enumerate(msk)]
    return results, results_distances
def main(config, list_of_files, frame_sampling, save_frames):
    """Run the feature-extraction pipeline: frame features, video features, signatures.

    Videos that have no frame-level representation yet are processed by
    ``IntermediateCnnExtractor``. Frame-level representations are then
    converted to video-level ones and signatures are predicted. The
    signatures are stored in the database and/or on disk, depending on
    ``config.database.use`` and ``config.save_files``.

    Args:
        config: Forwarded to ``resolve_config`` as ``config_path``.
        list_of_files: When empty, videos are found by scanning
            ``config.sources.root``; otherwise it is passed to
            ``scan_videos_from_txt``.
        frame_sampling: Forwarded to ``resolve_config``.
        save_frames: Forwarded to ``resolve_config``.

    Raises:
        AssertionError: If no video-level representations are available.
    """
    config = resolve_config(
        config_path=config,
        frame_sampling=frame_sampling,
        save_frames=save_frames,
    )
    reps = ReprStorage(os.path.join(config.repr.directory))
    reprkey = reprkey_resolver(config)

    print('Searching for Dataset Video Files')
    extensions = config.sources.extensions
    if len(list_of_files) != 0:
        videos = scan_videos_from_txt(list_of_files, extensions=extensions)
    else:
        videos = scan_videos(config.sources.root, '**', extensions=extensions)
    print(f'Number of files found: {len(videos)}')

    # Only videos without a stored frame-level representation need extraction.
    remaining_videos_path = []
    for video_path in videos:
        if not reps.frame_level.exists(reprkey(video_path)):
            remaining_videos_path.append(video_path)
    print(f'There are {len(remaining_videos_path)} videos left')

    VIDEOS_LIST = create_video_list(remaining_videos_path,
                                    config.proc.video_list_filename)
    print(f'Processed video List saved on :{VIDEOS_LIST}')

    if remaining_videos_path:
        pretrained_path = default_model_path(
            config.proc.pretrained_model_local_path)
        extractor = IntermediateCnnExtractor(
            video_src=VIDEOS_LIST,
            reprs=reps,
            reprkey=reprkey,
            frame_sampling=config.proc.frame_sampling,
            save_frames=config.proc.save_frames,
            model=load_featurizer(pretrained_path),
        )
        # Frame-level feature extraction
        extractor.start(batch_size=16, cores=4)

    print('Converting Frame by Frame representations to Video Representations')
    FrameToVideoRepresentation(reps).start()

    print('Extracting Signatures from Video representations')
    sm = SimilarityModel()
    vid_level_iterator = bulk_read(reps.video_level)
    assert len(vid_level_iterator) > 0, 'No Signatures left to be processed'

    # Mapping {ReprKey => signature}
    signatures = sm.predict(vid_level_iterator)

    print('Saving Video Signatures on :{}'.format(reps.signature.directory))

    if config.database.use:
        # One (path, sha256, url, signature) row per video.
        entries = []
        for key, sig in signatures.items():
            entries.append((key.path, key.hash, key.url, sig))

        database = Database(uri=config.database.uri)
        database.create_tables()

        DBResultStorage(database).add_signatures(entries)

    if config.save_files:
        bulk_write(reps.signature, signatures)
def main(config):
    """Find near-duplicate videos by signature distance and report the matches.

    Pipeline:
      1. Read stored signatures; if none exist, compute them from the
         video-level features with ``SimilarityModel``.
      2. Run a euclidean kd-tree nearest-neighbour search (at most 20
         neighbours) and threshold it with ``filter_results`` at
         ``config.proc.match_distance``.
      3. Build a match table, drop self matches and symmetric duplicates,
         and write it as CSV into ``config.repr.directory``.
      4. If ``config.proc.filter_dark_videos`` is set, estimate brightness
         per video and drop matches that involve a flagged video.
      5. If ``config.database.use`` is set, store the metadata (when
         computed) and the matches.
      6. If ``config.save_files`` is set and filtering ran, write the
         metadata and the filtered match report as CSV.

    Args:
        config: Forwarded to ``resolve_config`` as ``config_path``.

    Raises:
        AssertionError: If neither signatures nor video-level features exist.
    """
    print('Loading config file')
    config = resolve_config(config_path=config)
    reps = ReprStorage(config.repr.directory)

    # Get mapping (path,hash) => sig.
    print('Extracting Video Signatures')
    signature_iterator = bulk_read(reps.signature)

    if len(signature_iterator) == 0:
        vid_level_iterator = bulk_read(reps.video_level)
        assert len(vid_level_iterator) > 0, "No video_level features were found"
        sm = SimilarityModel()
        # Reuse the features read above rather than reading them a second time.
        signatures_dict = sm.predict(vid_level_iterator)
        # Unpack keys and signatures as separate sequences
        repr_keys, video_signatures = zip(*signatures_dict.items())
    else:
        repr_keys, video_signatures = zip(*signature_iterator.items())

    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    t0 = time.time()
    # Handles small tests for which number of videos < number of neighbors
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors, metric='euclidean', algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches '.format(time.time() - t0))

    results, results_distances = filter_results(config.proc.match_distance, distances, indices)

    # Rows with the most matches first.
    ss = sorted(zip(results, results_distances), key=lambda x: len(x[0]), reverse=True)
    results_sorted = [x[0] for x in ss]
    results_sorted_distance = [x[1] for x in ss]

    q = []
    m = []
    distance = []

    print('Generating Report')
    for i, r in enumerate(results_sorted):
        for j, matches in enumerate(r):
            # The first entry of each row is used as the query of that row.
            if j == 0:
                qq = matches
            q.append(qq)
            m.append(matches)
            distance.append(results_sorted_distance[i][j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']

    # Remove self matches. Copy so the column assignment below does not
    # write into a slice of the original frame.
    match_df = match_df.loc[~match_df['self_match'], :].copy()
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (eg if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    REPORT_PATH = os.path.join(config.repr.directory, f'matches_at_{config.proc.match_distance}_distance.csv')
    print('Saving unfiltered report to {}'.format(REPORT_PATH))
    match_df.to_csv(REPORT_PATH)

    # if config.proc.detect_scenes:
    #
    #     frame_features_dict = bulk_read(reps.frame_level, select=None)
    #     assert len(frame_features_dict) > 0, 'No Frame Level features were found.'
    #     scenes = extract_scenes(frame_features_dict)
    #     scene_metadata = pd.DataFrame(asdict(scenes))
    #
    #     if config.database.use:
    #         # Connect to database
    #         database = Database(uri=config.database.uri)
    #         database.create_tables()
    #
    #         # Save scenes
    #         result_storage = DBResultStorage(database)
    #         result_storage.add_scenes(zip(scenes.video_filename, scenes.video_sha256, scenes.scene_duration_seconds))
    #
    #     if config.save_files:
    #
    #         SCENE_METADATA_OUTPUT_PATH = os.path.join(config.repr.directory, 'scene_metadata.csv')
    #         scene_metadata.to_csv(SCENE_METADATA_OUTPUT_PATH)
    #         print('Scene Metadata saved in:'.format(SCENE_METADATA_OUTPUT_PATH))

    # Bound up front: the database and save steps below read these names even
    # when dark-video filtering is disabled.
    metadata_df = None
    FILTERED_REPORT_PATH = os.path.join(config.repr.directory, f'matches_at_{config.proc.match_distance}_distance_filtered.csv')
    METADATA_REPORT_PATH = os.path.join(config.repr.directory, 'metadata_signatures.csv')

    if config.proc.filter_dark_videos:
        print('Filtering dark and/or short videos')

        # Keys of all stored video-level representations (de-duplicated)
        repr_keys = list(set(reps.video_level.list()))
        paths = [key.path for key in repr_keys]
        hashes = [key.hash for key in repr_keys]

        print('Extracting additional information from video files')
        brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(repr_keys)])
        print(brightness_estimation.shape)

        metadata_df = pd.DataFrame({
            "fn": paths,
            "sha256": hashes,
            "gray_max": brightness_estimation.reshape(brightness_estimation.shape[0]),
        })

        # Flag videos to be discarded
        metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr
        print('Videos discarded because of darkness:{}'.format(metadata_df['video_dark_flag'].sum()))
        metadata_df['flagged'] = metadata_df['video_dark_flag']

        # Discard videos
        discarded_videos = metadata_df.loc[metadata_df['flagged'], ['fn', 'sha256']]
        discarded_videos = {tuple(row) for row in discarded_videos.to_numpy()}

        # Function to check if the (path,hash) row is in the discarded set
        def is_discarded(row):
            return tuple(row) in discarded_videos

        msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded, axis=1)
        msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded, axis=1)
        discard_msk = msk_1 | msk_2

        match_df = match_df.loc[~discard_msk, :]

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        result_storage = DBResultStorage(database)

        # Save metadata (only available when dark-video filtering ran)
        if metadata_df is not None:
            metadata_entries = metadata_df[['fn', 'sha256']].copy()
            metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
            result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = ['query_video', 'query_sha256', 'match_video', 'match_sha256', 'distance']
        result_storage.add_matches(match_df[match_columns].to_numpy())

    # Without filtering there is no metadata and the matches equal the
    # unfiltered report already written above, so there is nothing to save.
    if config.save_files and metadata_df is not None:
        print('Saving metadata to {}'.format(METADATA_REPORT_PATH))
        metadata_df.to_csv(METADATA_REPORT_PATH)
        print('Saving Filtered Matches report to {}'.format(FILTERED_REPORT_PATH))
        match_df.to_csv(FILTERED_REPORT_PATH)