def download_subset_file(subset_url, dataset_dir):
    """
    Download a subset segments file from the given url to the given directory.

    Args:
        subset_url:   URL to subset segments file
                      (Type: str)

        dataset_dir:  Dataset directory where subset segment file will be stored
                      (Type: str)

    Returns:
        subset_path:  Path to subset segments file
                      (Type: str)
    """
    # Get filename of the subset file
    subset_filename = get_filename(subset_url)
    subset_name = get_subset_name(subset_url)
    subset_path = os.path.join(dataset_dir, subset_filename)

    os.makedirs(dataset_dir, exist_ok=True)

    # Only download if we don't already have a local copy
    if not os.path.exists(subset_path):
        LOGGER.info('Downloading subset file for "{}"'.format(subset_name))
        # Fetch the full payload *before* opening the output file: the
        # original opened the file first, so a failed request left behind an
        # empty/truncated file that later runs mistook for a finished download.
        subset_data = urllib.request.urlopen(subset_url).read().decode()
        with open(subset_path, 'w') as f:
            f.write(subset_data)

    return subset_path
def download_subset(subset_path, dataset_dir, ffmpeg_path, ffprobe_path,
                    num_workers, **ffmpeg_cfg):
    """
    Download all files for a subset, including the segment file, and the audio
    and video files.

    Args:
        subset_path:   Path to subset segments file
                       (Type: str)

        dataset_dir:   Path to dataset directory where files are saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of workers to download and process videos
                       (Type: int)

    Keyword Args:
        **ffmpeg_cfg:  Configuration for audio and video
                       downloading and decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    # A remote segments file is first mirrored locally; a local path is
    # used as-is.
    if is_url(subset_path):
        subset_path = download_subset_file(subset_path, dataset_dir)

    subset_name = get_subset_name(subset_path)

    # Create the per-subset data directory, then kick off the actual
    # download jobs.
    data_dir = init_subset_data_dir(dataset_dir, subset_name)
    download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg)
def download_random_subset_files(subset_url, dataset_dir, ffmpeg_path,
                                 ffprobe_path, num_workers, max_videos=None,
                                 **ffmpeg_cfg):
    """
    Download a random subset (of size `max_videos`) of a subset segment file
    and its videos.

    Args:
        subset_url:    URL to subset segments file
                       (Type: str)

        dataset_dir:   Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download
                       videos
                       (Type: int)

    Keyword Args:
        max_videos:    Maximum number of videos to download in this subset.
                       If None, download all files in this subset.
                       (Type: int or None)

        **ffmpeg_cfg:  Configuration for audio and video downloading and
                       decoding done by ffmpeg
                       (Type: dict[str, *])

    Raises:
        ValueError: If max_videos is neither None nor a positive integer.
    """
    # FIXME: This code is outdated and shouldn't be used

    # Validate max_videos. Check the type *before* comparing against 1 so a
    # non-numeric argument raises our ValueError rather than an opaque
    # TypeError from the `<` comparison. bool is rejected explicitly because
    # it is a subclass of int (the original `type(...) != int` also rejected
    # it).
    if max_videos is not None and (not isinstance(max_videos, int)
                                   or isinstance(max_videos, bool)
                                   or max_videos < 1):
        err_msg = 'max_videos must be a positive integer, or None'
        LOGGER.error(err_msg)
        raise ValueError(err_msg)

    # Get filename of the subset file
    subset_filename = get_filename(subset_url)
    subset_name = get_subset_name(subset_url)
    subset_path = os.path.join(dataset_dir, subset_filename)
    data_dir = init_subset_data_dir(dataset_dir, subset_name)

    # Mirror the subset segments file locally if we don't already have it.
    if not os.path.exists(subset_path):
        LOGGER.info('Downloading subset file for "{}"'.format(subset_name))
        # Fetch fully before opening the output file so a failed request does
        # not leave a truncated file behind.
        subset_data = urllib.request.urlopen(subset_url).read().decode()
        with open(subset_path, 'w') as f:
            f.write(subset_data)

    subset_data = []

    LOGGER.info(
        'Starting download jobs for random subset (of size {}) of subset "{}"'.
        format(max_videos, subset_name))

    # Initialize so the error handler below cannot hit an unbound name if the
    # CSV reader fails before the first row is yielded.
    row_idx = 0
    with open(subset_path, 'r') as f:
        subset_data_reader = csv.reader(f)
        try:
            for row_idx, row in enumerate(subset_data_reader):
                # Skip commented lines
                if row[0][0] == '#':
                    continue
                # Keep only (ytid, ts_start, ts_end)
                subset_data.append(row[:3])
        except csv.Error as e:
            # Log the *formatted* message; the original logged the bare
            # template string.
            err_msg = 'Encountered error in {} at line {}: {}'.format(
                subset_filename, row_idx + 1, e)
            LOGGER.error(err_msg)
            sys.exit(err_msg)

    # Shuffle data so the subset drawn is random
    random.shuffle(subset_data)

    # Set up multiprocessing pool
    pool = mp.Pool(num_workers)
    try:
        for idx, row in enumerate(subset_data):
            worker_args = [
                row[0], float(row[1]), float(row[2]), data_dir, ffmpeg_path,
                ffprobe_path
            ]
            pool.apply_async(partial(segment_mp_worker, **ffmpeg_cfg),
                             worker_args)
            # Run serially
            #segment_mp_worker(*worker_args, **ffmpeg_cfg)

            if max_videos is not None and idx + 1 >= max_videos:
                info_msg = 'Reached maximum ({}) for subset {}'
                LOGGER.info(info_msg.format(max_videos, subset_name))
                break
    except KeyboardInterrupt:
        LOGGER.info("Forcing exit.")
        exit()
    finally:
        # Drain the pool even on error so worker processes are not leaked.
        try:
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            LOGGER.info("Forcing exit.")
            exit()

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))
def download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg):
    """
    Download subset segment file and videos.

    NOTE(review): this definition is shadowed by a later function of the same
    name in this module, so it is dead code unless the later one is removed.

    Args:
        subset_path:   Path to subset segments file
                       (Type: str)

        data_dir:      Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download
                       videos
                       (Type: int)

    Keyword Args:
        **ffmpeg_cfg:  Configuration for audio and video downloading and
                       decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    subset_name = get_subset_name(subset_path)

    LOGGER.info('Starting download jobs for subset "{}"'.format(subset_name))

    # Load the Applause/Speech id list ONCE. The original re-opened and
    # re-read this file for every CSV row (O(rows * filesize)), opened it
    # 'r+' despite only reading, and shadowed the CSV file handle `f`.
    with open('filemove/both_id.txt', 'r') as id_file:
        valid_ids = id_file.read()

    # Initialize so the error handler below cannot hit an unbound name if the
    # CSV reader fails before the first row is yielded.
    row_idx = 0
    with open(subset_path, 'r') as f:
        subset_data = csv.reader(f)

        # Set up multiprocessing pool
        pool = mp.Pool(num_workers)
        try:
            for row_idx, row in enumerate(subset_data):
                # Skip commented lines
                if row[0][0] == '#':
                    continue
                ytid, ts_start, ts_end = row[0], float(row[1]), float(row[2])

                # Skip files that already have been downloaded
                media_filename = get_media_filename(ytid, ts_start, ts_end)
                video_filepath = os.path.join(
                    data_dir, 'video', media_filename + '.' +
                    ffmpeg_cfg.get('video_format', 'mp4'))
                audio_filepath = os.path.join(
                    data_dir, 'audio', media_filename + '.' +
                    ffmpeg_cfg.get('audio_format', 'flac'))
                if os.path.exists(video_filepath) and os.path.exists(
                        audio_filepath):
                    info_msg = 'Already downloaded video {} ({} - {}). Skipping.'
                    LOGGER.info(info_msg.format(ytid, ts_start, ts_end))
                    continue

                # Skip files that are neither Applause nor Speech
                if ytid in valid_ids:
                    print("downloaded sth meaningful!" + ytid)
                else:
                    # print("skip" + ytid)
                    continue

                worker_args = [
                    ytid, ts_start, ts_end, data_dir, ffmpeg_path,
                    ffprobe_path
                ]
                pool.apply_async(partial(segment_mp_worker, **ffmpeg_cfg),
                                 worker_args)
                # Run serially
                #segment_mp_worker(*worker_args, **ffmpeg_cfg)
        except csv.Error as e:
            # Log the *formatted* message; the original logged the bare
            # template string.
            err_msg = 'Encountered error in {} at line {}: {}'.format(
                subset_path, row_idx + 1, e)
            LOGGER.error(err_msg)
            sys.exit(err_msg)
        except KeyboardInterrupt:
            LOGGER.info("Forcing exit.")
            exit()
        finally:
            # Drain the pool even on error so worker processes are not leaked.
            try:
                pool.close()
                pool.join()
            except KeyboardInterrupt:
                LOGGER.info("Forcing exit.")
                exit()

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))
def download_subset_videos(subset_path, data_dir, ffmpeg_path, ffprobe_path,
                           num_workers, **ffmpeg_cfg):
    """
    Download subset segment file and videos.

    Builds one joblib job per not-yet-downloaded, not-previously-failed
    segment, records the pending segments to "remaining.csv", and runs the
    jobs in parallel.

    Args:
        subset_path:   Path to subset segments file
                       (Type: str)

        data_dir:      Directory where dataset files will be saved
                       (Type: str)

        ffmpeg_path:   Path to ffmpeg executable
                       (Type: str)

        ffprobe_path:  Path to ffprobe executable
                       (Type: str)

        num_workers:   Number of multiprocessing workers used to download
                       videos
                       (Type: int)

    Keyword Args:
        **ffmpeg_cfg:  Configuration for audio and video downloading and
                       decoding done by ffmpeg
                       (Type: dict[str, *])
    """
    subset_name = get_subset_name(subset_path)
    failed_ids = load_failures()
    LOGGER.info('Loaded failures, {}'.format(len(failed_ids)))
    LOGGER.info('Preparing jobs for subset "{}"'.format(subset_name))

    import joblib

    # Without a video mode configured, only audio output is expected.
    audio_only = not bool(ffmpeg_cfg.get('video_mode'))

    pending = []       # (ytid, ts_start, ts_end) triples still to download
    delayed_jobs = []  # joblib.delayed wrappers, one per pending triple

    with open(subset_path, 'r') as f:
        reader = csv.reader(f)
        try:
            for row_idx, row in enumerate(reader):
                # Skip commented lines
                if row[0][0] == '#':
                    continue

                ytid = row[0]
                clip_start = float(row[1])
                clip_end = float(row[2])

                # Skip segments whose output already exists, and segments
                # that previously failed.
                if check_output_exists(data_dir, ytid, clip_start, clip_end,
                                       audio_only=audio_only):
                    continue
                if ytid in failed_ids:
                    continue

                pending.append((ytid, clip_start, clip_end))
                delayed_jobs.append(
                    joblib.delayed(process_job)(ytid, clip_start, clip_end,
                                                data_dir, ffmpeg_path,
                                                ffprobe_path, ffmpeg_cfg,
                                                failed_ids))
        except csv.Error as e:
            LOGGER.error(f'CSV error in {subset_path} at line {row_idx}: {e}')

    # Persist the still-pending segments for later inspection/resumption.
    pandas.DataFrame.from_records(pending).to_csv("remaining.csv",
                                                  index=False, header=False)

    LOGGER.info('Starting {} download jobs for subset "{}"'.format(
        len(delayed_jobs), subset_name))

    # Execute jobs
    joblib.Parallel(n_jobs=num_workers)(delayed_jobs)

    LOGGER.info('Finished download jobs for subset "{}"'.format(subset_name))