def create_file_offset_table(document_file_path, expected_number_of_lines):
    """
    Builds (or rebuilds) the file offset table for the given data file and validates its line count.

    :param document_file_path: Path to the (decompressed) document data file.
    :param expected_number_of_lines: The number of lines the data file must contain.
    :raises exceptions.DataError: If the file contains a different number of lines than expected.
    """
    # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
    lines_read = io.prepare_file_offset_table(document_file_path)
    if lines_read and lines_read != expected_number_of_lines:
        # remove the stale offset table so the next run rebuilds it from scratch
        io.remove_file_offset_table(document_file_path)
        # BUG FIX: the original message referenced an undefined name "track" (not a parameter of this
        # function), so raising the error crashed with NameError instead. Report only in-scope values.
        raise exceptions.DataError("Data in [%s] are invalid. Expected [%d] lines but got [%d]."
                                   % (document_file_path, expected_number_of_lines, lines_read))
# NOTE(review): this file defines prepare_track twice; this first definition is shadowed by the
# second one below. One of the two copies should be removed — confirm which is canonical.
def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    :raises exceptions.SystemSetupError: If a required file can neither be found nor downloaded.
    :raises exceptions.DataError: If downloaded/extracted data are corrupt or line counts mismatch.
    """
    def download(cfg, url, local_path, size_in_bytes):
        # Downloads url to local_path unless a local file of the expected size already exists.
        # Returns True if a download happened, False if the local copy was reused.
        offline = cfg.opts("system", "offline.mode")
        file_exists = os.path.isfile(local_path)

        # ensure we only skip the download if the file size also matches our expectation
        if file_exists and (size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes):
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." % (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." % (url, local_path))
                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
                net.download(url, local_path, size_in_bytes, progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
            except urllib.error.URLError:
                # deliberate best-effort: the existence/size verification below decides whether this is fatal
                logger.exception("Could not download [%s] to [%s]." % (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again." % local_path)
            else:
                raise exceptions.SystemSetupError(
                    "Cannot download from %s to %s. Please verify that data are available at %s and "
                    "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected."
                                       % (local_path, actual_size, size_in_bytes))
        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        # Returns (path_of_decompressed_file, whether_decompression_happened).
        basename, extension = io.splitext(data_set_path)
        decompressed = False
        if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
            decompressed = True
            # use the explicitly passed expected size instead of closing over the enclosing loop
            # variable (the original read "type.uncompressed_size_in_bytes" here, which is the
            # identical value at the only call site but couples this helper to the loop state)
            if expected_size_in_bytes:
                console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                             (data_set_path, basename, convert.bytes_to_gb(expected_size_in_bytes)),
                             end='', flush=True, logger=logger)
            else:
                console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, basename),
                             end='', flush=True, logger=logger)
            io.decompress(data_set_path, io.dirname(data_set_path))
            console.println("[OK]")
            extracted_bytes = os.path.getsize(basename)
            if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
                raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                           (basename, extracted_bytes, expected_size_in_bytes))
        return basename, decompressed

    if not track.source_root_url:
        logger.info("Track [%s] does not specify a source root URL. Assuming data are available locally." % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        # renamed from "type" to avoid shadowing the builtin
        for document_type in index.types:
            if document_type.document_archive:
                absolute_archive_path = os.path.join(data_root, document_type.document_archive)
                if track.source_root_url:
                    data_url = "%s/%s" % (track.source_root_url, os.path.basename(absolute_archive_path))
                    download(cfg, data_url, absolute_archive_path, document_type.compressed_size_in_bytes)
                if not os.path.exists(absolute_archive_path):
                    if cfg.opts("track", "test.mode.enabled"):
                        logger.error("[%s] does not exist so assuming that track [%s] does not support test mode." %
                                     (absolute_archive_path, track))
                        raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                                   "disable test mode and retry." % track)
                    else:
                        logger.error("[%s] does not exist." % absolute_archive_path)
                        raise exceptions.DataError("Track data file [%s] is missing." % absolute_archive_path)
                decompressed_file_path, was_decompressed = decompress(absolute_archive_path,
                                                                      document_type.uncompressed_size_in_bytes)
                # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
                lines_read = io.prepare_file_offset_table(decompressed_file_path)
                if lines_read and lines_read != document_type.number_of_lines:
                    io.remove_file_offset_table(decompressed_file_path)
                    raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]." %
                                               (decompressed_file_path, track, document_type.number_of_lines, lines_read))
            else:
                logger.info("Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type." %
                            (document_type.name, index.name))
# NOTE(review): this is the second definition of prepare_track in this file; it shadows the first
# copy above. One of the two duplicates should be removed — confirm which is canonical.
def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    """
    def download(cfg, url, local_path, size_in_bytes):
        # Fetch url into local_path; skip when a local copy of the expected size is present.
        offline = cfg.opts("system", "offline.mode")
        # ensure we only skip the download if the file size also matches our expectation
        size_matches = size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes if os.path.isfile(local_path) else False
        if size_matches:
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False
        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." % (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." % (url, local_path))
                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
                net.download(url, local_path, size_in_bytes, progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
            except urllib.error.URLError:
                # best-effort download; the verification below determines whether this is fatal
                logger.exception("Could not download [%s] to [%s]." % (url, local_path))
        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again." % local_path)
            raise exceptions.SystemSetupError(
                "Cannot download from %s to %s. Please verify that data are available at %s and "
                "check your internet connection." % (url, local_path, url))
        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected."
                                       % (local_path, actual_size, size_in_bytes))
        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        basename, extension = io.splitext(data_set_path)
        already_extracted = os.path.isfile(basename) and os.path.getsize(basename) == expected_size_in_bytes
        if already_extracted:
            return basename, False
        if type.uncompressed_size_in_bytes:
            console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                         (data_set_path, basename, convert.bytes_to_gb(type.uncompressed_size_in_bytes)),
                         end='', flush=True, logger=logger)
        else:
            console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, basename),
                         end='', flush=True, logger=logger)
        io.decompress(data_set_path, io.dirname(data_set_path))
        console.println("[OK]")
        extracted_bytes = os.path.getsize(basename)
        if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))
        return basename, True

    if not track.source_root_url:
        logger.info("Track [%s] does not specify a source root URL. Assuming data are available locally." % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        for type in index.types:
            # guard clause: types without an archive contribute no file-based documents
            if not type.document_archive:
                logger.info("Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type." %
                            (type.name, index.name))
                continue
            absolute_archive_path = os.path.join(data_root, type.document_archive)
            if track.source_root_url:
                data_url = "%s/%s" % (track.source_root_url, os.path.basename(absolute_archive_path))
                download(cfg, data_url, absolute_archive_path, type.compressed_size_in_bytes)
            if not os.path.exists(absolute_archive_path):
                if cfg.opts("track", "test.mode.enabled"):
                    logger.error("[%s] does not exist so assuming that track [%s] does not support test mode." %
                                 (absolute_archive_path, track))
                    raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                               "disable test mode and retry." % track)
                logger.error("[%s] does not exist." % absolute_archive_path)
                raise exceptions.DataError("Track data file [%s] is missing." % absolute_archive_path)
            decompressed_file_path, was_decompressed = decompress(absolute_archive_path, type.uncompressed_size_in_bytes)
            # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
            lines_read = io.prepare_file_offset_table(decompressed_file_path)
            if lines_read and lines_read != type.number_of_lines:
                io.remove_file_offset_table(decompressed_file_path)
                raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]." %
                                           (decompressed_file_path, track, type.number_of_lines, lines_read))