Пример #1
0
    def decompress_corpus(archive_path, documents_path, uncompressed_size):
        """
        Decompress a corpus archive and verify the result.

        :param archive_path: Path to the compressed archive.
        :param documents_path: Path that the decompressed file is expected to appear at.
        :param uncompressed_size: Expected size in bytes of the decompressed file (may be falsy/None to skip size checks).
        :raise exceptions.DataError: If decompression did not produce the expected file or the size does not match.
        """
        # Build the status message first so there is a single console.info call.
        if uncompressed_size:
            message = "Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " % (
                archive_path, documents_path, convert.bytes_to_gb(uncompressed_size))
        else:
            message = "Decompressing track data from [%s] to [%s] ... " % (archive_path, documents_path)
        console.info(message, end='', flush=True, logger=logger)

        io.decompress(archive_path, io.dirname(archive_path))
        console.println("[OK]")

        # The archive must have produced exactly the documented file.
        if not os.path.isfile(documents_path):
            raise exceptions.DataError(
                "Decompressing [%s] did not create [%s]. Please check with the track author if the compressed "
                "archive has been created correctly." % (archive_path, documents_path))

        extracted_bytes = os.path.getsize(documents_path)
        if uncompressed_size is not None and extracted_bytes != uncompressed_size:
            raise exceptions.DataError(
                "[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." % (
                    documents_path, extracted_bytes, uncompressed_size))
Пример #2
0
    def decompress(data_set_path, expected_size_in_bytes):
        """
        Decompress the given track data file unless an up-to-date decompressed copy already exists.

        :param data_set_path: Path to the compressed data set file.
        :param expected_size_in_bytes: Expected size in bytes of the decompressed file (may be None to skip size checks).
        :return: A tuple of (path to the decompressed file, True iff decompression actually happened).
        :raise exceptions.DataError: If the extracted size does not match the expected size.
        """
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        basename, extension = io.splitext(data_set_path)
        decompressed = False
        if not os.path.isfile(basename) or os.path.getsize(
                basename) != expected_size_in_bytes:
            decompressed = True
            # BUG FIX: previously read `type.uncompressed_size_in_bytes`, which depended on an
            # enclosing-scope variable shadowing the builtin `type`. The parameter carries the
            # same value, so use it and keep this function self-contained.
            if expected_size_in_bytes:
                console.info(
                    "Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... "
                    % (data_set_path, basename,
                       convert.bytes_to_gb(expected_size_in_bytes)),
                    end='',
                    flush=True,
                    logger=logger)
            else:
                console.info(
                    "Decompressing track data from [%s] to [%s] ... " %
                    (data_set_path, basename),
                    end='',
                    flush=True,
                    logger=logger)

            io.decompress(data_set_path, io.dirname(data_set_path))
            console.println("[OK]")
            extracted_bytes = os.path.getsize(basename)
            if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
                raise exceptions.DataError(
                    "[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected."
                    % (basename, extracted_bytes, expected_size_in_bytes))
        return basename, decompressed
Пример #3
0
 def _update(self, distribution_version):
     """
     Update the local track repository to the branch that best matches the given distribution version.

     Tries the remote first (unless offline), rebasing onto the best-matching remote branch;
     otherwise falls back to the best-matching local branch.

     :param distribution_version: The Elasticsearch distribution version to find track data for.
     :raise exceptions.SystemSetupError: If no matching branch exists locally either.
     :raise exceptions.DataError: If updating the track data fails with a supply error.
     """
     try:
         if self.remote and not self.offline:
             branch = versions.best_match(
                 git.branches(self.tracks_dir, remote=self.remote),
                 distribution_version)
             if branch:
                 logger.info(
                     "Rebasing on '%s' in '%s' for distribution version '%s'."
                     % (branch, self.tracks_dir, distribution_version))
                 git.rebase(self.tracks_dir, branch=branch)
                 return
             else:
                 msg = "Could not find track data remotely for distribution version %s. " \
                       "Trying to find track data locally." % distribution_version
                 # BUG FIX: Logger.warn() is a deprecated alias; use warning() (consistent with
                 # the other _update implementation in this file).
                 logger.warning(msg)
         branch = versions.best_match(
             git.branches(self.tracks_dir, remote=False),
             distribution_version)
         if branch:
             logger.info(
                 "Checking out '%s' in '%s' for distribution version '%s'."
                 % (branch, self.tracks_dir, distribution_version))
             git.checkout(self.tracks_dir, branch=branch)
         else:
             raise exceptions.SystemSetupError(
                 "Cannot find track data for distribution version %s" %
                 distribution_version)
     except exceptions.SupplyError as e:
         raise exceptions.DataError("Cannot update track data in '%s': %s" %
                                    (self.tracks_dir, e))
Пример #4
0
 def create_file_offset_table(document_file_path, expected_number_of_lines):
     """
     Build the file offset table for the given document file and validate its line count.

     :param document_file_path: Path to the (decompressed) document file.
     :param expected_number_of_lines: The number of lines the document file should contain.
     :raise exceptions.DataError: If the table was built but the line count does not match.
     """
     # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
     lines_read = io.prepare_file_offset_table(document_file_path)
     if not lines_read or lines_read == expected_number_of_lines:
         return
     # Mismatch: drop the stale offset table before reporting corrupt data.
     # NOTE(review): `track` below is resolved from an enclosing scope — confirm it is in scope at the call site.
     io.remove_file_offset_table(document_file_path)
     raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]."
                                % (document_file_path, track, expected_number_of_lines, lines_read))
Пример #5
0
 def _update(self, distribution_version):
     """
     Update the local track repository to the branch matching the given distribution version.

     If a remote is configured and we are online, checks out and rebases onto the best-matching
     remote branch (tolerating local uncommitted changes by warning instead of failing).
     Otherwise falls back to checking out the best-matching local branch.

     :param distribution_version: The Elasticsearch distribution version to find track data for.
     :raise exceptions.SystemSetupError: If no matching branch exists locally either.
     :raise exceptions.DataError: If a supply error occurs while updating (original traceback preserved).
     """
     try:
         if self.remote and not self.offline:
             branch = versions.best_match(git.branches(self.tracks_dir, remote=self.remote), distribution_version)
             if branch:
                 # Allow uncommitted changes iff we do not have to change the branch
                 logger.info(
                     "Checking out [%s] in [%s] for distribution version [%s]." % (branch, self.tracks_dir, distribution_version))
                 git.checkout(self.tracks_dir, branch=branch)
                 logger.info("Rebasing on [%s] in [%s] for distribution version [%s]." % (branch, self.tracks_dir, distribution_version))
                 try:
                     git.rebase(self.tracks_dir, branch=branch)
                 except exceptions.SupplyError:
                     # Best effort: a failed rebase (local changes) is reported but does not abort the run.
                     logger.exception("Cannot rebase due to local changes in [%s]" % self.tracks_dir)
                     console.warn(
                         "Local changes in [%s] prevent track update from remote. Please commit your changes." % self.tracks_dir)
                 return
             else:
                 msg = "Could not find track data remotely for distribution version [%s]. " \
                       "Trying to find track data locally." % distribution_version
                 logger.warning(msg)
         # Remote lookup unavailable or unsuccessful: search local branches only.
         branch = versions.best_match(git.branches(self.tracks_dir, remote=False), distribution_version)
         if branch:
             logger.info("Checking out [%s] in [%s] for distribution version [%s]." % (branch, self.tracks_dir, distribution_version))
             git.checkout(self.tracks_dir, branch=branch)
         else:
             raise exceptions.SystemSetupError("Cannot find track data for distribution version %s" % distribution_version)
     except exceptions.SupplyError:
         # Re-raise as a DataError while keeping the original traceback for diagnosis.
         tb = sys.exc_info()[2]
         raise exceptions.DataError("Cannot update track data in [%s]." % self.tracks_dir).with_traceback(tb)
Пример #6
0
    def download_corpus(root_url, target_path, size_in_bytes, track_name, offline, test_mode):
        """
        Download a corpus file from the track's source URL and verify its size.

        :param root_url: Base URL to download from; a falsy value raises a DataError.
        :param target_path: Local path to download the corpus file to.
        :param size_in_bytes: Expected file size in bytes (may be None to skip size checks).
        :param track_name: Name of the track (used in progress and error messages).
        :param offline: If True, downloading is not allowed and a SystemSetupError is raised.
        :param test_mode: If True, a 404 is interpreted as "track does not support test mode".
        :raise exceptions.DataError: On missing URL, HTTP/URL errors, or size mismatch.
        :raise exceptions.SystemSetupError: In offline mode or if the file is missing after download.
        """
        file_name = os.path.basename(target_path)

        if not root_url:
            raise exceptions.DataError("%s is missing and it cannot be downloaded because no source URL is provided in the track."
                                       % target_path)
        if offline:
            raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry again." % target_path)

        # BUG FIX: previously referenced the undefined name `source_root_url`; the validated
        # parameter is `root_url`.
        data_url = "%s/%s" % (root_url, file_name)
        try:
            io.ensure_dir(os.path.dirname(target_path))
            if size_in_bytes:
                size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                logger.info("Downloading data from [%s] (%s MB) to [%s]." % (data_url, size_in_mb, target_path))
            else:
                logger.info("Downloading data from [%s] to [%s]." % (data_url, target_path))

            # we want to have a bit more accurate download progress as these files are typically very large
            progress = net.Progress("[INFO] Downloading data for track %s" % track_name, accuracy=1)
            net.download(data_url, target_path, size_in_bytes, progress_indicator=progress)
            progress.finish()
            logger.info("Downloaded data from [%s] to [%s]." % (data_url, target_path))
        except urllib.error.HTTPError as e:
            if e.code == 404 and test_mode:
                raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                           "disable test mode and retry." % track_name)
            else:
                msg = "Could not download [%s] to [%s]" % (data_url, target_path)
                if e.reason:
                    msg += " (HTTP status: %s, reason: %s)" % (str(e.code), e.reason)
                else:
                    msg += " (HTTP status: %s)" % str(e.code)
                raise exceptions.DataError(msg)
        except urllib.error.URLError:
            logger.exception("Could not download [%s] to [%s]." % (data_url, target_path))
            raise exceptions.DataError("Could not download [%s] to [%s]." % (data_url, target_path))

        if not os.path.isfile(target_path):
            raise exceptions.SystemSetupError(
                "Cannot download from %s to %s. Please verify that data are available at %s and "
                "check your internet connection." % (data_url, target_path, data_url))

        actual_size = os.path.getsize(target_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                       (target_path, actual_size, size_in_bytes))
Пример #7
0
    def download(cfg, url, local_path, size_in_bytes):
        """
        Download a data file to the local dataset cache, skipping the download if an
        identically-sized copy already exists.

        :param cfg: The config object (used to read offline mode).
        :param url: URL to download from.
        :param local_path: Local path to download the file to.
        :param size_in_bytes: Expected file size in bytes (may be None to skip size checks).
        :return: True iff a download actually happened; False if the file was already present.
        :raise exceptions.SystemSetupError: If the file is missing afterwards (offline or download failure).
        :raise exceptions.DataError: If the downloaded size does not match the expectation.
        """
        offline = cfg.opts("system", "offline.mode")

        # ensure we only skip the download if the file size also matches our expectation
        already_cached = os.path.isfile(local_path) and \
            (size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes)
        if already_cached:
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." % (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." % (url, local_path))

                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
                net.download(url, local_path, size_in_bytes, progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
            except urllib.error.URLError:
                # Best effort: log and fall through to the existence check below.
                logger.exception("Could not download [%s] to [%s]." % (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again." % local_path)
            raise exceptions.SystemSetupError(
                "Cannot download from %s to %s. Please verify that data are available at %s and "
                "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError(
                "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." % (
                    local_path, actual_size, size_in_bytes))

        return True
Пример #8
0
    def download(cfg, url, local_path, size_in_bytes):
        """
        Download a data file to the local dataset cache, skipping the download if an
        identically-sized copy already exists.

        :param cfg: The config object (used to read offline mode).
        :param url: URL to download from.
        :param local_path: Local path to download the file to.
        :param size_in_bytes: Expected file size in bytes (may be None to skip size checks).
        :return: True iff a download actually happened; False if the file was already present.
        :raise exceptions.SystemSetupError: If the file is missing afterwards (offline or download failure).
        :raise exceptions.DataError: If the downloaded size does not match the expectation.
        """
        offline = cfg.opts("system", "offline.mode")
        file_exists = os.path.isfile(local_path)

        # ensure we only skip the download if the file size also matches our expectation
        if file_exists and (size_in_bytes is None
                            or os.path.getsize(local_path) == size_in_bytes):
            logger.info("[%s] already exists locally. Skipping download." %
                        local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    console.info(
                        "Downloading data from [%s] (%s MB) to [%s] ... " %
                        (url, size_in_mb, local_path),
                        end='',
                        flush=True,
                        logger=logger)
                else:
                    console.info("Downloading data from [%s] to [%s] ... " %
                                 (url, local_path),
                                 end='',
                                 flush=True,
                                 logger=logger)

                net.download(url, local_path, size_in_bytes)
                console.println("[OK]")
            except urllib.error.URLError:
                # Best effort: log and fall through to the existence check below,
                # which produces the user-facing error.
                logger.exception("Could not download [%s] to [%s]." %
                                 (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again."
                    % local_path)
            else:
                raise exceptions.SystemSetupError(
                    "Cannot download from %s to %s. Please verify that data are available at %s and "
                    "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError(
                "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected."
                % (local_path, actual_size, size_in_bytes))

        return True
Пример #9
0
 def decompress(data_set_path, expected_size_in_bytes):
     """
     Decompress the given track data file unless an up-to-date decompressed copy already exists.

     :param data_set_path: Path to the compressed data set file.
     :param expected_size_in_bytes: Expected size in bytes of the decompressed file.
     :raise exceptions.DataError: If the extracted size does not match the expected size.
     """
     # we assume that track data are always compressed and try to decompress them before running the benchmark
     basename, extension = io.splitext(data_set_path)
     if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
         logger.info("Unzipping track data from [%s] to [%s]." % (data_set_path, basename))
         # BUG FIX: previously printed `type.document_archive` / `type.uncompressed_size_in_bytes`,
         # relying on an enclosing-scope variable shadowing the builtin `type` (AttributeError when
         # called standalone). Use this function's own parameters, which carry the same values.
         print("Decompressing %s (resulting size: %.2f GB) ... " %
               (data_set_path, convert.bytes_to_gb(expected_size_in_bytes)), end='', flush=True)
         io.decompress(data_set_path, io.dirname(data_set_path))
         print("Done")
         extracted_bytes = os.path.getsize(basename)
         if extracted_bytes != expected_size_in_bytes:
             raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                        (basename, extracted_bytes, expected_size_in_bytes))
Пример #10
0
def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    Downloads each type's document archive (if a source root URL is configured),
    decompresses it, and builds a file offset table, verifying sizes and line
    counts along the way.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    :raise exceptions.DataError: If data are missing, corrupt, or invalid.
    :raise exceptions.SystemSetupError: If downloads fail in a way the user must resolve.
    """
    def download(cfg, url, local_path, size_in_bytes):
        # Download `url` to `local_path` unless an identically-sized copy exists.
        # Returns True iff a download actually happened.
        offline = cfg.opts("system", "offline.mode")
        file_exists = os.path.isfile(local_path)

        # ensure we only skip the download if the file size also matches our expectation
        if file_exists and (size_in_bytes is None
                            or os.path.getsize(local_path) == size_in_bytes):
            logger.info("[%s] already exists locally. Skipping download." %
                        local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." %
                                (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." %
                                (url, local_path))

                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress(
                    "[INFO] Downloading data for track %s" % track.name,
                    accuracy=1)
                net.download(url,
                             local_path,
                             size_in_bytes,
                             progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." %
                            (url, local_path))
            except urllib.error.URLError:
                # best effort: log and fall through to the existence check below
                logger.exception("Could not download [%s] to [%s]." %
                                 (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again."
                    % local_path)
            else:
                raise exceptions.SystemSetupError(
                    "Cannot download from %s to %s. Please verify that data are available at %s and "
                    "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError(
                "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected."
                % (local_path, actual_size, size_in_bytes))

        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # Decompress `data_set_path` unless an up-to-date copy already exists.
        # Returns (decompressed path, True iff decompression actually happened).
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        basename, extension = io.splitext(data_set_path)
        decompressed = False
        if not os.path.isfile(basename) or os.path.getsize(
                basename) != expected_size_in_bytes:
            decompressed = True
            # BUG FIX: use this function's parameter instead of closing over the loop
            # variable (which previously shadowed the builtin `type`). The call site
            # passes the same value.
            if expected_size_in_bytes:
                console.info(
                    "Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... "
                    % (data_set_path, basename,
                       convert.bytes_to_gb(expected_size_in_bytes)),
                    end='',
                    flush=True,
                    logger=logger)
            else:
                console.info(
                    "Decompressing track data from [%s] to [%s] ... " %
                    (data_set_path, basename),
                    end='',
                    flush=True,
                    logger=logger)

            io.decompress(data_set_path, io.dirname(data_set_path))
            console.println("[OK]")
            extracted_bytes = os.path.getsize(basename)
            if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
                raise exceptions.DataError(
                    "[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected."
                    % (basename, extracted_bytes, expected_size_in_bytes))
        return basename, decompressed

    if not track.source_root_url:
        logger.info(
            "Track [%s] does not specify a source root URL. Assuming data are available locally."
            % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        # renamed from `type` to `doc_type` to stop shadowing the builtin `type`
        for doc_type in index.types:
            if doc_type.document_archive:
                absolute_archive_path = os.path.join(data_root,
                                                     doc_type.document_archive)
                if track.source_root_url:
                    data_url = "%s/%s" % (
                        track.source_root_url,
                        os.path.basename(absolute_archive_path))
                    download(cfg, data_url, absolute_archive_path,
                             doc_type.compressed_size_in_bytes)
                if not os.path.exists(absolute_archive_path):
                    if cfg.opts("track", "test.mode.enabled"):
                        logger.error(
                            "[%s] does not exist so assuming that track [%s] does not support test mode."
                            % (absolute_archive_path, track))
                        raise exceptions.DataError(
                            "Track [%s] does not support test mode. Please ask the track author to add it or "
                            "disable test mode and retry." % track)
                    else:
                        logger.error("[%s] does not exist." %
                                     absolute_archive_path)
                        raise exceptions.DataError(
                            "Track data file [%s] is missing." %
                            absolute_archive_path)
                decompressed_file_path, was_decompressed = decompress(
                    absolute_archive_path, doc_type.uncompressed_size_in_bytes)
                # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
                lines_read = io.prepare_file_offset_table(
                    decompressed_file_path)
                if lines_read and lines_read != doc_type.number_of_lines:
                    io.remove_file_offset_table(decompressed_file_path)
                    raise exceptions.DataError(
                        "Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]."
                        % (decompressed_file_path, track, doc_type.number_of_lines,
                           lines_read))
            else:
                logger.info(
                    "Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type."
                    % (doc_type.name, index.name))