Exemplo n.º 1
0
Arquivo: base.py Projeto: kss682/dvc
    def cache_exists(self, checksums, jobs=None):
        """Check if the given checksums are stored in the remote.

        There are two ways of performing this check:

        - Traverse: Get a list of all the files in the remote
            (traversing the cache directory) and compare it with
            the given checksums.

        - No traverse: For each given checksum, run the `exists`
            method and filter the checksums that aren't on the remote.
            This is done in parallel threads.
            It also shows a progress bar when performing the check.

        The reason for such an odd logic is that most of the remotes
        take much shorter time to just retrieve everything they have under
        a certain prefix (e.g. s3, gs, ssh, hdfs). Other remotes that can
        check if particular file exists much quicker, use their own
        implementation of cache_exists (see http, local).

        Returns:
            A list with checksums that were found in the remote
        """
        progress_callback = ProgressCallback(len(checksums))

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=progress_callback)

        if self.no_traverse and hasattr(self, "batch_exists"):
            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                progress_callback.finish("")
                return ret

        return list(set(checksums) & set(self.all()))
Exemplo n.º 2
0
    def cache_exists(self, checksums, jobs=None):
        """This is older implementation used in remote/base.py
        We are reusing it in RemoteSSH, because SSH's batch_exists proved to be
        faster than current approach (relying on exists(path_info)) applied in
        remote/base.
        """
        progress_callback = ProgressCallback(len(checksums))

        def exists_with_progress(chunks):
            return self.batch_exists(chunks, callback=progress_callback)

        if self.no_traverse:
            with ThreadPoolExecutor(max_workers=jobs or self.JOBS) as executor:
                path_infos = [self.checksum_to_path_info(x) for x in checksums]
                chunks = to_chunks(path_infos, num_chunks=self.JOBS)
                results = executor.map(exists_with_progress, chunks)
                in_remote = itertools.chain.from_iterable(results)
                ret = list(itertools.compress(checksums, in_remote))
                progress_callback.finish("")
                return ret

        return list(set(checksums) & set(self.all()))