Example #1
    def run_analysis(self, manifest: Manifest, old_manifest: Manifest = None,
                     patch=True, resume=True, file_prefix_filter=None,
                     file_exclude_filter=None, file_install_tag=None,
                     processing_optimization=False) -> AnalysisResult:
        """
        Run analysis on manifest and old manifest (if not None) and return a result
        with a summary of the resources required to install the provided manifest.

        :param manifest: Manifest to install
        :param old_manifest: Old manifest to patch from (if applicable)
        :param patch: Patch instead of redownloading the entire file
        :param resume: Continue based on resume file if it exists
        :param file_prefix_filter: Only download files that start with this prefix
        :param file_exclude_filter: Exclude files with this prefix from download
        :param file_install_tag: Only install files with the specified tag
        :param processing_optimization: Attempt to optimize processing order and RAM usage
        :return: AnalysisResult
        """

        analysis_res = AnalysisResult()
        analysis_res.install_size = sum(fm.file_size for fm in manifest.file_manifest_list.elements)
        analysis_res.biggest_chunk = max(c.window_size for c in manifest.chunk_data_list.elements)
        analysis_res.biggest_file_size = max(f.file_size for f in manifest.file_manifest_list.elements)
        is_1mib = analysis_res.biggest_chunk == 1024 * 1024
        self.log.debug(f'Biggest chunk size: {analysis_res.biggest_chunk} bytes (== 1 MiB? {is_1mib})')

        self.log.debug('Creating manifest comparison...')
        mc = ManifestComparison.create(manifest, old_manifest)
        analysis_res.manifest_comparison = mc

        if resume and self.resume_file and os.path.exists(self.resume_file):
            self.log.info('Found previously interrupted download. Download will be resumed if possible.')
            try:
                missing = 0
                mismatch = 0
                completed_files = set()

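                # each line of the resume file is expected to be '<sha1 hex>:<filename>'
                # (inferred from the partition(':') parsing below)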
                for line in open(self.resume_file).readlines():
                    file_hash, _, filename = line.strip().partition(':')
                    _p = os.path.join(self.dl_dir, filename)
                    if not os.path.exists(_p):
                        self.log.debug(f'File does not exist but is in resume file: "{_p}"')
                        missing += 1
                    elif file_hash != manifest.file_manifest_list.get_file_by_path(filename).sha_hash.hex():
                        mismatch += 1
                    else:
                        completed_files.add(filename)

                if missing:
                    self.log.warning(f'{missing} previously completed file(s) are missing, they will be redownloaded.')
                if mismatch:
                    self.log.warning(f'{mismatch} existing file(s) have been changed and will be redownloaded.')

                # remove completed files from changed/added and move them to unchanged for the analysis.
                mc.added -= completed_files
                mc.changed -= completed_files
                mc.unchanged |= completed_files
                self.log.info(f'Skipping {len(completed_files)} files based on resume data.')
            except Exception as e:
                self.log.warning(f'Reading resume file failed: {e!r}, continuing as normal...')

        # Install tags are used for selective downloading, e.g. for language packs
        additional_deletion_tasks = []
        if file_install_tag is not None:
            if isinstance(file_install_tag, str):
                file_install_tag = [file_install_tag]

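            # keep a file if any requested tag is in its install tags; the empty
            # tag ('') additionally keeps files that carry no install tags at all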
            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements
                                if not any((fit in i.install_tags) or (not fit and not i.install_tags)
                                           for fit in file_install_tag))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on install tag.')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip
            for fname in sorted(files_to_skip):
                additional_deletion_tasks.append(FileTask(fname, delete=True, silent=True))

        # if include/exclude prefix has been set: mark all files that are not to be downloaded as unchanged
        if file_exclude_filter:
            if isinstance(file_exclude_filter, str):
                file_exclude_filter = [file_exclude_filter]

            file_exclude_filter = [f.lower() for f in file_exclude_filter]
            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements if
                                any(i.filename.lower().startswith(pfx) for pfx in file_exclude_filter))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on exclude prefix.')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter:
            if isinstance(file_prefix_filter, str):
                file_prefix_filter = [file_prefix_filter]

            file_prefix_filter = [f.lower() for f in file_prefix_filter]
            files_to_skip = set(i.filename for i in manifest.file_manifest_list.elements if not
                                any(i.filename.lower().startswith(pfx) for pfx in file_prefix_filter))
            self.log.info(f'Found {len(files_to_skip)} files to skip based on include prefix(es).')
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter or file_exclude_filter or file_install_tag:
            self.log.info(f'Remaining files after filtering: {len(mc.added) + len(mc.changed)}')
            # correct install size after filtering
            analysis_res.install_size = sum(fm.file_size for fm in manifest.file_manifest_list.elements
                                            if fm.filename in mc.added)

        if mc.removed:
            analysis_res.removed = len(mc.removed)
            self.log.debug(f'{analysis_res.removed} removed files')
        if mc.added:
            analysis_res.added = len(mc.added)
            self.log.debug(f'{analysis_res.added} added files')
        if mc.changed:
            analysis_res.changed = len(mc.changed)
            self.log.debug(f'{analysis_res.changed} changed files')
        if mc.unchanged:
            analysis_res.unchanged = len(mc.unchanged)
            self.log.debug(f'{analysis_res.unchanged} unchanged files')

        if processing_optimization and len(manifest.file_manifest_list.elements) > 100_000:
            self.log.warning('Manifest contains too many files, processing optimizations will be disabled.')
            processing_optimization = False
        elif processing_optimization:
            self.log.info('Processing order optimization is enabled, analysis may take a few seconds longer...')

        # count references to chunks for determining runtime cache size later
        references = Counter()
        fmlist = sorted(manifest.file_manifest_list.elements,
                        key=lambda a: a.filename.lower())

        for fm in fmlist:
            self.hash_map[fm.filename] = fm.sha_hash.hex()

            # chunks of unchanged files are not downloaded so we can skip them
            if fm.filename in mc.unchanged:
                analysis_res.unchanged += fm.file_size
                continue

            for cp in fm.chunk_parts:
                references[cp.guid_num] += 1

        if processing_optimization:
            s_time = time.time()
            # reorder the file manifest list to group files that share many chunks
            # 4 is mostly arbitrary but has shown in testing to be a good choice
            min_overlap = 4
            # ignore files with less than N chunk parts, this speeds things up dramatically
            cp_threshold = 5

            remaining_files = {fm.filename: {cp.guid_num for cp in fm.chunk_parts}
                               for fm in fmlist if fm.filename not in mc.unchanged}
            _fmlist = []

            # iterate over all files that will be downloaded and pair up those that share the most chunks
            for fm in fmlist:
                if fm.filename not in remaining_files:
                    continue

                _fmlist.append(fm)
                f_chunks = remaining_files.pop(fm.filename)
                if len(f_chunks) < cp_threshold:
                    continue

                best_overlap, match = 0, None
                for fname, chunks in remaining_files.items():
                    if len(chunks) < cp_threshold:
                        continue
                    overlap = len(f_chunks & chunks)
                    if overlap > min_overlap and overlap > best_overlap:
                        best_overlap, match = overlap, fname

                if match:
                    _fmlist.append(manifest.file_manifest_list.get_file_by_path(match))
                    remaining_files.pop(match)

            fmlist = _fmlist
            opt_delta = time.time() - s_time
            self.log.debug(f'Processing optimizations took {opt_delta:.01f} seconds.')

        # determine reusable chunks and prepare lookup table for reusable ones
        re_usable = defaultdict(dict)
        if old_manifest and mc.changed and patch:
            self.log.debug('Analyzing manifests for re-usable chunks...')
            for changed in mc.changed:
                old_file = old_manifest.file_manifest_list.get_file_by_path(changed)
                new_file = manifest.file_manifest_list.get_file_by_path(changed)

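                # for every chunk part in the old file, record (offset within the
                # file, part start within the chunk, part end within the chunk) so
                # new parts fully contained in an old part can be read from disk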
                existing_chunks = defaultdict(list)
                off = 0
                for cp in old_file.chunk_parts:
                    existing_chunks[cp.guid_num].append((off, cp.offset, cp.offset + cp.size))
                    off += cp.size

                for cp in new_file.chunk_parts:
                    key = (cp.guid_num, cp.offset, cp.size)
                    for file_o, cp_o, cp_end_o in existing_chunks[cp.guid_num]:
                        # check if new chunk part is wholly contained in the old chunk part
                        if cp_o <= cp.offset and (cp.offset + cp.size) <= cp_end_o:
                            references[cp.guid_num] -= 1
                            re_usable[changed][key] = file_o + (cp.offset - cp_o)
                            analysis_res.reuse_size += cp.size
                            break

        last_cache_size = current_cache_size = 0
        # set to determine whether a file is currently cached or not
        cached = set()
        # Using this secondary set is orders of magnitude faster than checking the deque.
        chunks_in_dl_list = set()
        # This is just used to count all unique guids that have been cached
        dl_cache_guids = set()

        # run through the list of files and create the download jobs and also determine minimum
        # runtime cache requirement by simulating adding/removing from cache during download.
        self.log.debug('Creating filetasks and chunktasks...')
        for current_file in fmlist:
            # skip unchanged and empty files
            if current_file.filename in mc.unchanged:
                continue
            elif not current_file.chunk_parts:
                self.tasks.append(FileTask(current_file.filename, empty=True))
                continue

            existing_chunks = re_usable.get(current_file.filename, None)
            chunk_tasks = []
            reused = 0

            for cp in current_file.chunk_parts:
                ct = ChunkTask(cp.guid_num, cp.offset, cp.size)

                # re-use the chunk from the existing file if we can
                if existing_chunks and (cp.guid_num, cp.offset, cp.size) in existing_chunks:
                    reused += 1
                    ct.chunk_file = current_file.filename
                    ct.chunk_offset = existing_chunks[(cp.guid_num, cp.offset, cp.size)]
                else:
                    # add to DL list if not already in it
                    if cp.guid_num not in chunks_in_dl_list:
                        self.chunks_to_dl.append(cp.guid_num)
                        chunks_in_dl_list.add(cp.guid_num)

                    # if chunk has more than one use or is already in cache,
                    # check if we need to add or remove it again.
                    if references[cp.guid_num] > 1 or cp.guid_num in cached:
                        references[cp.guid_num] -= 1

                        # delete from cache if no references left
                        if references[cp.guid_num] < 1:
                            current_cache_size -= analysis_res.biggest_chunk
                            cached.remove(cp.guid_num)
                            ct.cleanup = True
                        # add to cache if not already cached
                        elif cp.guid_num not in cached:
                            dl_cache_guids.add(cp.guid_num)
                            cached.add(cp.guid_num)
                            current_cache_size += analysis_res.biggest_chunk
                    else:
                        ct.cleanup = True

                chunk_tasks.append(ct)

            if reused:
                self.log.debug(f' + Reusing {reused} chunks from: {current_file.filename}')
                # open temporary file that will contain download + old file contents
                self.tasks.append(FileTask(current_file.filename + u'.tmp', fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename + u'.tmp', close=True))
                # delete old file and rename temporary
                self.tasks.append(FileTask(current_file.filename, delete=True, rename=True,
                                           temporary_filename=current_file.filename + u'.tmp'))
            else:
                self.tasks.append(FileTask(current_file.filename, fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename, close=True))

            # check if runtime cache size has changed
            if current_cache_size > last_cache_size:
                self.log.debug(f' * New maximum cache size: {current_cache_size / 1024 / 1024:.02f} MiB')
                last_cache_size = current_cache_size

        self.log.debug(f'Final cache size requirement: {last_cache_size / 1024 / 1024:.02f} MiB.')
        analysis_res.min_memory = last_cache_size + (1024 * 1024 * 32)  # add some padding just to be safe

        # Todo implement on-disk caching to avoid this issue.
        if analysis_res.min_memory > self.max_shared_memory:
            shared_mib = f'{self.max_shared_memory / 1024 / 1024:.01f} MiB'
            required_mib = f'{analysis_res.min_memory / 1024 / 1024:.01f} MiB'
            suggested_mib = round(self.max_shared_memory / 1024 / 1024 +
                                  (analysis_res.min_memory - self.max_shared_memory) / 1024 / 1024 + 32)
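            # note: suggested_mib simplifies to min_memory in MiB plus 32 MiB of padding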

            if processing_optimization:
                message = f'Try running legendary with "--enable-reordering --max-shared-memory {suggested_mib:.0f}"'
            else:
                message = 'Try running legendary with "--enable-reordering" to reduce memory usage, ' \
                          f'or use "--max-shared-memory {suggested_mib:.0f}" to increase the limit.'

            raise MemoryError(f'Current shared memory cache is smaller than required: {shared_mib} < {required_mib}. '
                              + message)

        # calculate actual dl and patch write size.
        analysis_res.dl_size = \
            sum(c.file_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)
        analysis_res.uncompressed_dl_size = \
            sum(c.window_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)

        # add jobs to remove files
        for fname in mc.removed:
            self.tasks.append(FileTask(fname, delete=True))
        self.tasks.extend(additional_deletion_tasks)

        analysis_res.num_chunks_cache = len(dl_cache_guids)
        self.chunk_data_list = manifest.chunk_data_list
        self.analysis = analysis_res

        return analysis_res
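
Both excerpts are methods on legendary's DLManager, so they assume instance state (self.dl_dir, self.resume_file, self.max_shared_memory, self.log, self.tasks, self.chunks_to_dl, self.hash_map) set up elsewhere. Below is a minimal, hedged sketch of driving the analysis, following Example #1's signature. The import paths, the DLManager constructor arguments, and the Manifest.read_all loader are assumptions about the surrounding library and may differ between versions; the AnalysisResult fields read at the end are exactly the ones populated above.

    # Hedged usage sketch -- the names below are assumptions about the
    # surrounding legendary codebase, not part of the excerpt itself.
    from legendary.downloader.manager import DLManager   # assumed import path
    from legendary.models.manifest import Manifest       # assumed import path

    # Constructor arguments are an assumption; run_analysis() itself only reads
    # dl_dir, resume_file, max_shared_memory, log, tasks, chunks_to_dl, hash_map.
    dlm = DLManager(download_dir='/tmp/game',
                    base_url='https://example.invalid/chunks',
                    resume_file='/tmp/game.resume',
                    max_shared_memory=1024 * 1024 * 1024)

    with open('new.manifest', 'rb') as f:
        new_manifest = Manifest.read_all(f.read())  # assumed loader
    with open('old.manifest', 'rb') as f:
        old_manifest = Manifest.read_all(f.read())

    result = dlm.run_analysis(manifest=new_manifest, old_manifest=old_manifest,
                              patch=True, resume=True,
                              file_install_tag=['', 'en-US'])  # '' also keeps untagged files

    print(f'Download size: {result.dl_size / 1024 / 1024:.02f} MiB (compressed)')
    print(f'Install size:  {result.install_size / 1024 / 1024:.02f} MiB')
    print(f'Reused from old install: {result.reuse_size / 1024 / 1024:.02f} MiB')
    print(f'Minimum shared-memory cache: {result.min_memory / 1024 / 1024:.02f} MiB')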
Example #2
    def run_analysis(self,
                     manifest: Manifest,
                     old_manifest: Manifest = None,
                     patch=True,
                     resume=True,
                     file_prefix_filter=None,
                     file_exclude_filter=None,
                     file_install_tag=None) -> AnalysisResult:
        """
        Run analysis on manifest and old manifest (if not None) and return a result
        with a summary of the resources required to install the provided manifest.

        :param manifest: Manifest to install
        :param old_manifest: Old manifest to patch from (if applicable)
        :param patch: Patch instead of redownloading the entire file
        :param resume: Continue based on resume file if it exists
        :param file_prefix_filter: Only download files that start with this prefix
        :param file_exclude_filter: Exclude files with this prefix from download
        :param file_install_tag: Only install files with the specified tag
        :return: AnalysisResult
        """

        analysis_res = AnalysisResult()
        analysis_res.install_size = sum(
            fm.file_size for fm in manifest.file_manifest_list.elements)
        analysis_res.biggest_chunk = max(
            c.window_size for c in manifest.chunk_data_list.elements)
        analysis_res.biggest_file_size = max(
            f.file_size for f in manifest.file_manifest_list.elements)
        is_1mib = analysis_res.biggest_chunk == 1024 * 1024
        self.log.debug(
            f'Biggest chunk size: {analysis_res.biggest_chunk} bytes (== 1 MiB? {is_1mib})'
        )

        self.log.debug('Creating manifest comparison...')
        mc = ManifestComparison.create(manifest, old_manifest)
        analysis_res.manifest_comparison = mc

        if resume and self.resume_file and os.path.exists(self.resume_file):
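            # note: unlike Example #1, this earlier variant treats every line as a
            # bare filename and does not verify that the file still exists or matches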
            try:
                completed_files = set(
                    i.strip() for i in open(self.resume_file).readlines())
                # remove completed files from changed/added and move them to unchanged for the analysis.
                mc.added -= completed_files
                mc.changed -= completed_files
                mc.unchanged |= completed_files
                self.log.debug(
                    f'Skipped {len(completed_files)} files based on resume data!'
                )
            except Exception as e:
                self.log.warning(
                    f'Reading resume file failed: {e!r}, continuing as normal...'
                )

        # Not entirely sure what install tags are used for; only some titles have them.
        # Let's add it for testing anyway.
        if file_install_tag:
            files_to_skip = set(i.filename
                                for i in manifest.file_manifest_list.elements
                                if file_install_tag not in i.install_tags)
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on install tag.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        # if include/exclude prefix has been set: mark all files that are not to be downloaded as unchanged
        if file_exclude_filter:
            file_exclude_filter = file_exclude_filter.lower()
            files_to_skip = set(i for i in mc.added | mc.changed
                                if i.lower().startswith(file_exclude_filter))
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on exclude prefix.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter:
            file_prefix_filter = file_prefix_filter.lower()
            files_to_skip = set(
                i for i in mc.added | mc.changed
                if not i.lower().startswith(file_prefix_filter))
            self.log.info(
                f'Found {len(files_to_skip)} files to skip based on include prefix.'
            )
            mc.added -= files_to_skip
            mc.changed -= files_to_skip
            mc.unchanged |= files_to_skip

        if file_prefix_filter or file_exclude_filter or file_install_tag:
            self.log.info(
                f'Remaining files after filtering: {len(mc.added) + len(mc.changed)}'
            )
            # correct install size after filtering
            analysis_res.install_size = sum(
                fm.file_size for fm in manifest.file_manifest_list.elements
                if fm.filename in mc.added)

        if mc.removed:
            analysis_res.removed = len(mc.removed)
            self.log.debug(f'{analysis_res.removed} removed files')
        if mc.added:
            analysis_res.added = len(mc.added)
            self.log.debug(f'{analysis_res.added} added files')
        if mc.changed:
            analysis_res.changed = len(mc.changed)
            self.log.debug(f'{analysis_res.changed} changed files')
        if mc.unchanged:
            analysis_res.unchanged = len(mc.unchanged)
            self.log.debug(f'{analysis_res.unchanged} unchanged files')

        # count references to chunks for determining runtime cache size later
        references = Counter()
        for fm in manifest.file_manifest_list.elements:
            # chunks of unchanged files are not downloaded so we can skip them
            if fm.filename in mc.unchanged:
                analysis_res.unchanged += fm.file_size
                continue

            for cp in fm.chunk_parts:
                references[cp.guid_num] += 1

        # determine reusable chunks and prepare lookup table for reusable ones
        re_usable = defaultdict(dict)
        if old_manifest and mc.changed and patch:
            self.log.debug('Analyzing manifests for re-usable chunks...')
            for changed in mc.changed:
                old_file = old_manifest.file_manifest_list.get_file_by_path(
                    changed)
                new_file = manifest.file_manifest_list.get_file_by_path(
                    changed)

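                # exact-match reuse only: a chunk part is reused solely when its
                # (guid, offset, size) triple is unchanged (Example #1 relaxes this
                # to containment within an old chunk part)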
                existing_chunks = dict()
                off = 0
                for cp in old_file.chunk_parts:
                    existing_chunks[(cp.guid_num, cp.offset, cp.size)] = off
                    off += cp.size

                for cp in new_file.chunk_parts:
                    key = (cp.guid_num, cp.offset, cp.size)
                    if key in existing_chunks:
                        references[cp.guid_num] -= 1
                        re_usable[changed][key] = existing_chunks[key]
                        analysis_res.reuse_size += cp.size

        last_cache_size = current_cache_size = 0
        # set to determine whether a file is currently cached or not
        cached = set()
        # Using this secondary set is orders of magnitude faster than checking the deque.
        chunks_in_dl_list = set()
        # This is just used to count all unique guids that have been cached
        dl_cache_guids = set()

        # run through the list of files and create the download jobs and also determine minimum
        # runtime cache requirement by simulating adding/removing from cache during download.
        self.log.debug('Creating filetasks and chunktasks...')
        for current_file in sorted(manifest.file_manifest_list.elements,
                                   key=lambda a: a.filename.lower()):
            # skip unchanged and empty files
            if current_file.filename in mc.unchanged:
                continue
            elif not current_file.chunk_parts:
                self.tasks.append(FileTask(current_file.filename, empty=True))
                continue

            existing_chunks = re_usable.get(current_file.filename, None)
            chunk_tasks = []
            reused = 0

            for cp in current_file.chunk_parts:
                ct = ChunkTask(cp.guid_num, cp.offset, cp.size)

                # re-use the chunk from the existing file if we can
                if existing_chunks and (cp.guid_num, cp.offset,
                                        cp.size) in existing_chunks:
                    reused += 1
                    ct.chunk_file = current_file.filename
                    ct.chunk_offset = existing_chunks[(cp.guid_num, cp.offset,
                                                       cp.size)]
                else:
                    # add to DL list if not already in it
                    if cp.guid_num not in chunks_in_dl_list:
                        self.chunks_to_dl.append(cp.guid_num)
                        chunks_in_dl_list.add(cp.guid_num)

                    # if chunk has more than one use or is already in cache,
                    # check if we need to add or remove it again.
                    if references[cp.guid_num] > 1 or cp.guid_num in cached:
                        references[cp.guid_num] -= 1

                        # delete from cache if no references left
                        if references[cp.guid_num] < 1:
                            current_cache_size -= analysis_res.biggest_chunk
                            cached.remove(cp.guid_num)
                            ct.cleanup = True
                        # add to cache if not already cached
                        elif cp.guid_num not in cached:
                            dl_cache_guids.add(cp.guid_num)
                            cached.add(cp.guid_num)
                            current_cache_size += analysis_res.biggest_chunk
                    else:
                        ct.cleanup = True

                chunk_tasks.append(ct)

            if reused:
                self.log.debug(
                    f' + Reusing {reused} chunks from: {current_file.filename}'
                )
                # open temporary file that will contain download + old file contents
                self.tasks.append(
                    FileTask(current_file.filename + u'.tmp', fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(
                    FileTask(current_file.filename + u'.tmp', close=True))
                # delete old file and rename temporary
                self.tasks.append(
                    FileTask(current_file.filename,
                             delete=True,
                             rename=True,
                             temporary_filename=current_file.filename +
                             u'.tmp'))
            else:
                self.tasks.append(FileTask(current_file.filename, fopen=True))
                self.tasks.extend(chunk_tasks)
                self.tasks.append(FileTask(current_file.filename, close=True))

            # check if runtime cache size has changed
            if current_cache_size > last_cache_size:
                self.log.debug(
                    f' * New maximum cache size: {current_cache_size / 1024 / 1024:.02f} MiB'
                )
                last_cache_size = current_cache_size

        self.log.debug(
            f'Final cache size requirement: {last_cache_size / 1024 / 1024:.02f} MiB.'
        )
        analysis_res.min_memory = last_cache_size + (
            1024 * 1024 * 32)  # add some padding just to be safe

        # Todo implement on-disk caching to avoid this issue.
        if analysis_res.min_memory > self.max_shared_memory:
            shared_mib = f'{self.max_shared_memory / 1024 / 1024:.01f} MiB'
            required_mib = f'{analysis_res.min_memory / 1024 / 1024:.01f} MiB'
            raise MemoryError(
                f'Current shared memory cache is smaller than required! {shared_mib} < {required_mib}'
            )

        # calculate actual dl and patch write size.
        analysis_res.dl_size = \
            sum(c.file_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)
        analysis_res.uncompressed_dl_size = \
            sum(c.window_size for c in manifest.chunk_data_list.elements if c.guid_num in chunks_in_dl_list)

        # add jobs to remove files
        for fname in mc.removed:
            self.tasks.append(FileTask(fname, delete=True))

        analysis_res.num_chunks_cache = len(dl_cache_guids)
        self.chunk_data_list = manifest.chunk_data_list
        self.analysis = analysis_res

        return analysis_res
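
Example #2 is an earlier revision of the same method: resume entries are trusted as bare filenames, install tags are matched as a single string, chunk reuse requires an exact (guid, offset, size) match instead of containment, and it lacks the processing-order optimization, the install-tag deletion tasks, and the shared-memory suggestion in the error message. Neither excerpt shows the writer side of the resume file; from Example #1's parser (partition(':') plus the hash_map filled in during analysis), a compatible writer might look like this hypothetical helper:

    # Hypothetical helper (not part of either excerpt): append a resume entry
    # in the '<sha1 hex>:<filename>' format that Example #1's parser expects.
    def record_completed_file(resume_file: str, sha1_hex: str, filename: str) -> None:
        with open(resume_file, 'a', encoding='utf-8') as rf:
            rf.write(f'{sha1_hex}:{filename}\n')

    # e.g. once a file has finished downloading:
    # record_completed_file(dlm.resume_file, dlm.hash_map[finished_file], finished_file)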