def get_dupe_groups(self, files, j=job.nulljob):
    j = j.start_subjob([8, 2])
    for f in (f for f in files if not hasattr(f, "is_ref")):
        f.is_ref = False
    files = remove_dupe_paths(files)
    logging.info("Getting matches. Scan type: %d", self.scan_type)
    matches = self._getmatches(files, j)
    logging.info("Found %d matches", len(matches))
    j.set_progress(100, tr("Removing false matches"))
    # To remove what we call here "false matches", we do several things. First, when scanning
    # by folders, we remove folder matches for which a parent folder is also in a match
    # (they're "duplicated duplicates", if you will). Then, we drop mixed file kinds if the
    # option isn't enabled, keep only matches for which both files still exist and, lastly,
    # drop matches with both files as ref.
    if self.scan_type == ScanType.Folders and matches:
        allpath = {m.first.path for m in matches}
        allpath |= {m.second.path for m in matches}
        sortedpaths = sorted(allpath)
        toremove = set()
        last_parent_path = sortedpaths[0]
        for p in sortedpaths[1:]:
            if p in last_parent_path:
                toremove.add(p)
            else:
                last_parent_path = p
        matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
    if not self.mix_file_kind:
        matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
    matches = [m for m in matches if m.first.path.exists() and m.second.path.exists()]
    matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
    if self.ignore_list:
        j = j.start_subjob(2)
        iter_matches = j.iter_with_progress(matches, tr("Processed %d/%d matches against the ignore list"))
        matches = [
            m for m in iter_matches
            if not self.ignore_list.AreIgnored(str(m.first.path), str(m.second.path))
        ]
    logging.info("Grouping matches")
    groups = engine.get_groups(matches, j)
    matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
    if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
    else:
        # Ticket #195
        # To speed up the scan, we don't bother comparing contents of files that are both ref
        # files. However, this messes up "discarded" counting because there's a missing match
        # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
        # impossible to have discarded matches in exact dupe scans, we simply set it to 0, thus
        # bypassing our tricky problem.
        # Also, although ScanType.FuzzyBlock doesn't always do exact comparisons, we bypass
        # ref comparison there too, thus messing up our "discarded" count. So we're
        # effectively disabling the "discarded" feature in PE, but that's better than falsely
        # reporting discarded matches.
        self.discarded_file_count = 0
    groups = [g for g in groups if any(not f.is_ref for f in g)]
    logging.info("Created %d groups", len(groups))
    j.set_progress(100, tr("Doing group prioritization"))
    for g in groups:
        g.prioritize(self._key_func, self._tie_breaker)
    return groups
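
# The variant above drives progress through a `job` object (start_subjob,
# set_progress, iter_with_progress). A minimal sketch of the interface this
# code appears to assume -- illustrative only, the real job module may differ:
class NullJob:
    def start_subjob(self, job_proportions):
        # Hands back a job covering a weighted slice of this job's progress.
        return self

    def set_progress(self, progress, desc=""):
        pass  # a real job would forward this to a progress callback

    def iter_with_progress(self, iterable, desc_format=None):
        # Yields items unchanged; a real job would report progress as it goes.
        yield from iterable

nulljob = NullJob()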

def get_dupe_groups(self, files, ignore_list=None, j=job.nulljob):
    for f in (f for f in files if not hasattr(f, "is_ref")):
        f.is_ref = False
    files = remove_dupe_paths(files)
    logging.info("Getting matches. Scan type: %d", self.scan_type)
    matches = self._getmatches(files, j)
    logging.info("Found %d matches", len(matches))
    j.set_progress(100, tr("Almost done! Fiddling with results..."))
    # To remove what we call here "false matches", we do several things. First, when scanning
    # by folders, we remove folder matches for which a parent folder is also in a match
    # (they're "duplicated duplicates", if you will). Then, we drop mixed file kinds if the
    # option isn't enabled, keep only matches for which both files still exist and, lastly,
    # drop matches with both files as ref.
    if self.scan_type == ScanType.FOLDERS and matches:
        allpath = {m.first.path for m in matches}
        allpath |= {m.second.path for m in matches}
        sortedpaths = sorted(allpath)
        toremove = set()
        last_parent_path = sortedpaths[0]
        for p in sortedpaths[1:]:
            if p in last_parent_path:
                toremove.add(p)
            else:
                last_parent_path = p
        matches = [m for m in matches if m.first.path not in toremove or m.second.path not in toremove]
    if not self.mix_file_kind:
        matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
    matches = [m for m in matches if m.first.path.exists() and m.second.path.exists()]
    matches = [m for m in matches if not (m.first.is_ref and m.second.is_ref)]
    if ignore_list:
        matches = [m for m in matches if not ignore_list.are_ignored(str(m.first.path), str(m.second.path))]
    logging.info("Grouping matches")
    groups = engine.get_groups(matches)
    if self.scan_type in {
        ScanType.FILENAME,
        ScanType.FIELDS,
        ScanType.FIELDSNOORDER,
        ScanType.TAG,
    }:
        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
    else:
        # Ticket #195
        # To speed up the scan, we don't bother comparing contents of files that are both ref
        # files. However, this messes up "discarded" counting because there's a missing match
        # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
        # impossible to have discarded matches in exact dupe scans, we simply set it to 0, thus
        # bypassing our tricky problem.
        # Also, although ScanType.FuzzyBlock doesn't always do exact comparisons, we bypass
        # ref comparison there too, thus messing up our "discarded" count. So we're
        # effectively disabling the "discarded" feature in PE, but that's better than falsely
        # reporting discarded matches.
        self.discarded_file_count = 0
    groups = [g for g in groups if any(not f.is_ref for f in g)]
    logging.info("Created %d groups", len(groups))
    for g in groups:
        g.prioritize(self._key_func, self._tie_breaker)
    return groups
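
# The folder-scan pruning in both versions above exploits the fact that sorting
# paths puts a parent directory right before its descendants, so one linear
# pass can collect every path contained in the last seen "parent". A standalone
# sketch using plain strings (the real code uses path objects whose `in`
# operator tests containment):
def prune_child_paths(paths):
    sortedpaths = sorted(paths)
    toremove = set()
    last_parent_path = sortedpaths[0]
    for p in sortedpaths[1:]:
        # stand-in for `p in last_parent_path` on path objects
        if p.startswith(last_parent_path + "/"):
            toremove.add(p)
        else:
            last_parent_path = p
    return toremove

# prune_child_paths(["/a", "/a/b", "/a/b/c", "/d"]) -> {"/a/b", "/a/b/c"}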

def can_handle(cls, path):
    return fs.File.can_handle(path) and get_file_ext(path.name) in cls.HANDLED_EXTS

def extension(self):
    return get_file_ext(self.name)
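
# Everything in this section leans on get_file_ext(). A plausible minimal
# implementation, assuming it returns the lowercased extension without the
# leading dot (and "" when there is none) -- the real helper may differ:
def get_file_ext(filename):
    pos = filename.rfind(".")
    if pos > -1:
        return filename[pos + 1:].lower()
    return ""

# get_file_ext("Photo.JPG") -> "jpg"; get_file_ext("README") -> ""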

def can_handle(cls, path):
    if not fs.File.can_handle(path):
        return False
    return get_file_ext(path.name) in auto.EXT2CLASS
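
# EXT2CLASS above suggests a module-level table mapping extensions to concrete
# file classes. A hypothetical sketch of how such a table might be populated
# (the names register/Mp3File are illustrative, not the real module's):
EXT2CLASS = {}

def register(*exts):
    def decorator(cls):
        for ext in exts:
            EXT2CLASS[ext] = cls
        return cls
    return decorator

@register("mp3", "mp2")
class Mp3File:
    def __init__(self, path):
        self.path = path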

def can_handle(cls, path):
    return fs.File.can_handle(path) and get_file_ext(path.name) in cls.HANDLED_EXTS

def can_handle(cls, path):
    if not fs.File.can_handle(path):
        return False
    return get_file_ext(path.name) in SUPPORTED_EXTS
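
# A typical caller for these can_handle() gates: try each candidate class in
# order and instantiate the first one that accepts the path. Hypothetical
# sketch of that dispatch loop:
def get_file(path, fileclasses):
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
            return fileclass(path)
    return None  # no class claims this path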

def _create_sub_file(self, name, with_parent=True):
    parent = self if with_parent else None
    ext = get_file_ext(name)
    if ext in ASSOC:
        return ASSOC[ext](parent, name)
    # implicitly returns None when the extension has no associated class
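
# Because _create_sub_file() falls through to an implicit None for extensions
# missing from ASSOC, callers have to be ready to skip those entries. A
# hypothetical caller sketch:
def _create_sub_files(self, names):
    for name in names:
        sub_file = self._create_sub_file(name)
        if sub_file is not None:  # unknown extension: nothing was created
            yield sub_file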