    def _get(self, image_file_path: str) -> Optional[dict]:
        """
        Get a store entry by its file path.
        :param image_file_path: file path to search for
        :return: elasticsearch result dictionary, or None if no entry exists
        """
        es_query = {
            'query': {
                "constant_score": {
                    "filter": {
                        "term": {'path': image_file_path}
                    }
                }
            }
        }

        query_result = self._store.es.search(index=self._el_index, body=es_query)

        hits = query_result['hits']['hits']

        if len(hits) > 1:
            echo(f"WARNING: More than a single entry for a file, cleaning up: {image_file_path}", color='yellow')
            self.remove(image_file_path)
            self.add(image_file_path)

        if len(hits) == 0:
            return None
        else:
            return hits[0]['_source']
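
For reference, the same constant_score term lookup can be reproduced standalone with elasticsearch-py; a minimal sketch (client setup, index name, and file path are assumptions, not taken from the project):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes Elasticsearch running on localhost:9200

es_query = {
    'query': {
        'constant_score': {
            'filter': {
                # exact, unscored match on the 'path' keyword field
                'term': {'path': '/photos/example.jpg'}  # hypothetical path
            }
        }
    }
}

result = es.search(index='images', body=es_query)  # hypothetical index name
hits = result['hits']['hits']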
Example #2
    def remove_empty_folders(self):
        """
        Phase 6/6: removes empty folders, if enabled in the configuration.
        """
        phase_6_text = "Phase 6/6: Removing empty folders"
        if not self._config.REMOVE_EMPTY_FOLDERS.value:
            echo(phase_6_text + " - Skipping", color='yellow')
        else:
            echo(phase_6_text, color='cyan')
            self._remove_empty_folders(self._config.SOURCE_DIRECTORIES.value, self._config.RECURSIVE.value)
Example #3
    @staticmethod
    def _echo_table(table: str):
        lines = table.splitlines()

        # the first two lines of a tabulate table are the header row and its separator
        for line in lines[:2]:
            echo(line, color='cyan')

        # print the table body without color
        for line in lines[2:]:
            echo(line)
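
For context, this is what those first two lines look like; a standalone sketch with made-up data:

from tabulate import tabulate

table = tabulate([[1, 'a.jpg'], [2, 'b.jpg']], headers=('Id', 'File'))
lines = table.splitlines()
# lines[0] is the header row, lines[1] the separator, lines[2:] the body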
Example #4
    def find_duplicates_of_file(self, root_directories: List[Path], root_directory: Path, reference_file_path: Path):
        """
        Finds duplicates and marks all but the best copy as "to-be-deleted".
        :param root_directories: valid root directories
        :param root_directory: root directory of reference_file_path
        :param reference_file_path: the file to check for duplicates
        """
        self._progress_manager.inc()
        self._progress_manager.set_postfix(self._truncate_middle(reference_file_path))

        # remember processed files to prevent processing files in multiple directions
        if reference_file_path in self._processed_files:
            # already found a better candidate for this file
            return

        duplicate_candidates = self._persistence.find_similar(str(reference_file_path))

        if self._config.SEARCH_ACROSS_ROOT_DIRS.value:
            # filter by files in at least one of the specified root directories
            # this is necessary because the database might hold items for other paths already
            # and those are not interesting to us
            duplicate_candidates = [
                candidate for candidate in duplicate_candidates if
                any(root_dir in Path(candidate[MetadataKey.PATH.value]).parents for root_dir in root_directories)
            ]
        else:
            # filter by files in the same root directory
            duplicate_candidates = [
                candidate for candidate in duplicate_candidates if
                root_directory in Path(candidate[MetadataKey.PATH.value]).parents
            ]

        if len(duplicate_candidates) <= 0:
            echo(f"No duplication candidates found in database for '{reference_file_path}'. "
                 "This is an indication that the file has not been analyzed yet or "
                 "there was an issue analyzing it.",
                 color='yellow')

        if len(duplicate_candidates) <= 1:
            for candidate in duplicate_candidates:
                candidate_path = Path(candidate[MetadataKey.PATH.value])

                if candidate_path != reference_file_path:
                    echo(f"Unexpected unique duplication candidate '{candidate_path}' for "
                         f"reference file '{reference_file_path}'", color='yellow')

                self._processed_files[candidate_path] = True

            # nothing to do here since the result is unique
            return

        # sort by quality criteria and redo the search to use the best candidate as the reference image
        sorted_duplicate_candidates = self._sort_by_quality_descending(duplicate_candidates)
        new_reference_file_path = sorted_duplicate_candidates[0][MetadataKey.PATH.value]
        duplicate_candidates = self._persistence.find_similar(new_reference_file_path)

        candidates_to_keep, candidates_to_delete = self._select_images_to_delete(duplicate_candidates)
        self._save_duplicates_for_result(candidates_to_keep, candidates_to_delete)
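
For reference, the Path.parents containment check used in both filters above behaves like this (a standalone sketch with hypothetical paths):

from pathlib import Path

root_dir = Path('/photos/2021')                  # hypothetical root directory
candidate = Path('/photos/2021/vacation/a.jpg')  # hypothetical candidate path

print(root_dir in candidate.parents)   # True: root_dir is an ancestor of candidate
print(candidate in candidate.parents)  # False: a path is not its own ancestor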
Example #5
def c_deduplicate(skip_analyse_phase: bool, dry_run: bool):
    config = DeduplicatorConfig()
    if dry_run is not None:
        config.DRY_RUN.value = dry_run
    deduplicator = ImageMatchDeduplicator(interactive=True)
    result = deduplicator.deduplicate_all(skip_analyze_phase=skip_analyse_phase)

    echo()
    result.print_to_console()
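
This looks like the body of a click command; a minimal sketch of how it might be wired up (the option names and flags are assumptions, not taken from the project):

import click

@click.command(name='deduplicate')
@click.option('--skip-analyse-phase', is_flag=True, default=False)
@click.option('--dry-run/--no-dry-run', default=None)  # None keeps the configured value
def c_deduplicate(skip_analyse_phase: bool, dry_run: bool):
    ...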
Example #6
    def analyse_all(self):
        """
        Runs the analysis phase independently.
        """
        directories = self._config.SOURCE_DIRECTORIES.value

        echo("Phase 1/2: Counting files ...", color='cyan')
        directory_map = self._count_files(directories)

        echo("Phase 2/2: Analyzing files ...", color='cyan')
        self.analyze_directories(directory_map)
Example #7
    def process_duplicates(self):
        """
        Moves or removes duplicates based on the configuration
        """
        dry_run = self._config.DRY_RUN.value
        duplicate_target_directory = self._config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value
        if duplicate_target_directory:
            echo("Phase 5/6: Moving duplicates ...", color='cyan')
            self._move_files_marked_as_delete(duplicate_target_directory, dry_run)
        else:
            echo("Phase 5/6: Removing duplicates ...", color='cyan')
            self._remove_files_marked_as_delete(dry_run)
Example #8
    def find_similar(self, reference_image_file_path: str) -> List[dict]:
        """
        Finds entries similar to the given reference image.
        :param reference_image_file_path: file path of the reference image
        :return: list of elasticsearch result dictionaries
        """
        try:
            entry = self._get(reference_image_file_path)
            if entry is not None:
                return list(self._store.search_single_record(entry))
            else:
                return self._store.search_image(reference_image_file_path, all_orientations=True)
        except Exception as e:
            echo(f"Error querying database for similar images of '{reference_image_file_path}': {e}", color="red")
            return []
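
The _store used here matches the API of the image_match library's Elasticsearch driver; a minimal usage sketch of that library (index name and file paths are assumptions):

from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

es = Elasticsearch()                   # assumes Elasticsearch on localhost:9200
ses = SignatureES(es, index='images')  # hypothetical index name

ses.add_image('/photos/reference.jpg')
matches = ses.search_image('/photos/query.jpg', all_orientations=True)
for match in matches:
    print(match['path'], match['dist'])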
Example #9
    def analyze_file(self, file_path: Path):
        """
        Analyzes a single file
        :param file_path: the file path
        """
        self._progress_manager.set_postfix(self._truncate_middle(file_path))

        try:
            self._persistence.add(str(file_path))
        except Exception as e:
            logging.exception(e)
            echo(f"Error analyzing file '{file_path}': {e}")
        finally:
            self._progress_manager.inc()
Example #10
    def on_any_event(self, event):
        if not self._event_matches_filter(event):
            return

        FILE_EVENT_COUNT.labels(type=event.event_type).inc()

        echo("FileSystemEvent: {} {} {}".format(
            event.event_type, "directory" if event.is_directory else "file",
            event.src_path))

        _actions = {
            EVENT_TYPE_CREATED: self.created,
            EVENT_TYPE_MODIFIED: self.modified,
            EVENT_TYPE_MOVED: self.moved,
            EVENT_TYPE_DELETED: self.deleted,
        }
        _actions[event.event_type](event)
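
This handler follows the watchdog API; a sketch of how such a handler is typically registered, together with a plausible definition of the FILE_EVENT_COUNT metric (both are assumptions, not taken from the project):

from prometheus_client import Counter
from watchdog.observers import Observer

# hypothetical metric definition matching the .labels(type=...) call above
FILE_EVENT_COUNT = Counter('file_event_count', 'Number of file system events', ['type'])

# 'handler' stands for an instance of the event handler class shown above
observer = Observer()
observer.schedule(handler, path='/watched/directory', recursive=True)
observer.start()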
Example #11
    def _remove_folders(self, root_path: Path, folders: List[str], dry_run: bool):
        """
        Removes the given (empty) folders.
        :param root_path: the root path the folders were found in
        :param folders: the folders to remove
        :param dry_run: if True, folders are not actually removed
        """
        echo(f"Removing empty folders ({len(folders)}) in: '{root_path}' ...")

        if len(folders) == 0:
            return

        self._progress_manager.start("Removing empty folders", len(folders), "Folder", self.interactive)
        for folder in folders:
            self._progress_manager.set_postfix(self._truncate_middle(folder))

            if not dry_run:
                os.rmdir(folder)

            self._deduplication_result.add_removed_empty_folder(folder)
            self._progress_manager.inc()
        self._progress_manager.clear()
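
A folder list like the one passed in here could be collected with a bottom-up os.walk, for example (a sketch, not the project's actual implementation):

import os

def find_empty_folders(root_path: str) -> list:
    """Collects folders containing no files and only empty subfolders."""
    empty = []
    # walk bottom-up so children are classified before their parent
    for dirpath, dirnames, filenames in os.walk(root_path, topdown=False):
        subfolders = [os.path.join(dirpath, d) for d in dirnames]
        if not filenames and all(s in empty for s in subfolders):
            empty.append(dirpath)
    return empty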
Example #12
def c_daemon(dry_run: bool):
    echo("Starting daemon...")

    config: DeduplicatorConfig = DeduplicatorConfig()
    if dry_run is not None:
        config.DRY_RUN.value = dry_run

    if config.STATS_ENABLED.value:
        from prometheus_client import start_http_server
        echo("Starting prometheus reporter...")
        start_http_server(config.STATS_PORT.value)

    deduplicator = ImageMatchDeduplicator(interactive=False)
    processing_manager = ProcessingManager(deduplicator)

    deduplicator.deduplicate_all()
    processing_manager.start()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        processing_manager.stop()
Example #13
    def cleanup_database(self, directories: List[Path]):
        """
        Removes database entries of files that don't exist on disk.
        Note that this cleanup will only consider files within one
        of the root directories specified in the constructor, as other file paths
        might have been added on other machines.
        :param directories: directories in this run
        """
        # TODO: This iterates through all db entries - even the ones we are ignoring.
        # The db query should be improved to speed this up

        count, entries = self._persistence.get_all()
        if count <= 0:
            return

        self._progress_manager.start(f"Cleanup database", count, "entries",
                                     self.interactive)
        for entry in entries:
            try:
                image_entry = entry['_source']
                metadata = image_entry[MetadataKey.METADATA.value]

                file_path = Path(image_entry[MetadataKey.PATH.value])
                self._progress_manager.set_postfix(
                    self._truncate_middle(str(file_path)))

                if MetadataKey.DATAMODEL_VERSION.value not in metadata:
                    echo(f"Removing db entry with missing db model version number: {file_path}")
                    self._persistence.remove(str(file_path))
                    continue

                data_version = metadata[MetadataKey.DATAMODEL_VERSION.value]
                if data_version != self._persistence.DATAMODEL_VERSION:
                    echo(f"Removing db entry with old db model version: {file_path}")
                    self._persistence.remove(str(file_path))
                    continue

                # filter by files in at least one of the specified root directories
                # this is necessary because the database might hold items for other paths already
                # and those are not interesting to us
                if not any(root_dir in file_path.parents
                           for root_dir in directories):
                    continue

                if not file_path.exists():
                    echo(f"Removing db entry for missing file: {file_path}")
                    self._persistence.remove(str(file_path))

            finally:
                self._progress_manager.inc()
        self._progress_manager.clear()
Example #14
    def print_to_console(self):
        # center the 7-character title over the 21-character separator line
        title = " " * 7 + "Summary"
        echo(title, color='cyan')
        echo('=' * 21, color='cyan')
        echo(f"Files with duplicates: {self.get_duplicate_count()}")
        echo(f"Files moved: {len(self.get_file_with_action(ActionEnum.MOVE))}")
        echo(f"Files deleted: {len(self.get_file_with_action(ActionEnum.DELETE))}")

        headers = ("Action", "File path", "Dist", "Filesize", "Pixels")

        for reference_file_path, folder in self.get_file_duplicates().items():
            duplicate_count = len(folder)
            if duplicate_count > 0:
                columns = []
                echo()

                # the reference file is listed first, followed by its duplicates
                for item in [self._reference_files[reference_file_path]] + folder:
                    file_path = Path(item[MetadataKey.PATH.value])
                    distance = item[MetadataKey.DISTANCE.value]
                    distance_rounded = round(distance, 3)
                    file_size = item[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value]
                    file_size_mb = round(file_size / BYTE_IN_A_MB, 3)
                    pixel_count = item[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value]

                    action = self.item_actions.get(file_path, ActionEnum.NONE)
                    row = [action.name, file_path, distance_rounded, file_size_mb, pixel_count]

                    # apply the action's color to every cell in the row
                    row = [click.style(str(x), fg=action.color) for x in row]
                    columns.append(row)

                self._echo_table(
                    tabulate(
                        columns,
                        headers=headers,
                        colalign=['center', 'left', 'left', 'right', 'right']))

        echo()
        echo(f"Removed (empty) folders ({len(self.get_removed_empty_folders())}):")
        for folder in self.get_removed_empty_folders():
            echo(f"{folder}", color='red')
Example #15
    def deduplicate_all(self, skip_analyze_phase: bool = False) -> DeduplicationResult:
        """
        Runs the full 6 deduplication phases.
        :param skip_analyze_phase: useful if you already did a dry run and want to do a real run afterwards
        :return: result of the operation
        """
        # see: https://stackoverflow.com/questions/14861891/runtimewarning-invalid-value-encountered-in-divide
        # and: https://stackoverflow.com/questions/29347987/why-cant-i-suppress-numpy-warnings
        # numpy.warnings was merely an alias of the stdlib warnings module and was
        # removed in newer NumPy releases, so use the stdlib module directly
        import warnings
        warnings.filterwarnings('ignore')

        directories = self._config.SOURCE_DIRECTORIES.value
        if len(directories) <= 0:
            raise ValueError("No root directories to scan")

        if self._config.DRY_RUN.value:
            echo("==> DRY RUN! No files or folders will actually be deleted! <==", color='yellow')

        echo("Phase 1/6: Cleaning up database ...", color='cyan')
        self.cleanup_database(directories)

        echo("Phase 2/6: Counting files ...", color='cyan')
        directory_map = self._count_files(directories)

        phase_3_text = "Phase 3/6: Analyzing files"
        if skip_analyze_phase:
            echo(phase_3_text + " - Skipping", color='yellow')
        else:
            echo(phase_3_text, color='cyan')
            self.analyze_directories(directory_map)

        echo("Phase 4/6: Finding duplicate files ...", color='cyan')
        self.find_duplicates_in_directories(directory_map)

        # Phase 5/6: Move or Delete duplicate files
        self.process_duplicates()

        self.remove_empty_folders()

        return self._deduplication_result
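
A minimal sketch of driving this entry point end to end, based only on the calls visible in the snippets above (the source directory is hypothetical, project imports elided):

from pathlib import Path

config = DeduplicatorConfig()
config.SOURCE_DIRECTORIES.value = [Path('/photos')]  # hypothetical root directory
config.DRY_RUN.value = True                          # simulate only, delete nothing

deduplicator = ImageMatchDeduplicator(interactive=True)
result = deduplicator.deduplicate_all(skip_analyze_phase=False)
result.print_to_console()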