Exemplo n.º 1
0
def compare(files, threshold):
    """Fingerprint copies of ``files`` and print the duplicates of each one.

    Every path in ``files`` is copied into a newly created temporary
    directory (whose path is printed first), the copies are fingerprinted
    through ``utils.apply_async(calculate, ...)``, and the value returned by
    ``get_duplicates`` is printed for each copy. The temporary directory is
    not removed afterwards.

    :param files: iterable of paths to copy and compare.
    :param threshold: forwarded unchanged to ``Fingerprints``.
    :returns: None; all results are written to stdout.
    """
    import glob
    import tempfile

    workdir = tempfile.mkdtemp()
    print(workdir)

    for original in files:
        shutil.copy2(original, workdir)

    # From here on only the copies inside the temporary directory are used.
    copied_paths = glob.glob(os.path.join(workdir, '*'))
    index = Fingerprints(output_dir=workdir, threshold=threshold)
    # Published as a module global; presumably get_duplicates() reads it
    # from there -- TODO confirm against get_duplicates.
    globals()['fingerprints'] = index

    jobs = [(workdir, copied_path) for copied_path in copied_paths]
    for fingerprint in utils.apply_async(calculate, jobs):
        index.add(fingerprint)
    index.uncompress()

    for copied_path in copied_paths:
        print(get_duplicates(index.find_hash(copied_path)))
Exemplo n.º 2
0
def process(sources, dest, threshold, ignore, clean, normalize, cpus):
    """Fingerprint every file under ``sources``, group duplicates, and copy
    one file per group into ``dest``.

    Stages, in order:

    1. Walk ``sources`` (skipping ``ignore`` and ``dest``) to collect files.
    2. Optionally run ``utils.normalize_image`` over them.
    3. Calculate fingerprints for files ``Fingerprints`` has no hash for.
    4. Look up duplicates for every file, caching the result in
       ``<dest>/.duplicates.json``. The cache is reused only when it was
       written with the same ``threshold`` and already covers every file.
    5. For each group, copy the file chosen by ``get_best_match`` into
       ``dest``; non-exact matches are additionally copied to
       ``<dest>/duplicates/<n>/`` for spot checking.

    :param sources: iterable of paths to scan.
    :param dest: output directory; also holds the duplicates cache file.
    :param threshold: forwarded to ``Fingerprints`` and stored in the cache.
    :param ignore: iterable of paths to skip while walking.
    :param clean: forwarded to ``utils.normalize_image``.
    :param normalize: when truthy, run the normalization stage.
    :param cpus: forwarded to ``utils.apply_async``.
    :raises SystemExit: when no files are found under ``sources``.
    :returns: None.
    """
    sources = utils.normalize_paths(list(sources))
    # dest is added to the ignore list, presumably so earlier output is not
    # picked up again as input.
    ignore = utils.normalize_paths(list(ignore) + [dest])

    fingerprints = Fingerprints(output_dir=dest, threshold=threshold, ignore=ignore)
    duplicates_file = os.path.join(dest, ".duplicates.json")

    print("Max distance of similarity: {0}".format(fingerprints.max_distance))

    # to get progress, count the total number of files we will touch
    files = []
    for source in sources:
        files.extend(utils.walker(source, lambda x: x, ignore=ignore))

    print("Processing {0} files.".format(len(files)))

    if not files:
        print("No files found.")
        # quit() is the interactive-shell helper installed by the site
        # module; raising SystemExit directly exits the same way without
        # depending on it.
        raise SystemExit

    if normalize:
        print("Normalizing images...")
        # The normalization results replace the file list used below.
        files = utils.apply_async(utils.normalize_image, [
            (fingerprints._backup_dir, filepath, clean) for filepath in files
        ], cpus=cpus)

    need_fingerprint = [
        (fingerprints._backup_dir, filepath)
        for filepath in files if not fingerprints.find_hash(filepath)
    ]

    if need_fingerprint:
        def save_fingerprints(results):
            # Also used as the abort callback so partial work is kept.
            for r in results:
                fingerprints.add(r)
            fingerprints.save()
            print("Saved.")

        print("Calculating fingerprints for {0} files...".format(len(need_fingerprint)))
        results = utils.apply_async(calculate, need_fingerprint,
                                    aborted_callback=save_fingerprints, cpus=cpus)
        save_fingerprints(results)

    fingerprints.uncompress()

    output = {"threshold": threshold, "duplicates": {}}
    if os.path.exists(duplicates_file):
        try:
            with open(duplicates_file, 'r') as fp:
                loaded = json.load(fp)
        except ValueError:
            # The cache is written in place (also from the abort callback),
            # so it can be truncated; it only holds derived data, so rebuild.
            print("Ignoring unreadable {0}.".format(duplicates_file))
        else:
            if isinstance(loaded, dict) and loaded.get("threshold") == threshold:
                output = loaded

    # setdefault (not get) so the mapping we fill in is always the one that
    # gets serialized with ``output``, even if the loaded file lacked the key.
    output_duplicates = output.setdefault("duplicates", {})

    # we need to recalculate *all* duplicate information if there is a new file
    # added since the last time we ran, since it might be a duplicate of anything
    # TODO: handle ctrl-c again
    if set(files).difference(output_duplicates):
        need_duplicate = files
        output_duplicates.clear()
    else:
        # Every file already has an entry in the cache; nothing to look up.
        need_duplicate = []

    if need_duplicate:
        print("Looking for duplicates for {0} files...".format(len(need_duplicate)))

        def save_duplicates(results):
            # Also used as the abort callback so partial work is kept.
            for image_hash, duplicates in results:
                if not image_hash:
                    continue
                output_duplicates[image_hash.path] = duplicates

            with open(duplicates_file, "w") as fp:
                json.dump(output, fp)

            print("Saved.")

        def setup(fingerprints):
            """
            hack: store the fingerprints into globals for each process so the worker
            function can pull them from globals()
            """
            globals()['fingerprints'] = fingerprints

        results = utils.apply_async(
            get_duplicates,
            [(fingerprints.find_hash(filepath),) for filepath in need_duplicate],
            pool_args=dict(
                initializer=setup,
                initargs=(fingerprints,)
            ),
            aborted_callback=save_duplicates,
            cpus=cpus
        )

        save_duplicates(results)

    keys = sorted(output_duplicates, key=get_file_date)

    # A set keeps the "already handled" check O(1) per file.
    seen_files = set()
    count = 0
    duplicates_count = 0
    duplicates_dest = os.path.join(dest, "duplicates")
    for filepath in keys:
        # Skip files already covered as a duplicate of an earlier key.
        if filepath in seen_files:
            continue

        duplicates = output_duplicates[filepath]
        count += 1
        seen_files.add(filepath)
        seen_files.update(duplicate_path for _, duplicate_path in duplicates)

        all_files = [(0, filepath)] + duplicates
        distance, best_image = get_best_match(all_files)
        new_filepath = get_new_image_path(best_image, count, dest)
        print("Copying {0} to {1}".format(best_image, new_filepath))
        shutil.copy2(best_image, new_filepath)

        if not duplicates:
            continue

        duplicates_count += 1
        filedest = os.path.join(duplicates_dest, str(duplicates_count))

        # The key file is listed with the best match's distance, so it is
        # copied for review whenever a non-identical file was kept instead.
        candidates = [(distance, filepath)] + duplicates
        for idx, (dup_distance, dup_path) in enumerate(candidates, start=1):
            # only copy non-exact matches for spot checking
            if dup_distance > 0 and dup_path != best_image:
                if not os.path.isdir(filedest):
                    os.makedirs(filedest)
                shutil.copy2(
                    dup_path,
                    get_new_image_path(
                        dup_path,
                        idx,
                        filedest,
                        # Prefix the distance; the remaining placeholders are
                        # left for get_new_image_path to fill in.
                        "{0}_{1}".format(dup_distance, "{0}{1}")
                    )
                )

        # The directory exists only if something was copied above; in that
        # case also drop in the kept file for side-by-side comparison.
        if os.path.isdir(filedest):
            shutil.copy2(
                new_filepath, get_new_image_path(new_filepath, None, filedest, "kept{1}")
            )