示例#1
0
def main():
    """Command-line entry point: generate image signatures for the database.

    Parses the command line, opens the database named by the common ``db``
    option, runs an ``ImageIndexer`` for the requested signatures and then,
    for every requested signature, asks an ``ImageComperator`` to ensure that
    differences are calculated for the requested minimum similarity.

    Invalid option values are reported through ``parser.error`` (usage
    message, exit status 2) before the database is opened.
    """
    parser = argparse.ArgumentParser(
        description='Generate image signatures for files in database.')
    add_common_command_line_arguments(parser)
    parser.add_argument('-T', '--threads', metavar='THREADS',
                        nargs=1,
                        default=[DEFAULT_THREADS],
                        help=("Specify the number of threads to use. 'C' is " +
                              "substituted for the number of cores. " +
                              "Default is %r. Examples: '1', '10' or '1.5C'.")
                        % DEFAULT_THREADS)
    # type=float lets argparse reject a malformed value at parse time rather
    # than after the (potentially long) indexing run has already finished.
    parser.add_argument('-s', '--min-similarity',
                        type=float,
                        default=0.8,
                        help='require at least this image similarity')
    # With nargs=1 a value given on the command line arrives as a one-element
    # list, so the default must be a list too; otherwise ``[0]`` below would
    # pick the first *character* of the default string.
    parser.add_argument('-S', '--signatures',
                        nargs=1,
                        default=["3"],
                        help=("Which signatures should be computed. " +
                              "Default is %r. Examples: '1,2', '3' or 'all'.")
                        % "3")

    args = parser.parse_args()

    requested = args.signatures[0]
    if requested == "all":
        signatures = [1, 2, 3, 4, 5]
    else:
        try:
            signatures = [int(x) for x in requested.split(",")]
        except ValueError:
            # parser.error() prints the usage text and exits with status 2.
            parser.error("invalid signature list %r; expected e.g. '1,2', "
                         "'3' or 'all'" % requested)
    sim = args.min_similarity

    repo = dfr.db.Database(args.db[0])
    threads = eval_thread_config(args.threads[0])
    indexer = ImageIndexer(repo, signatures, parallel_threads=threads)
    indexer.run()
    for iht in signatures:
        comperator = ImageComperator(repo, iht)
        comperator.ensure_that_differences_are_calculated(sim)
示例#2
0
def main():
    """Command-line entry point: index the given directories.

    Builds the argument parser (positional root directories, the common
    options, and the ``-x`` / ``-X`` exclusion options), opens the database
    named by the ``db`` option, turns the exclusion options into regular
    expressions via ``cleanup`` and ``globs_to_regexp`` and finally runs a
    ``BitIndexer`` over the requested roots.
    """
    parser = argparse.ArgumentParser(
        description='Index directories recursive.')
    parser.add_argument(
        'roots', metavar='DIR', nargs='*', default=["."],
        help="a directory to index (if not given '.' will be used)")
    add_common_command_line_arguments(parser)

    # Both exclusion options share the same shape; only the flag names, the
    # first part of the help text and the default pattern differ.
    exclusion_options = (
        ('-x', '--exclude-files',
         "Exclude files based on comma separated ",
         DEFAULT_FILE_EXCLUDE),
        ('-X', '--exclude-dirs',
         "Exclude directories based on comma separated ",
         DEFAULT_DIR_EXCLUDE),
    )
    for short_flag, long_flag, help_intro, default_pattern in exclusion_options:
        parser.add_argument(
            short_flag, long_flag, metavar='GLOBPATTERNS',
            nargs=1, action="append",
            default=[default_pattern],
            help=(help_intro + "glob patterns. Default is %r.") %
            default_pattern)

    options = parser.parse_args()
    database = dfr.db.Database(options.db[0])

    # cleanup() receives the raw parsed value together with the default
    # pattern; its exact normalisation is defined elsewhere.
    file_globs = cleanup(options.exclude_files, DEFAULT_FILE_EXCLUDE)
    dir_globs = cleanup(options.exclude_dirs, DEFAULT_DIR_EXCLUDE)
    file_regexp = globs_to_regexp(file_globs)
    dir_regexp = globs_to_regexp(dir_globs)

    BitIndexer(database, file_regexp, dir_regexp).run(options.roots)
示例#3
0
def main():
    """Command-line entry point: report statistics per image signature.

    Opens the database named by the common ``db`` option, loads the stored
    image feedback via ``repo.imagefeedback.find()``, prints a summary line
    and a table header, and then calls ``report`` once for each of the five
    signatures using the pairs found by ``ImageSimilarFinder``.
    """
    parser = argparse.ArgumentParser(
        description='Reports statistics about the different image signatures.')
    parser.add_argument('roots', metavar='DIR', nargs='*', default=["."],
                        help="a directory to scan for duplicate files " +
                        "(if not given '.' will be used)")
    add_common_command_line_arguments(parser)
    # NOTE(review): this option is accepted but never read below; the loop
    # uses a fixed similarity threshold per signature instead.
    parser.add_argument('-s', '--min-similarity',
                        default=0.9,
                        help='require at least this image similarity')

    args = parser.parse_args()
    repo = dfr.db.Database(args.db[0])

    known = repo.imagefeedback.find()
    positive = [x for x in known if x.aresimilar == 1]
    total = len(known)
    # With no classified pairs at all the ratio is undefined; report 0.0%
    # instead of failing with ZeroDivisionError.
    if total:
        positive_share = (100.0 * len(positive)) / total
    else:
        positive_share = 0.0

    # The whole formatted string is wrapped in one pair of parentheses so the
    # line means the same thing as a print statement and as a print() call.
    print(("There are %d classified image pairs. %d (%.1f%%) " +
           "are classified as similar.") %
          (total, len(positive), positive_share))
    print(("%10s | %12s | %10s | %12s | %10s | %10s | %10s " +
           "| %10s | %10s | %10s") %
          ("Signature", "Description", "Detected", "Classified", "TP",
           "FP", "FN", "Precision", "Recall", "F-Measure"))

    # (signature id, similarity threshold passed to finder.find()).
    for sig, sim in [(1, 0.95), (2, 0.999), (3, 0.9),
                     (4, 0.95), (5, 0.8)]:
        finder = ImageSimilarFinder(repo, args.roots, sig, 0)
        pairs = list(finder.find(sim))
        report(sig, pairs, known)
    def test_add_common_command_line_arguments(self):
        """Check the default and explicit values of the common ``db`` option."""
        parser = argparse.ArgumentParser()
        add_common_command_line_arguments(parser)

        def db_path(argv):
            # The option is stored as a list; the path is its first element.
            return parser.parse_args(argv).db[0]

        # Without arguments the default is an absolute path to "files.sdb".
        fallback = db_path([])
        self.assertTrue(fallback.startswith("/"))
        self.assertTrue(fallback.endswith("files.sdb"))

        # Both the separated and the "=" spelling must yield the given name.
        for argv in (["--db-file", "foo"], ["--db-file=foo"]):
            self.assertEqual(db_path(argv), "foo")
def main():
    """Command-line entry point: find files with equal or similar content.

    Depending on ``--what`` a finder for similar images, truncated files or
    bit-wise equal files is created; depending on ``--output-type`` a JSON,
    CSV or interactive resolver is paired with it. Every item produced by the
    finder is handed to ``resolver.resolve`` and ``resolver.finished`` is
    called once at the end.

    Any ``--what`` value other than "image" or "truncated" selects the
    bit-equal search, and any unrecognised ``--output-type`` selects the
    interactive resolver.
    """
    parser = argparse.ArgumentParser(
        description='Find files with equal or similar content.')
    parser.add_argument(
        'roots', metavar='DIR', nargs='*', default=["."],
        help=("a directory to scan for duplicate files "
              "(if not given '.' will be used)"))
    add_common_command_line_arguments(parser)
    parser.add_argument(
        '-t', '--output-type', default="interactive",
        help=('determine the output type. Valid values are '
              '"interactive", "csv" and "json". '
              'Default is "interactive".'))
    parser.add_argument(
        '-o', '--output', default="-",
        help=('The output file name. "-" stands for stdout. '
              'Default is "-".'))
    parser.add_argument(
        '-w', '--what', default="bitequal",
        help=('determine what is searched. Valid values are '
              '"bitequal" for files which are equal '
              'bit-wise, "truncated" for files which are '
              'truncated (the larger files consists of the '
              'smaller file and some extra content '
              'afterwards) and "image" to search for similar '
              'images. Default is "bitequal".'))
    parser.add_argument(
        '-s', '--min-similarity', default=0.9,
        help=('require at least this image similarity. '
              'Default is "0.9".'))
    parser.add_argument(
        '-S', '--image-signature', default=3,
        help=('Image signature to use. Valid is 1, 2, 3, 4 '
              'and 5. Default is "3".'))
    parser.add_argument(
        '-n', '--dry-run', action="store_true", dest='dry_run',
        help='do not delete any files')

    options = parser.parse_args()
    database = dfr.db.Database(options.db[0])
    search_mode = options.what
    output_format = options.output_type

    if search_mode == "image":
        signature = int(options.image_signature)
        finder = ImageSimilarFinder(database, options.roots, signature)
        if output_format == "csv":
            resolver = CsvImageSimilarResolver(options.output)
        elif output_format == "json":
            resolver = JsonImageSimilarResolver(options.output)
            # JSON output replaces the finder with the bucket variant.
            finder = ImageSimilarBucketFinder(database, options.roots,
                                              signature)
        else:
            resolver = GuiImageSimilarResolver(options.dry_run)
        matches = finder.find(float(options.min_similarity))
    elif search_mode == "truncated":
        # Only CSV and interactive resolvers are used for this mode.
        if output_format != "csv":
            resolver = InteractiveBitTruncatedResolver(options.dry_run)
        else:
            resolver = CsvBitTruncatedResolver(options.output)
        finder = BitTruncatedFinder(database, options.roots)
        matches = finder.find()
    else:
        finder = BitEqualFinder(database, options.roots)
        if output_format == "csv":
            resolver = CsvBitEqualResolver(options.output)
        elif output_format == "json":
            resolver = JsonBitEqualResolver(options.output)
            # JSON output replaces the finder with the bucket variant.
            finder = BitEqualBucketFinder(database, options.roots)
        else:
            resolver = InteractiveBitEqualResolver(options.dry_run)
        matches = finder.find()

    for match in matches:
        resolver.resolve(match)
    resolver.finished()