def test_delete(self):
    """Deleting a file that still has duplicates must call os.remove;
    deleting the last remaining copy must raise a 409 error and leave
    the file untouched.
    """
    with DataGenerator() as test_scenario:
        duplicates = ('1/a.data', '2/a.data', '3/a.data', '4/a.data')
        test_scenario.create_duplicates(duplicates, size=10)
        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo, patch('os.remove') as mock_path:
            DupScanner(repo).scan((test_scenario.root_path,))

            # All but the last file still have at least one duplicate,
            # so each delete must succeed and hit os.remove once.
            for f in duplicates[:-1]:
                deleteable = test_scenario.abs_path(f)
                repo.delete_file(deleteable)
                mock_path.assert_called_with(deleteable)
                mock_path.reset_mock()

            # The last copy has no duplicates left: deletion must fail.
            try:
                not_deleteable = test_scenario.abs_path(duplicates[-1])
                repo.delete_file(not_deleteable)
                assert False, 'Should have raised an exception'
            except Exception as e:
                expected = Exception(
                    '409 Can\'t delete a file without duplicates: {}'.format(
                        not_deleteable))
                assert repr(expected) == repr(e)

            assert not mock_path.called, 'File should have not been deleted'
            # BUG FIX: original read `mock_path.reset_mock` (no call) —
            # a no-op attribute access; actually invoke the reset.
            mock_path.reset_mock()
def test_delete(self):
    """Files with duplicates can be deleted (os.remove is invoked);
    the sole remaining copy must be rejected with a 409 error and the
    mocked os.remove must never fire for it.
    """
    with DataGenerator() as test_scenario:
        duplicates = ('1/a.data', '2/a.data', '3/a.data', '4/a.data')
        test_scenario.create_duplicates(duplicates, size=10)
        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo, patch('os.remove') as mock_path:
            DupScanner(repo).scan((test_scenario.root_path, ))

            # Each of the first three files still has a twin, so its
            # deletion must go through to os.remove.
            for f in duplicates[:-1]:
                deleteable = test_scenario.abs_path(f)
                repo.delete_file(deleteable)
                mock_path.assert_called_with(deleteable)
                mock_path.reset_mock()

            # Last copy standing: the repository must refuse to delete it.
            try:
                not_deleteable = test_scenario.abs_path(duplicates[-1])
                repo.delete_file(not_deleteable)
                assert False, 'Should have raised an exception'
            except Exception as e:
                expected = Exception(
                    '409 Can\'t delete a file without duplicates: {}'.
                    format(not_deleteable))
                assert repr(expected) == repr(e)

            assert not mock_path.called, 'File should have not been deleted'
            # BUG FIX: was `mock_path.reset_mock` without parentheses,
            # which silently did nothing; call the method.
            mock_path.reset_mock()
def test_false_duplicates_in_path(self):
    """A directory scanned both directly and via a symlinked alias must
    report every file as unique — the aliased copies are the same
    physical data, not duplicates of themselves.
    """
    with DataGenerator() as test_scenario:
        expected_dups = set()
        expected_uniques = set()
        expected_uniques.add(test_scenario.create_file('1/a.data', size=4097))
        expected_uniques.add(test_scenario.create_file('1/b.data', size=4096))
        expected_uniques.add(test_scenario.create_file('1/c.data', size=512))
        expected_uniques.add(test_scenario.create_file('1/e.data', size=4069))
        expected_uniques.add(
            test_scenario.create_file('1/x.data', size=2048, readable=False))
        expected_uniques.add(
            test_scenario.create_file('1/xx.data', size=2048, readable=False))

        # TODO: This behavior is not consistent with don't follow links
        test_scenario.symlink('1/', 'links/1')
        for linked in ('links/1/a.data', 'links/1/b.data', 'links/1/c.data',
                       'links/1/e.data', 'links/1/x.data', 'links/1/xx.data'):
            expected_uniques.add(path.join(test_scenario.root_path, linked))

        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo:
            DupScanner(repo).scan((
                path.join(test_scenario.root_path, '1'),
                path.join(test_scenario.root_path, 'links'),
            ))

            found_dups = {fullname
                          for _hash, _size, fullname, _path, _abspath
                          in repo.findBy_duplicate_hash()}
            missing_dups = expected_dups - found_dups
            assert not missing_dups, \
                'Expected duplicate elements were not found: {}'.format(
                    missing_dups)
            extra_dups = found_dups - expected_dups
            assert not extra_dups, \
                'Unexpected duplicate elements were found: {}'.format(
                    extra_dups)
            assert expected_dups == found_dups, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_dups,
                                                     found_dups)

            found_uniques = {fullname
                             for _hash, _size, fullname, _path, _abspath
                             in repo.findBy_unique_hash()}
            missing_uniques = expected_uniques - found_uniques
            assert not missing_uniques, \
                'Expected unique elements were not found: {}'.format(
                    missing_uniques)
            extra_uniques = found_uniques - expected_uniques
            assert not extra_uniques, \
                'Unexpected unique elements were found: {}'.format(
                    extra_uniques)
            assert expected_uniques == found_uniques, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_uniques,
                                                     found_uniques)
def test_dont_follow_links(self):
    """Symlinked files must be ignored by the scan: they may appear
    neither among the uniques nor as duplicates of their targets.
    """
    with DataGenerator() as test_scenario:
        expected_dups = set()
        expected_uniques = set()
        expected_uniques.add(test_scenario.create_file('1/a.data', size=4097))
        expected_uniques.add(test_scenario.create_file('2/b.data', size=4096))
        expected_uniques.add(test_scenario.create_file('3/c.data', size=512))
        expected_uniques.add(test_scenario.create_file('3/e.data', size=4069))
        expected_uniques.add(
            test_scenario.create_file('4/x.data', size=2048, readable=False))
        expected_uniques.add(
            test_scenario.create_file('4/xx.data', size=2048, readable=False))

        # Links pointing at existing files: none of these paths may be
        # reported by the scanner.
        skipped_links = {
            test_scenario.symlink('1/a.data', '2/lnk-a.data'),
            test_scenario.symlink('2/b.data', '1/lnk-b.data'),
            test_scenario.symlink('4/x.data', '4/lnk-x.data'),
            test_scenario.symlink('4/xx.data', '4/lnk-xx.data'),
        }

        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo:
            DupScanner(repo).scan((test_scenario.root_path,))

            found_dups = {fullname
                          for _hash, _size, fullname, _path, _abspath
                          in repo.findBy_duplicate_hash()}
            missing_dups = expected_dups - found_dups
            assert not missing_dups, \
                'Expected duplicate elements were not found: {}'.format(
                    missing_dups)
            extra_dups = found_dups - expected_dups
            assert not extra_dups, \
                'Unexpected duplicate elements were found: {}'.format(
                    extra_dups)
            assert expected_dups == found_dups, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_dups,
                                                     found_dups)

            found_uniques = {fullname
                             for _hash, _size, fullname, _path, _abspath
                             in repo.findBy_unique_hash()}
            missing_uniques = expected_uniques - found_uniques
            assert not missing_uniques, \
                'Expected unique elements were not found: {}'.format(
                    missing_uniques)
            extra_uniques = found_uniques - expected_uniques
            assert not extra_uniques, \
                'Unexpected unique elements were found: {}'.format(
                    extra_uniques)
            assert expected_uniques == found_uniques, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_uniques,
                                                     found_uniques)

            unexpected_links = (found_uniques | found_dups) & skipped_links
            assert not unexpected_links, 'Unexpected links found'
def test_happy_path_with_nested_dirs(self):
    """Scanning a root together with one of its own subdirectories must
    still report every duplicate and unique exactly once, keyed by
    absolute path.
    """
    with DataGenerator() as test_scenario:
        expected_dups = test_scenario.create_duplicates(
            ('1/a.data', '2/a.data', '3/a.data', '4/a.data'),
            size=4097)  # 4097 = block size + 1
        expected_dups |= test_scenario.create_duplicates(
            ('2/aa.data', '4/aa.data'), size=4096)
        expected_dups |= test_scenario.create_duplicates(
            ('1/aaa.data', '4/aaa.data'), size=1024)
        expected_dups |= test_scenario.create_duplicates(
            ('1/aaaa.data', '3/aaaa.data'), size=0)

        expected_uniques = set()
        expected_uniques.add(test_scenario.create_file('1/b.data', size=4097))
        expected_uniques.add(test_scenario.create_file('2/c.data', size=4096))
        expected_uniques.add(test_scenario.create_file('3/d.data', size=512))
        expected_uniques.add(test_scenario.create_file('1/e.data', size=4097))
        expected_uniques.add(
            test_scenario.create_file('4/x.data', size=2048, readable=False))
        expected_uniques.add(
            test_scenario.create_file('4/xx.data', size=2048, readable=False))

        connection_string = ':memory:'
        with connection_factory(connection_string) as conn, \
                repository(conn) as repo:
            # '1/' is nested inside root_path: files under it are seen
            # via both scan roots.
            DupScanner(repo).scan(
                (test_scenario.root_path, test_scenario.abs_path('1/')))

            found_dups = {abspath
                          for _hash, _size, _fullname, _path, abspath
                          in repo.findBy_duplicate_hash()}
            missing_dups = expected_dups - found_dups
            assert not missing_dups, \
                'Expected duplicate elements were not found: {}'.format(
                    missing_dups)
            extra_dups = found_dups - expected_dups
            assert not extra_dups, \
                'Unexpected duplicate elements were found: {}'.format(
                    extra_dups)
            assert expected_dups == found_dups, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_dups,
                                                     found_dups)

            found_uniques = {abspath
                             for _hash, _size, _fullname, _path, abspath
                             in repo.findBy_unique_hash()}
            missing_uniques = expected_uniques - found_uniques
            assert not missing_uniques, \
                'Expected unique elements were not found: {}'.format(
                    missing_uniques)
            extra_uniques = found_uniques - expected_uniques
            assert not extra_uniques, \
                'Unexpected unique elements were found: {}'.format(
                    extra_uniques)
            assert expected_uniques == found_uniques, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(expected_uniques,
                                                     found_uniques)
def test_false_duplicates_in_path(self):
    """Reaching the same files twice — once directly and once through a
    symlinked directory — must not produce false duplicates; everything
    stays unique.
    """
    with DataGenerator() as test_scenario:
        dup_expected = set()
        uniq_expected = set()
        for fname, fsize in (('1/a.data', 4097), ('1/b.data', 4096),
                             ('1/c.data', 512), ('1/e.data', 4069)):
            uniq_expected.add(test_scenario.create_file(fname, size=fsize))
        for fname in ('1/x.data', '1/xx.data'):
            uniq_expected.add(
                test_scenario.create_file(fname, size=2048, readable=False))

        # TODO: This behavior is not consistent with don't follow links
        test_scenario.symlink('1/', 'links/1')
        for alias in ('links/1/a.data', 'links/1/b.data', 'links/1/c.data',
                      'links/1/e.data', 'links/1/x.data', 'links/1/xx.data'):
            uniq_expected.add(path.join(test_scenario.root_path, alias))

        with connection_factory(':memory:') as conn, \
                repository(conn) as repo:
            DupScanner(repo).scan((
                path.join(test_scenario.root_path, '1'),
                path.join(test_scenario.root_path, 'links'),
            ))

            dup_found = {fullname
                         for _h, _s, fullname, _p, _a
                         in repo.findBy_duplicate_hash()}
            dup_missing = dup_expected - dup_found
            assert not dup_missing, \
                'Expected duplicate elements were not found: {}'.format(
                    dup_missing)
            dup_surplus = dup_found - dup_expected
            assert not dup_surplus, \
                'Unexpected duplicate elements were found: {}'.format(
                    dup_surplus)
            assert dup_expected == dup_found, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(dup_expected, dup_found)

            uniq_found = {fullname
                          for _h, _s, fullname, _p, _a
                          in repo.findBy_unique_hash()}
            uniq_missing = uniq_expected - uniq_found
            assert not uniq_missing, \
                'Expected unique elements were not found: {}'.format(
                    uniq_missing)
            uniq_surplus = uniq_found - uniq_expected
            assert not uniq_surplus, \
                'Unexpected unique elements were found: {}'.format(
                    uniq_surplus)
            assert uniq_expected == uniq_found, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(uniq_expected,
                                                     uniq_found)
def test_dont_follow_links(self):
    """The scanner must not follow symlinks: linked paths appear in
    neither the unique nor the duplicate result sets.
    """
    with DataGenerator() as test_scenario:
        dup_expected = set()
        uniq_expected = set()
        for fname, fsize in (('1/a.data', 4097), ('2/b.data', 4096),
                             ('3/c.data', 512), ('3/e.data', 4069)):
            uniq_expected.add(test_scenario.create_file(fname, size=fsize))
        for fname in ('4/x.data', '4/xx.data'):
            uniq_expected.add(
                test_scenario.create_file(fname, size=2048, readable=False))

        # None of these link paths may surface in the results.
        ignored_links = set()
        ignored_links.add(test_scenario.symlink('1/a.data', '2/lnk-a.data'))
        ignored_links.add(test_scenario.symlink('2/b.data', '1/lnk-b.data'))
        ignored_links.add(test_scenario.symlink('4/x.data', '4/lnk-x.data'))
        ignored_links.add(
            test_scenario.symlink('4/xx.data', '4/lnk-xx.data'))

        with connection_factory(':memory:') as conn, \
                repository(conn) as repo:
            DupScanner(repo).scan((test_scenario.root_path, ))

            dup_found = {fullname
                         for _h, _s, fullname, _p, _a
                         in repo.findBy_duplicate_hash()}
            dup_missing = dup_expected - dup_found
            assert not dup_missing, \
                'Expected duplicate elements were not found: {}'.format(
                    dup_missing)
            dup_surplus = dup_found - dup_expected
            assert not dup_surplus, \
                'Unexpected duplicate elements were found: {}'.format(
                    dup_surplus)
            assert dup_expected == dup_found, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(dup_expected, dup_found)

            uniq_found = {fullname
                          for _h, _s, fullname, _p, _a
                          in repo.findBy_unique_hash()}
            uniq_missing = uniq_expected - uniq_found
            assert not uniq_missing, \
                'Expected unique elements were not found: {}'.format(
                    uniq_missing)
            uniq_surplus = uniq_found - uniq_expected
            assert not uniq_surplus, \
                'Unexpected unique elements were found: {}'.format(
                    uniq_surplus)
            assert uniq_expected == uniq_found, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(uniq_expected,
                                                     uniq_found)

            leaked = (uniq_found | dup_found) & ignored_links
            assert not leaked, 'Unexpected links found'
def test_happy_path_with_nested_dirs(self):
    """Happy path with overlapping scan roots (root plus its own '1/'
    subdirectory): results, keyed by absolute path, match the expected
    duplicate and unique sets exactly.
    """
    with DataGenerator() as test_scenario:
        dup_expected = set()
        for group, group_size in (
                (('1/a.data', '2/a.data', '3/a.data', '4/a.data'),
                 4097),  # 4097 = block size + 1
                (('2/aa.data', '4/aa.data'), 4096),
                (('1/aaa.data', '4/aaa.data'), 1024),
                (('1/aaaa.data', '3/aaaa.data'), 0)):
            dup_expected |= test_scenario.create_duplicates(group,
                                                            size=group_size)

        uniq_expected = set()
        for fname, fsize in (('1/b.data', 4097), ('2/c.data', 4096),
                             ('3/d.data', 512), ('1/e.data', 4097)):
            uniq_expected.add(test_scenario.create_file(fname, size=fsize))
        for fname in ('4/x.data', '4/xx.data'):
            uniq_expected.add(
                test_scenario.create_file(fname, size=2048, readable=False))

        with connection_factory(':memory:') as conn, \
                repository(conn) as repo:
            DupScanner(repo).scan(
                (test_scenario.root_path, test_scenario.abs_path('1/')))

            dup_found = {abspath
                         for _h, _s, _f, _p, abspath
                         in repo.findBy_duplicate_hash()}
            dup_missing = dup_expected - dup_found
            assert not dup_missing, \
                'Expected duplicate elements were not found: {}'.format(
                    dup_missing)
            dup_surplus = dup_found - dup_expected
            assert not dup_surplus, \
                'Unexpected duplicate elements were found: {}'.format(
                    dup_surplus)
            assert dup_expected == dup_found, \
                'Expected duplicate set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(dup_expected, dup_found)

            uniq_found = {abspath
                          for _h, _s, _f, _p, abspath
                          in repo.findBy_unique_hash()}
            uniq_missing = uniq_expected - uniq_found
            assert not uniq_missing, \
                'Expected unique elements were not found: {}'.format(
                    uniq_missing)
            uniq_surplus = uniq_found - uniq_expected
            assert not uniq_surplus, \
                'Unexpected unique elements were found: {}'.format(
                    uniq_surplus)
            assert uniq_expected == uniq_found, \
                'Expected unique set doesn\'t match found set. ' \
                '\n Expected: {}\n Found: {}'.format(uniq_expected,
                                                     uniq_found)
def setUp(self):
    """Open an in-memory SQLite database, wrap it in a repository and
    create a fresh schema before each test."""
    connection = sqlite3.connect(":memory:")
    self.conn = connection
    self.repo = dupscanner.repository(connection)
    self.repo.create_schema()
def main():
    """Command-line entry point: scan the given paths for duplicate (or
    unique) files and hand the results to the selected output strategy
    (template, eval expression, script, pretty print or interactive).
    """
    # BUG FIX: dropped the unused `args_parser = argparse.ArgumentParser()`
    # that was created and immediately discarded.
    parser = argparse.ArgumentParser()
    parser.set_defaults(action='duplicates')
    parser.add_argument("path", help="Path where to look for duplicates",
                        nargs='+')
    parser.add_argument("-d", "--database",
                        help="Stores a temporary SQLite database in a file",
                        default=":memory:")
    parser.add_argument("-lf", "--log-format", help="Logging format",
                        default='%(message)s')
    parser.add_argument("-l", "--log",
                        help="File to output the log messages")
    parser.add_argument("-u", "--unique", help="Find unique files",
                        action="store_true")
    # BUG FIX: `type=file('w', encoding='UTF-8')` is invalid — `file` is
    # not a Python 3 builtin; argparse.FileType is the intended converter
    # (it also maps the '-' default to stdout).
    parser.add_argument("-o", "--output-file",
                        help="Output file (default: stdout)", default='-',
                        type=argparse.FileType('w', encoding='UTF-8'))

    g = parser.add_mutually_exclusive_group()
    g.add_argument(
        "-t", "--template",
        help="""Output template. Variables ${hash}, ${size}, ${fullname}, ${path}, ${abspath}, ${realpath} will be replaced with the actual values""",
        default="${hash}\t${size}\t${fullname}"
    )
    g.add_argument(
        "-e", "--evaluate",
        help="""For each result, evaluate the given python code to process the output. Variables hash, size, filename and output_file will be bounded to the appropiate values"""
    )
    g.add_argument(
        "-x", "--execute-script",
        help="""Executes the given python script to process the results. 
Variables results and output_file will be bounded to an iterator and the appropiate output stream"""
    )
    g.add_argument(
        "-p", "--pretty-print", action="store_true",
        help="Groups the results by hash and file size and displays a pretty output"
    )
    g.add_argument(
        "-i", "--interactive", action="store_true",
        help="Interactive mode"
    )
    parser.add_argument(
        "-v", "--verbosity",
        help="Verbosity level (default: WARN)",
        default='WARN',
        choices=['DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL'],
        type=lambda level: level.upper()
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=args.verbosity,
        format=args.log_format,
        filename=args.log
    )

    connection_string = args.database
    path = args.path
    template = args.template

    with connection_factory(connection_string) as conn, \
            repository(conn) as repo, args.output_file as output_file:
        dupscanner = DupScanner(repo)
        results = dupscanner.find_unique(path) if args.unique \
            else dupscanner.find_duplicates(path)

        # Dispatch on the mutually exclusive output options; plain
        # template printing is the fallback.
        if args.execute_script:
            exec_script(args.execute_script, output_file, results, conn, repo)
        elif args.interactive:
            run_server(repo)
        elif args.evaluate:
            # BUG FIX: result was bound to an unused `func` variable;
            # these helpers are called for their side effects only.
            exec_command(args.evaluate, output_file, results)
        elif args.pretty_print:
            pretty_print(results, output_file)
        else:
            plain_print(template, results, output_file)