def test_find_duplicates_jobs(self):
    """
    Test finding duplicates using correct job count from parameter.
    """
    max_cpu = 666
    patch_cpu = patch('bear.output.cpu_count', return_value=max_cpu)
    patch_pool = patch('bear.output.Pool')

    with patch_cpu, patch_pool as pool:
        self.assertEqual(find_duplicates([], processes=1), {})
        self.assertEqual(find_duplicates([], processes=0), {})
        self.assertEqual(find_duplicates([], processes=4), {})
        self.assertEqual([
            # remove __iter__() calls because those return a tuple
            # iterator instead of call().__enter__().map().__iter__()
            item for item in pool.mock_calls
            if '__iter__' not in str(item)
        ], [
            call(processes=1),
            call().__enter__(),
            call().__enter__().map(hash_files, []),
            call().__exit__(None, None, None),
            call(processes=666),
            call().__enter__(),
            call().__enter__().map(hash_files, []),
            call().__exit__(None, None, None),
            call(processes=4),
            call().__enter__(),
            call().__enter__().map(hash_files, []),
            call().__exit__(None, None, None),
        ])
def test_find_duplicates_chunks(self):
    """
    Test chunking list of files for multiple processes.
    """
    patch_pool = patch('bear.output.Pool')
    files = [str(num) for num in range(15)]

    def side_effect(folder):
        return {
            'a': files[:5], 'b': files[5:10], 'c': files[10:]
        }[basename(folder)]

    patch_find_files = patch(
        'bear.output.find_files', side_effect=side_effect
    )

    # pylint: disable=confusing-with-statement
    with patch_pool as pool, patch_find_files:
        self.assertEqual(find_duplicates(['a', 'b', 'c'], processes=1), {})
        self.assertEqual(find_duplicates(['a', 'b', 'c'], processes=3), {})
        self.assertEqual(find_duplicates(['a', 'b', 'c'], processes=4), {})
        self.assertEqual([
            # remove __iter__() calls because those return a tuple
            # iterator instead of call().__enter__().map().__iter__()
            item for item in pool.mock_calls
            if '__iter__' not in str(item)
        ], [
            call(processes=1),
            call().__enter__(),
            # all files go to a single process
            call().__enter__().map(hash_files, [files]),
            call().__exit__(None, None, None),
            call(processes=3),
            call().__enter__(),
            # chunk size = len(files) // processes -> 15 // 3 == 5
            call().__enter__().map(
                hash_files, [files[0:5], files[5:10], files[10:15]]
            ),
            call().__exit__(None, None, None),
            call(processes=4),
            call().__enter__(),
            # chunk size = len(files) // processes -> 15 // 4 == 3
            call().__enter__().map(hash_files, [
                files[0:3], files[3:6], files[6:9],
                files[9:12], files[12:]
            ]),
            call().__exit__(None, None, None),
        ])
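The expected map() arguments pin down the chunking rule: the file list is sliced into pieces of len(files) // processes, so three workers over fifteen files get three chunks of five, while four workers get five chunks of three, because the slice size rounds down and the remainder spills over into extra slices. A minimal sketch of such a helper, assuming this is how find_duplicates splits the list (chunks is a hypothetical name, not bear's actual implementation):

def chunks(files, processes):
    # slice size rounds down; trailing slices pick up the remainder
    size = max(len(files) // processes, 1)
    return [files[idx:idx + size] for idx in range(0, len(files), size)]

assert chunks([str(num) for num in range(15)], 4) == [
    ['0', '1', '2'], ['3', '4', '5'], ['6', '7', '8'],
    ['9', '10', '11'], ['12', '13', '14']
]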
def test_find_duplicates_join(self):
    """
    Test joining duplicates from multiple jobs.
    """
    mock_pool = MagicMock(**{
        '__enter__.return_value.map.return_value': [
            {'123': ['original'], '456': ['ori', 'dupli']},
            {'123': ['duplicate']},
            {'789': ['original'], '012': ['orig', 'dup']}
        ]
    })
    patch_pool = patch('bear.output.Pool', return_value=mock_pool)

    with patch_pool:
        # 789 is a single-file original, ignored in filter_files()
        self.assertEqual(find_duplicates([], processes=2), {
            '012': ['orig', 'dup'],
            '123': ['original', 'duplicate'],
            '456': ['ori', 'dupli']
        })
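The mocked map() return value stands in for the partial results of the worker processes; the join the test expects merges those hash-to-files mappings and then drops any hash left with a single file, which is why '789' disappears from the result. A minimal sketch of that merge, assuming filter_files simply discards singleton groups (join_results is a hypothetical name):

def join_results(results):
    joined = {}
    for result in results:
        for key, value in result.items():
            # same hash from different workers -> one combined group
            joined.setdefault(key, []).extend(value)
    # assumption: filter_files keeps only groups with 2+ files
    return {key: value for key, value in joined.items() if len(value) > 1}

assert join_results([
    {'123': ['original']}, {'123': ['duplicate']}, {'789': ['original']}
]) == {'123': ['original', 'duplicate']}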
def handle_duplicates(ctx: Context, hasher: Hasher):
    """
    Handle --duplicate related behavior.
    """
    duplicates = find_duplicates(ctx=ctx, hasher=hasher)
    output_duplicates(hashes=duplicates, out=ctx.output)

    if ctx.keep_oldest:
        remove_except_oldest(files=duplicates.values())
    elif ctx.keep_newest:
        remove_except_newest(files=duplicates.values())
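remove_except_oldest and remove_except_newest receive each duplicate group, so assuming they rank a group by modification time, a keep-oldest variant could look like this sketch (not necessarily bear's actual implementation):

from os import remove
from os.path import getmtime

def remove_except_oldest_sketch(files):
    for group in files:
        # sort each duplicate group oldest-first, delete the rest
        for path in sorted(group, key=getmtime)[1:]:
            remove(path)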
def test_find_duplicates_join(self):
    """
    Test joining duplicates from multiple jobs.
    """
    mock_pool = MagicMock(**{
        '__enter__.return_value.map.return_value': [
            {'123': ['original'], '456': ['ori', 'dupli']},
            {'123': ['duplicate']},
            {'789': ['original'], '012': ['orig', 'dup']}
        ]
    })
    patch_pool = patch('bear.output.Pool', return_value=mock_pool)
    ctx = Context(Namespace(
        duplicates=[], jobs=2, files=[], traverse=[], hash=[]
    ))

    with patch_pool:
        # 789 is a single-file original, ignored in filter_files()
        self.assertEqual(find_duplicates(ctx=ctx, hasher=Hasher.MD5), {
            '012': ['orig', 'dup'],
            '123': ['original', 'duplicate'],
            '456': ['ori', 'dupli']
        })
def test_find_duplicates_chunks(self):
    """
    Test chunking list of files for multiple processes.
    """
    patch_pool = patch('bear.output.Pool')
    files = [str(num) for num in range(15)]

    # pylint: disable=dangerous-default-value
    def side_effect(ctx, folder):
        assert isinstance(ctx, Context)
        return {
            'a': files[:5], 'b': files[5:10], 'c': files[10:]
        }[basename(folder)]

    patch_find_files = patch(
        'bear.output.find_files', side_effect=side_effect
    )

    # because partial() != partial() in mock calls!
    # partially fun, partially headache -_-'
    fun_part = patch('bear.output.partial')

    # pylint: disable=confusing-with-statement
    with patch_pool as pool, patch_find_files, fun_part as partial_fun:
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=['a', 'b', 'c'], jobs=1,
                files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=['a', 'b', 'c'], jobs=3,
                files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=['a', 'b', 'c'], jobs=4,
                files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual([
            # remove __iter__() calls because those return a tuple
            # iterator instead of call().__enter__().map().__iter__()
            item for item in pool.mock_calls
            if '__iter__' not in str(item)
        ], [
            call(processes=1),
            call().__enter__(),
            # all files go to a single process
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), [files]),
            call().__exit__(None, None, None),
            call(processes=3),
            call().__enter__(),
            # chunk size = len(files) // processes -> 15 // 3 == 5
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), [
                files[0:5], files[5:10], files[10:15]
            ]),
            call().__exit__(None, None, None),
            call(processes=4),
            call().__enter__(),
            # chunk size = len(files) // processes -> 15 // 4 == 3
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), [
                files[0:3], files[3:6], files[6:9],
                files[9:12], files[12:]
            ]),
            call().__exit__(None, None, None),
        ])
def test_find_duplicates_jobs(self):
    """
    Test finding duplicates using correct job count from parameter.
    """
    max_cpu = 666
    patch_cpu = patch('bear.output.cpu_count', return_value=max_cpu)
    patch_pool = patch('bear.output.Pool')

    # because partial() != partial() in mock calls!
    # partially fun, partially headache -_-'
    fun_part = patch('bear.output.partial')

    # pylint: disable=confusing-with-statement
    with patch_cpu, patch_pool as pool, fun_part as partial_fun:
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=[], jobs=1, files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=[], jobs=0, files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual(find_duplicates(
            ctx=Context(Namespace(
                duplicates=[], jobs=4, files=[], traverse=[], hash=[]
            )),
            hasher=Hasher.MD5
        ), {})
        self.assertEqual([
            # remove __iter__() calls because those return a tuple
            # iterator instead of call().__enter__().map().__iter__()
            item for item in pool.mock_calls
            if '__iter__' not in str(item)
        ], [
            call(processes=1),
            call().__enter__(),
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), []),
            call().__exit__(None, None, None),
            call(processes=666),
            call().__enter__(),
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), []),
            call().__exit__(None, None, None),
            call(processes=4),
            call().__enter__(),
            call().__enter__().map(partial_fun(
                hash_files, hasher=Hasher.MD5
            ), []),
            call().__exit__(None, None, None),
        ])
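The three Pool(processes=...) expectations encode the job-count resolution the test pins down: a positive jobs value passes through unchanged, while jobs=0 falls back to cpu_count(), which the test patches to 666. A sketch of that rule (resolve_processes is a hypothetical name):

from multiprocessing import cpu_count

def resolve_processes(jobs):
    # 0 means "use every available core"
    return jobs if jobs else cpu_count()

assert resolve_processes(4) == 4
assert resolve_processes(0) == cpu_count()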