예제 #1
0
    def test_moveTheDuplicatesWithDuplicatesInSameFoldersWorks(self):
        """Move the duplicates found under folder3/folder4 and check the result.

        After ``dff.moveTheDuplicates`` the test expects that every original
        still exists, every duplicate path now recorded in ``filesMap`` exists
        and lives under ``/tmp/duplicates/``, and none of the duplicate paths
        recorded before the move are left on disk.
        """
        sourceFolders = [
            os.path.join(self._testBasePath, folderName)
            for folderName in ('folder3', 'folder4')
        ]
        inputFilesList = dff.buildInputFilesList(sourceFolders, {})
        filesMap = {}
        dff.checkForDuplicates(inputFilesList, filesMap)
        self.assertEqual(len(filesMap), 10)

        # Keep a snapshot so the pre-move duplicate paths can be checked later.
        snapshotBeforeMove = copy.deepcopy(filesMap)
        dff.moveTheDuplicates(filesMap, '/tmp/duplicates')

        originalsPresent = all(
            os.path.exists(entry._original) for entry in filesMap.values())
        self.assertTrue(originalsPresent)

        movedDuplicatesPresent = all(
            os.path.exists(path)
            for entry in filesMap.values()
            for path in entry._duplicates)
        self.assertTrue(movedDuplicatesPresent)

        oldDuplicatesRemain = any(
            os.path.exists(path)
            for entry in snapshotBeforeMove.values()
            for path in entry._duplicates)
        self.assertFalse(oldDuplicatesRemain)

        duplicatesInTargetFolder = all(
            path.startswith('/tmp/duplicates/')
            for entry in filesMap.values()
            for path in entry._duplicates)
        self.assertTrue(duplicatesInTargetFolder)
예제 #2
0
    def test_checkForDuplicatesWorksWithNoDuplicateFiles(self):
        """Scan folders expected to hold no duplicates of the known files.

        The resulting map must contain exactly the previously known originals
        plus the files listed in ``_imagePaths3`` and ``_imagePaths4``, and no
        entry may have any duplicates recorded.
        """
        knownFilesMap = {}
        for path in self._imagePaths:
            digest = dff.calculateMD5Hash(path)
            knownFilesMap[digest] = dff.File(path)

        foldersToScan = [
            os.path.join(self._testBasePath, 'folder1', 'folder3'),
            os.path.join(self._testBasePath, 'folder2', 'folder4'),
        ]
        inputFilesList = dff.buildInputFilesList(foldersToScan, knownFilesMap)

        # Work on a copy so knownFilesMap still reflects the starting state.
        filesMap = copy.deepcopy(knownFilesMap)
        dff.checkForDuplicates(inputFilesList, filesMap)

        expectedCount = (len(self._imagePaths3) + len(self._imagePaths4) +
                         len(knownFilesMap))
        self.assertEqual(expectedCount, len(filesMap))

        knownOriginals = [entry._original for entry in knownFilesMap.values()]
        expectedOriginals = (self._imagePaths3 + self._imagePaths4 +
                             knownOriginals)
        actualOriginals = [entry._original for entry in filesMap.values()]
        self.assertEqual(sorted(expectedOriginals), sorted(actualOriginals))

        for entry in filesMap.values():
            cameFromExpectedSource = (
                entry._original in knownOriginals
                or entry._original in self._imagePaths3
                or entry._original in self._imagePaths4)
            self.assertTrue(cameFromExpectedSource)
            self.assertFalse(entry._duplicates)
예제 #3
0
 def test_buildInputFilesListWorksWhenSingleInputPathIsGiven(self):
     """A single input folder yields exactly its five expected images.

     The test expects ``folder1/folder3`` to produce ``img_15.png`` through
     ``img_19.png`` and nothing else.
     """
     folder = os.path.join(self._testBasePath, 'folder1', 'folder3')
     foundFiles = dff.buildInputFilesList([folder], None)
     self.assertEqual(len(foundFiles), 5)
     for imageIndex in range(15, 20):
         expectedPath = os.path.join(folder,
                                     'img_{}.png'.format(imageIndex))
         self.assertIn(expectedPath, foundFiles)
예제 #4
0
    def test_buildInputFilesListDoesntReturnKnownFilePaths(self):
        """No returned path may also appear among the known-files map values.

        NOTE(review): this compares returned paths against
        ``self._knownFilesMap.values()``; if those values are not plain path
        strings the intersection is always empty and the check is vacuous --
        confirm against the fixture setup.
        """
        subFolders = [
            ('folder1', ),
            ('folder2', ),
            ('folder1', 'folder3'),
            ('folder2', 'folder4'),
        ]
        inputPaths = [self._testBasePath]
        for parts in subFolders:
            inputPaths.append(os.path.join(self._testBasePath, *parts))

        inputFilesList = dff.buildInputFilesList(inputPaths,
                                                 self._knownFilesMap)

        overlap = set(inputFilesList).intersection(
            self._knownFilesMap.values())
        self.assertFalse(overlap)
예제 #5
0
def main():
    """Create two batches of random test images and run the duplicate finder.

    Images are written under ``/tmp/profiler``; the detected duplicates are
    then moved to ``/tmp/duplicates``.
    """
    work_dir = '/tmp/profiler'

    # The RNG is reseeded with the same value before each batch; presumably
    # this makes the second batch reproduce the first so that duplicates
    # exist -- TODO confirm against createRandomTestImages.
    for batch_arg in (0, 50000):
        np.random.seed(0)
        createRandomTestImages(work_dir, 100, 100, batch_arg, 10)

    files_map = {}
    input_files = dff.buildInputFilesList([work_dir], {})
    dff.checkForDuplicates(input_files, files_map)
    dff.moveTheDuplicates(files_map, '/tmp/duplicates')
예제 #6
0
 def test_checkForDuplicatesWorksWithDuplicateFilesInSameFolder(self):
     """Detect duplicates that sit in the same folder as their originals.

     For ``folder7`` the test expects 10 distinct entries whose originals
     match ``_imagePaths7Originals``, plus 20 duplicates in total, none of
     which is itself listed as an original.
     """
     folder = os.path.join(self._testBasePath, 'folder7')
     filesMap = {}
     dff.checkForDuplicates(dff.buildInputFilesList([folder], {}), filesMap)
     self.assertEqual(len(filesMap), 10)

     foundOriginals = [entry._original for entry in filesMap.values()]
     foundDuplicates = []
     for entry in filesMap.values():
         foundDuplicates.extend(entry._duplicates)

     self.assertEqual(sorted(foundOriginals),
                      sorted(self._imagePaths7Originals))
     self.assertEqual(len(foundDuplicates), 20)
     for duplicatePath in foundDuplicates:
         self.assertNotIn(duplicatePath, self._imagePaths7Originals)
예제 #7
0
    def test_buildInputFilesListWorksWithNestedPath(self):
        """A folder and one of its sub-folders may both be given as inputs.

        The test expects 10 files in total: ``img_15``..``img_19`` from
        ``folder1/folder3`` and ``img_5``..``img_9`` from ``folder1``.
        """
        innerFolder = os.path.join(self._testBasePath, 'folder1', 'folder3')
        outerFolder = os.path.join(self._testBasePath, 'folder1')

        inputFilesList = dff.buildInputFilesList([innerFolder, outerFolder],
                                                 None)
        self.assertEqual(len(inputFilesList), 10)

        # (folder, index of the first expected image in that folder)
        expectations = ((innerFolder, 15), (outerFolder, 5))
        for folder, firstIndex in expectations:
            for imageIndex in range(firstIndex, firstIndex + 5):
                expectedPath = os.path.join(
                    folder, 'img_{}.png'.format(imageIndex))
                self.assertIn(expectedPath, inputFilesList)
예제 #8
0
    def test_buildInputFilesListWorksWhenMultipleInputPathsAteGiven(self):
        """Several unrelated input folders are all scanned.

        The test expects 15 files in total and spot-checks five names per
        folder: ``img_15``..``img_19`` in ``folder1/folder3`` and
        ``img_20``..``img_24`` in ``folder2/folder4``.
        """
        firstFolder = os.path.join(self._testBasePath, 'folder1', 'folder3')
        secondFolder = os.path.join(self._testBasePath, 'folder2', 'folder4')

        inputFilesList = dff.buildInputFilesList([firstFolder, secondFolder],
                                                 None)
        self.assertEqual(len(inputFilesList), 15)

        # (folder, index of the first image name checked in that folder)
        expectations = ((firstFolder, 15), (secondFolder, 20))
        for folder, firstIndex in expectations:
            for imageIndex in range(firstIndex, firstIndex + 5):
                expectedPath = os.path.join(
                    folder, 'img_{}.png'.format(imageIndex))
                self.assertIn(expectedPath, inputFilesList)
예제 #9
0
 def test_checkForDuplicatesWorksWithDuplicateFilesInDifferentFolders(self):
     """Detect duplicates spread across the whole test tree.

     The test expects 40 entries keyed by the MD5 hashes of ``_originals``,
     each entry's original to be one of ``_originals``, and 45 duplicates
     overall, none of which is itself listed as an original.
     """
     filesMap = {}
     scannedFiles = dff.buildInputFilesList([self._testBasePath], {})
     dff.checkForDuplicates(scannedFiles, filesMap)
     self.assertEqual(len(filesMap), 40)

     expectedHashes = [dff.calculateMD5Hash(path) for path in self._originals]
     # The originals must all hash differently for the key comparison to
     # be meaningful.
     self.assertEqual(len(set(expectedHashes)), len(expectedHashes))
     self.assertEqual(sorted(expectedHashes), sorted(filesMap))

     foundOriginals = [entry._original for entry in filesMap.values()]
     self.assertEqual(sorted(foundOriginals), sorted(self._originals))

     # Assert that the duplicate file count matches.
     foundDuplicates = []
     for entry in filesMap.values():
         foundDuplicates.extend(entry._duplicates)
     self.assertEqual(len(foundDuplicates), 45)
     for duplicatePath in foundDuplicates:
         self.assertNotIn(duplicatePath, self._originals)
예제 #10
0
    def test_saveFilesMapWorks(self):
        """Save the files map to JSON and check it can be loaded again.

        Both output files are removed first so that their existence after
        ``dff.saveFileList`` proves they were written by this call. The
        reloaded known-files map must have the same keys as the saved one.
        """
        filesMap = {}
        scannedFiles = dff.buildInputFilesList([self._testBasePath], {})
        dff.checkForDuplicates(scannedFiles, filesMap)
        dff.moveTheDuplicates(filesMap, '/tmp/duplicates')

        knownFilesPath = '/tmp/logs/knownFiles.json'
        allFilesPath = '/tmp/logs/allFiles.json'
        outputPaths = (knownFilesPath, allFilesPath)

        # Start from a clean slate: no stale outputs, but the folder exists.
        for outputPath in outputPaths:
            if os.path.exists(outputPath):
                os.remove(outputPath)
        logFolder = os.path.dirname(knownFilesPath)
        if not os.path.exists(logFolder):
            os.makedirs(logFolder)
        for outputPath in outputPaths:
            self.assertFalse(os.path.exists(outputPath))

        dff.saveFileList(filesMap, knownFilesPath, allFilesPath)
        for outputPath in outputPaths:
            self.assertTrue(os.path.exists(outputPath))

        # Test that the file is loadable
        reloadedMap = dff.loadKnownFilesMap(knownFilesPath)
        self.assertEqual(len(filesMap), len(reloadedMap))
        self.assertEqual(sorted(filesMap.keys()), sorted(reloadedMap.keys()))
예제 #11
0
    def test_moveTheDuplicatesWithDuplicatesInDifferentFoldersWorks(self):
        """Move the duplicates found across folder1/folder2 and check the result.

        After ``dff.moveTheDuplicates`` the test expects that every original
        still exists, every duplicate path now recorded in ``filesMap`` exists
        and lives under ``/tmp/duplicates/``, and none of the duplicate paths
        recorded before the move are left on disk. Finally one moved duplicate
        is deleted and confirmed gone.
        """
        inputFilesList = dff.buildInputFilesList([
            os.path.join(self._testBasePath, 'folder1'),
            os.path.join(self._testBasePath, 'folder2')
        ], {})
        filesMap = {}
        dff.checkForDuplicates(inputFilesList, filesMap)
        self.assertEqual(len(filesMap), 10)

        # Keep a snapshot so the pre-move duplicate paths can be checked later.
        filesMapBeforeMove = copy.deepcopy(filesMap)
        dff.moveTheDuplicates(filesMap, '/tmp/duplicates')

        self.assertTrue(
            all(os.path.exists(f._original) for f in filesMap.values()))

        self.assertTrue(
            all(
                os.path.exists(f) for dupFiles in filesMap.values()
                for f in dupFiles._duplicates))
        self.assertFalse(
            any(
                os.path.exists(f) for dupFiles in filesMapBeforeMove.values()
                for f in dupFiles._duplicates))
        self.assertTrue(
            all(
                f.startswith('/tmp/duplicates/')
                for dupFiles in filesMap.values()
                for f in dupFiles._duplicates))

        # Pick the first entry that has duplicates. Fail with a clear message
        # if there is none, rather than crashing on a lookup with a None key.
        entryWithDuplicates = next(
            (v for v in filesMap.values() if v._duplicates), None)
        self.assertIsNotNone(
            entryWithDuplicates,
            'expected at least one entry with duplicates after the move')

        removedPath = entryWithDuplicates._duplicates[0]
        os.remove(removedPath)
        self.assertFalse(os.path.exists(removedPath))
예제 #12
0
 def test_buildInputFilesListFailsWhenInputPathsIsNotAList(self):
     """Passing the base path itself instead of a list raises TypeError."""
     self.assertRaises(TypeError, dff.buildInputFilesList,
                       self._testBasePath, self._knownFilesMap)
예제 #13
0
 def test_buildInputFilesListFailsWithInvalidPath(self):
     """An input folder the test expects to be invalid raises OSError.

     ``folder5`` is the path expected to be rejected; the error message must
     match "Invalid input folder".
     """
     inputPaths = [
         os.path.join(self._testBasePath, 'folder1', 'folder3'),
         os.path.join(self._testBasePath, 'folder5'),
     ]
     with self.assertRaisesRegex(OSError, "Invalid input folder"):
         # The return value is irrelevant here; only the raised error matters.
         dff.buildInputFilesList(inputPaths, None)
예제 #14
0
 def test_buildInputFilesListReturnsEmptyListWhenNoPathsAreGivenAsInput(
         self):
     """With no input paths the returned collection must be falsy (empty)."""
     result = dff.buildInputFilesList([], {})
     self.assertFalse(result)