예제 #1
0
    def garbage_collector(self):
        """Run garbage collection for every entity type that has metadata.

        For each such entity type, the hashes returned by
        ``_get_blobs_hashes`` are passed to the cache and objects garbage
        collectors, and the counts and sizes they return are accumulated.
        Logs an error and returns early when no entity type has metadata;
        otherwise logs the accumulated number of removed files and the
        reclaimed space.
        """
        found_metadata = False
        total_removed = 0
        total_reclaimed = 0

        for entity in EntityType:
            repo_type = entity.value
            if not self.metadata_exists(repo_type):
                # Nothing to collect for entity types without metadata.
                continue

            log.info(output_messages['INFO_STARTING_GC'] % repo_type,
                     class_name=REPOSITORY_CLASS_NAME)
            found_metadata = True

            index_path = get_index_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            blobs_hashes = self._get_blobs_hashes(index_path, objects_path,
                                                  repo_type)

            # Cache first, then objects; both receive the same hashes.
            cache_store = Cache(get_cache_path(self.__config, repo_type))
            cache_removed, cache_reclaimed = cache_store.garbage_collector(
                blobs_hashes)
            object_store = Objects('', objects_path)
            objects_removed, objects_reclaimed = object_store.garbage_collector(
                blobs_hashes)

            total_reclaimed += objects_reclaimed + cache_reclaimed
            total_removed += objects_removed + cache_removed

        if not found_metadata:
            log.error(output_messages['ERROR_UNINITIALIZED_METADATA'],
                      class_name=REPOSITORY_CLASS_NAME)
            return

        removed_message = output_messages['INFO_REMOVED_FILES'] % (
            humanize.intword(total_removed),
            os.path.join(get_root_path(), '.ml-git'),
        )
        log.info(removed_message, class_name=REPOSITORY_CLASS_NAME)

        reclaimed_message = (output_messages['INFO_RECLAIMED_SPACE']
                             % humanize.naturalsize(total_reclaimed))
        log.info(reclaimed_message, class_name=REPOSITORY_CLASS_NAME)
예제 #2
0
 def create_hard_links_in_cache(self, cache_path, index_path,
                                is_shared_cache, mutability, path, spec):
     """Update the cache for ``spec`` when mutability is strict or flexible.

     The manifest path is built from ``index_path``, ``spec`` and
     ``MANIFEST_FILE``. The ``change_mask_for_routine`` context is always
     entered; ``Cache.update()`` only runs for the strict and flexible
     mutability values.
     """
     manifest_path = os.path.join(index_path, 'metadata', spec, MANIFEST_FILE)
     with change_mask_for_routine(is_shared_cache):
         linkable = (Mutability.STRICT.value, Mutability.FLEXIBLE.value)
         if mutability not in linkable:
             return
         Cache(cache_path, path, manifest_path).update()
예제 #3
0
    def test_get_update_links_wspace_with_duplicates(self):
        # Exercises _update_links_wspace with two workspace file names that
        # are passed with the same key, then checks that
        # _remove_unused_links_wspace deletes a file that was not linked.
        wspath = os.path.join(self.tmp_dir, 'wspace')

        # Store the sample image in the object store; `key` is whatever
        # put() returns for it.
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        # The cache is updated for `key` before any workspace link is made.
        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        # Both file names are handed over together with the single `key`.
        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

        # The second name is compared against HDATA_IMG_1 as well: both
        # workspace files are expected to carry the same content.
        wspace_file = os.path.join(wspath, DATA_IMG_2)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        # Presumably one link from the cache plus one per workspace file;
        # confirm against _update_links_wspace.
        self.assertTrue(st.st_nlink == 3)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                                  DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

        # Second phase: the same setup is repeated, this time with an extra
        # file in the workspace that is not part of `files`.
        wspath = os.path.join(self.tmp_dir, 'wspace')
        ensure_path_exists(wspath)
        to_be_removed = os.path.join(wspath, 'to_be_removed')
        with open(to_be_removed, 'w') as f:
            f.write('DEAD\n')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')
        # NOTE(review): this phase loads its config from 'hdata/config.yaml'
        # instead of get_sample_config_spec as in the first phase.
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
        # The stray file is not in `files`, so it is expected to be removed.
        r._remove_unused_links_wspace(wspath, mfiles)
        self.assertFalse(os.path.exists(to_be_removed))
예제 #4
0
파일: index.py 프로젝트: HPInc/ml-git
 def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
     """Pick the index status for a changed file and record it.

     Args:
         cache: Argument passed to ``Cache`` to locate the cached copy.
         filepath: File path used in log messages and, converted with
             ``posix_path``, as the index key.
         fullpath: Path forwarded to ``update_full_index``.
         scid: Newly computed hash of the file.
         st: Stat result whose ``st_mtime`` is compared with the entry.
         value: Existing index entry; ``hash``, ``mtime`` and ``status``
             are read, ``untime`` and ``previous_hash`` are optional.

     Returns:
         ``scid``, or ``None`` when the file is marked with ``Status.c``.
     """
     new_status = Status.a.name
     previous = value['hash']
     result = scid

     mutability = self._mutability
     flexible = mutability == MutabilityType.FLEXIBLE.value
     strict = mutability == MutabilityType.STRICT.value
     changed_while_locked = (value['mtime'] != st.st_mtime
                             and 'untime' not in value)
     bare_marker = os.path.join(self._path, 'metadata', self._spec, 'bare')
     in_bare_mode = os.path.exists(bare_marker)

     if strict or (flexible and changed_while_locked):
         was_flagged = (value['status'] == Status.c.name
                        and 'previous_hash' in value)
         if was_flagged:
             previous = value['previous_hash']
             if scid == previous:
                 # The content matches the hash recorded before the entry
                 # was flagged, so the file counts as restored.
                 previous = None
                 new_status = Status.u.name
                 log.debug(output_messages['DEBUG_RESTORED_FILE'].format(
                     posix_path(filepath)),
                           class_name=MULTI_HASH_CLASS_NAME)
         else:
             new_status = Status.c.name
             result = None
             # Remove the cached copy stored under the old hash.
             cached_copy = Cache(cache).get_keypath(value['hash'])
             if os.path.exists(cached_copy):
                 os.unlink(cached_copy)
     elif in_bare_mode and mutability == MutabilityType.MUTABLE.value:
         print('\n')
         log.warn(output_messages['WARN_FILE_EXISTS_IN_REPOSITORY'] %
                  filepath,
                  class_name=MULTI_HASH_CLASS_NAME)

     # The index always receives the new hash, even when None is returned.
     self.update_full_index(posix_path(filepath), fullpath, new_status, scid,
                            previous)
     return result
예제 #5
0
    def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
        """Pick the index status for a changed file and record it.

        With strict mutability, or flexible mutability when the mtime
        differs and the entry has no ``untime`` key, the file is recorded
        with ``Status.c``, its cached copy under the old hash is removed
        and ``None`` is returned. Otherwise it is recorded with
        ``Status.a`` and ``scid`` is returned; in bare mode with mutable
        mutability a warning is logged first.
        """
        new_status = Status.a.name
        previous = value['hash']
        result = scid

        mutability = self._mutability
        flexible = mutability == Mutability.FLEXIBLE.value
        strict = mutability == Mutability.STRICT.value
        changed_while_locked = (value['mtime'] != st.st_mtime
                                and 'untime' not in value)
        bare_marker = os.path.join(self._path, 'metadata', self._spec, 'bare')
        in_bare_mode = os.path.exists(bare_marker)

        if strict or (flexible and changed_while_locked):
            new_status, previous, result = Status.c.name, None, None

            # Remove the cached copy stored under the old hash.
            cached_copy = Cache(cache).get_keypath(value['hash'])
            if os.path.exists(cached_copy):
                os.unlink(cached_copy)
        elif in_bare_mode and mutability == Mutability.MUTABLE.value:
            print('\n')
            log.warn(
                'The file %s already exists in the repository. If you commit, the'
                ' file will be overwritten.' % filepath,
                class_name=MULTI_HASH_CLASS_NAME)

        # The index always receives the new hash, even when None is returned.
        self.update_full_index(posix_path(filepath), fullpath, new_status,
                               scid, previous)
        return result
예제 #6
0
    def test_get_update_links_wspace(self):
        # Links a single file into the workspace and checks its content,
        # the index entry written for it and its hard-link count.
        expected_hash = 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'
        workspace = os.path.join(self.tmp_dir, 'wspace')

        objects_dir = os.path.join(self.tmp_dir, 'objectsfs')
        object_store = MultihashFS(objects_dir)
        key = object_store.put(HDATA_IMG_1)
        full_index = FullIndex(self.tmp_dir, self.tmp_dir)
        cache_dir = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cache_dir, '', '')

        bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        config = get_sample_config_spec(bucket, testprofile, testregion)

        repo = LocalRepository(config, objects_dir)
        repo._update_cache(cache, key)

        linked = {}
        wanted = {DATA_IMG_1}
        repo._update_links_wspace(cache, full_index, wanted, key, workspace,
                                  linked, Status.u.name, 'strict')

        linked_file = os.path.join(workspace, DATA_IMG_1)
        set_write_read(linked_file)
        self.assertTrue(os.path.exists(linked_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(linked_file))

        file_stat = os.stat(linked_file)
        index_entries = full_index.get_index()
        for entry_path, entry in index_entries.items():
            self.assertEqual(entry_path, os.path.join('data', 'imghires.jpg'))
            self.assertEqual(entry['hash'], expected_hash)
            self.assertEqual(entry['status'], 'u')
            self.assertEqual(entry['ctime'], file_stat.st_ctime)
            self.assertEqual(entry['mtime'], file_stat.st_mtime)
        self.assertTrue(file_stat.st_nlink == 2)
        self.assertEqual(linked, {DATA_IMG_1: expected_hash})
예제 #7
0
    def test_get_update_cache(self):
        # After _update_cache, the cache must hold a file for the key whose
        # checksum matches the source image.
        objects_dir = os.path.join(self.tmp_dir, 'objectsfs')
        object_store = MultihashFS(objects_dir)
        key = object_store.put(HDATA_IMG_1)

        cache_dir = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cache_dir, '', '')

        bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        config = get_sample_config_spec(bucket, testprofile, testregion)

        repo = LocalRepository(config, objects_dir)
        repo._update_cache(cache, key)

        self.assertTrue(os.path.exists(cache.get_keypath(key)))
        self.assertEqual(
            self.md5sum(HDATA_IMG_1),
            self.md5sum(cache.get_keypath(key)),
        )
예제 #8
0
 def test_update(self):
     # Cache.update() driven by a one-entry manifest: afterwards the data
     # file must have more than one hard link and the cache must report
     # the manifest's hash as present.
     image_hash = 'zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u'

     cache_root = os.path.join(
         os.path.join(self.tmp_dir, '.ml-git'), 'objects-test')
     manifest_path = os.path.join(self.tmp_dir, 'manifest.yaml')
     yaml_save({image_hash: {'think-hires.jpg'}}, manifest_path)

     data = os.path.join(self.test_dir, 'data')
     cache = Cache(cache_root, data, manifest_path)
     cache.update()

     image_path = os.path.join(self.test_dir, data, 'think-hires.jpg')
     set_write_read(image_path)
     link_count = os.stat(image_path).st_nlink
     self.assertTrue(link_count > 1)
     self.assertTrue(cache.exists(image_hash))
예제 #9
0
    def check_and_update(self, key, value, hfs, filepath, fullpath, cache):
        """Compare a file on disk with its index entry and update the index.

        Args:
            key: Index key of the entry being checked.
            value: Index entry; ``ctime``, ``mtime`` and ``hash`` are read,
                ``untime`` is optional.
            hfs: Object providing ``get_scid(fullpath)``.
            filepath: Path compared with ``key`` and used in log messages.
            fullpath: Path that is stat'ed and hashed.
            cache: Argument passed to ``Cache`` to locate the cached copy.

        Returns:
            The newly computed hash when the content changed and the file
            is not marked with ``Status.c``; ``None`` in every other case.
        """
        file_stat = os.stat(fullpath)
        same_entry = key == filepath

        if (same_entry and value['ctime'] == file_stat.st_ctime
                and value['mtime'] == file_stat.st_mtime):
            log.debug('File [%s] already exists in ml-git repository' % filepath, class_name=MULTI_HASH_CLASS_NAME)
            return None

        # NOTE(review): `and` binds tighter than `or`, so a differing mtime
        # alone counts as "modified" even when key != filepath. The grouping
        # below only makes the existing precedence explicit; confirm with
        # callers whether `key == filepath and (ctime or mtime differs)`
        # was intended.
        looks_modified = (
            (same_entry and value['ctime'] != file_stat.st_ctime)
            or value['mtime'] != file_stat.st_mtime
        )
        if not looks_modified:
            return None

        log.debug('File [%s] was modified' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        scid = hfs.get_scid(fullpath)
        if value['hash'] == scid:
            # Timestamps moved but the content hash is unchanged.
            return None

        new_status = Status.a.name
        previous = value['hash']
        result = scid

        mutability = self._mutability
        flexible = mutability == Mutability.FLEXIBLE.value
        strict = mutability == Mutability.STRICT.value
        changed_while_locked = (value['mtime'] != file_stat.st_mtime
                                and 'untime' not in value)

        in_bare_mode = os.path.exists(os.path.join(self._path, 'metadata', self._spec, 'bare'))
        if strict or (flexible and changed_while_locked):
            new_status, previous, result = Status.c.name, None, None

            # Remove the cached copy stored under the old hash.
            cached_copy = Cache(cache).get_keypath(value['hash'])
            if os.path.exists(cached_copy):
                os.unlink(cached_copy)
        elif in_bare_mode and mutability == Mutability.MUTABLE.value:
            print('\n')
            log.warn('The file %s already exists in the repository. If you commit, the'
                     ' file will be overwritten.' % filepath,
                     class_name=MULTI_HASH_CLASS_NAME)

        # The index always receives the new hash, even when None is returned.
        self.update_full_index(posix_path(filepath), fullpath, new_status, scid, previous)
        return result