def garbage_collector(self):
    """Discard unreferenced blobs from the cache and object store of every initialized entity.

    For each entity type whose metadata exists, collects the blob hashes that
    are still referenced and asks the cache and object storages to remove
    everything else. Logs the totals, or an error when no metadata at all is
    initialized.
    """
    found_metadata = False
    total_removed = 0
    total_reclaimed = 0
    for entity in EntityType:
        repo_type = entity.value
        if not self.metadata_exists(repo_type):
            continue
        found_metadata = True
        log.info(output_messages['INFO_STARTING_GC'] % repo_type, class_name=REPOSITORY_CLASS_NAME)
        index_path = get_index_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        # Hashes still referenced by the index/objects; anything else is garbage.
        blobs_hashes = self._get_blobs_hashes(index_path, objects_path, repo_type)
        cache_removed, cache_reclaimed = Cache(get_cache_path(self.__config, repo_type)).garbage_collector(blobs_hashes)
        objects_removed, objects_reclaimed = Objects('', objects_path).garbage_collector(blobs_hashes)
        total_reclaimed += objects_reclaimed + cache_reclaimed
        total_removed += objects_removed + cache_removed
    if not found_metadata:
        log.error(output_messages['ERROR_UNINITIALIZED_METADATA'], class_name=REPOSITORY_CLASS_NAME)
        return
    log.info(output_messages['INFO_REMOVED_FILES'] % (humanize.intword(total_removed),
                                                      os.path.join(get_root_path(), '.ml-git')),
             class_name=REPOSITORY_CLASS_NAME)
    log.info(output_messages['INFO_RECLAIMED_SPACE'] % humanize.naturalsize(total_reclaimed),
             class_name=REPOSITORY_CLASS_NAME)
def create_hard_links_in_cache(self, cache_path, index_path, is_shared_cache, mutability, path, spec):
    """Hard-link workspace files into the cache when the spec's mutability requires it.

    Only the strict and flexible mutability modes keep a cache of immutable
    blobs; mutable specs skip the linking entirely. The umask is adjusted for
    the duration of the routine when the cache is shared.
    """
    manifest_file = os.path.join(index_path, 'metadata', spec, MANIFEST_FILE)
    cached_modes = (Mutability.STRICT.value, Mutability.FLEXIBLE.value)
    with change_mask_for_routine(is_shared_cache):
        if mutability in cached_modes:
            Cache(cache_path, path, manifest_file).update()
def test_get_update_links_wspace_with_duplicates(self):
    """Linking duplicate-content files into the workspace, then pruning stale links.

    Scenario 1: two workspace entries (DATA_IMG_1, DATA_IMG_2) map to the same
    content hash; both must be materialized as hard links to the one cached blob.
    Scenario 2: a file present in the workspace but absent from mfiles must be
    removed by _remove_unused_links_wspace.
    """
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    # Two distinct workspace paths backed by the same blob (key).
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
    wspace_file = os.path.join(wspath, DATA_IMG_1)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    wspace_file = os.path.join(wspath, DATA_IMG_2)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    # One cached copy + two workspace links -> 3 links to the same inode.
    self.assertTrue(st.st_nlink == 3)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                              DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
    # --- Scenario 2: stale-link removal ------------------------------------
    wspath = os.path.join(self.tmp_dir, 'wspace')
    ensure_path_exists(wspath)
    # Plant an untracked file that the pruning pass must delete.
    to_be_removed = os.path.join(wspath, 'to_be_removed')
    with open(to_be_removed, 'w') as f:
        f.write('DEAD\n')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
    r._remove_unused_links_wspace(wspath, mfiles)
    # The untracked file is not in mfiles, so it must be gone.
    self.assertFalse(os.path.exists(to_be_removed))
def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
    """Re-classify a tracked file after a content/metadata change and sync the full index.

    Returns the new content hash (``scid``) when the entry remains usable, or
    ``None`` when the file is marked corrupted. For strict specs (or flexible
    specs whose file was modified without being unlocked) the file is either
    recognized as restored to its previous hash or flagged corrupted; in the
    corrupted case the stale cached blob is unlinked so it cannot be reused.
    """
    status = Status.a.name
    prev_hash = value['hash']
    scid_ret = scid
    is_flexible = self._mutability == MutabilityType.FLEXIBLE.value
    is_strict = self._mutability == MutabilityType.STRICT.value
    # Modified (mtime changed) without having been explicitly unlocked.
    not_unlocked = value['mtime'] != st.st_mtime and 'untime' not in value
    bare_mode = os.path.exists(os.path.join(self._path, 'metadata', self._spec, 'bare'))
    if (is_flexible and not_unlocked) or is_strict:
        if value['status'] == Status.c.name and 'previous_hash' in value:
            prev_hash = value['previous_hash']
            if scid == prev_hash:
                # Content matches the pre-corruption hash: the file was restored.
                prev_hash = None
                status = Status.u.name
                log.debug(output_messages['DEBUG_RESTORED_FILE'].format(
                    posix_path(filepath)), class_name=MULTI_HASH_CLASS_NAME)
        else:
            # Unexpected modification: flag as corrupted and drop the stale cache entry.
            status = Status.c.name
            scid_ret = None
            file_path = Cache(cache).get_keypath(value['hash'])
            if os.path.exists(file_path):
                os.unlink(file_path)
    elif bare_mode and self._mutability == MutabilityType.MUTABLE.value:
        print('\n')
        log.warn(output_messages['WARN_FILE_EXISTS_IN_REPOSITORY'] % filepath,
                 class_name=MULTI_HASH_CLASS_NAME)
    self.update_full_index(posix_path(filepath), fullpath, status, scid, prev_hash)
    return scid_ret
def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
    """Classify a modified tracked file, sync the cache, and record it in the full index.

    Returns the new content hash (``scid``) when the entry stays valid, or
    ``None`` when the file is flagged as corrupted (strict specs, or flexible
    specs modified without being unlocked).
    """
    mutability = self._mutability
    # Modified (mtime changed) without having been explicitly unlocked.
    modified_while_locked = (mutability == Mutability.FLEXIBLE.value
                             and value['mtime'] != st.st_mtime and 'untime' not in value)
    flag_corrupted = modified_while_locked or mutability == Mutability.STRICT.value
    bare_mode = os.path.exists(os.path.join(self._path, 'metadata', self._spec, 'bare'))
    if flag_corrupted:
        status = Status.c.name
        prev_hash = None
        scid_ret = None
        # Drop the stale cached blob so it cannot be reused for the modified file.
        cached_blob = Cache(cache).get_keypath(value['hash'])
        if os.path.exists(cached_blob):
            os.unlink(cached_blob)
    else:
        status = Status.a.name
        prev_hash = value['hash']
        scid_ret = scid
        if bare_mode and mutability == Mutability.MUTABLE.value:
            print('\n')
            log.warn('The file %s already exists in the repository. If you commit, the'
                     ' file will be overwritten.' % filepath, class_name=MULTI_HASH_CLASS_NAME)
    self.update_full_index(posix_path(filepath), fullpath, status, scid, prev_hash)
    return scid_ret
def test_get_update_links_wspace(self):
    """A cached blob must be hard-linked into the workspace and tracked by the full index."""
    workspace = os.path.join(self.tmp_dir, 'wspace')
    objects_dir = os.path.join(self.tmp_dir, 'objectsfs')
    objects_fs = MultihashFS(objects_dir)
    key = objects_fs.put(HDATA_IMG_1)
    full_index = FullIndex(self.tmp_dir, self.tmp_dir)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    config = get_sample_config_spec(bucket, testprofile, testregion)
    repo = LocalRepository(config, objects_dir)
    repo._update_cache(cache, key)
    mfiles = {}
    tracked = {DATA_IMG_1}
    repo._update_links_wspace(cache, full_index, tracked, key, workspace, mfiles, Status.u.name, 'strict')
    linked_file = os.path.join(workspace, DATA_IMG_1)
    set_write_read(linked_file)
    self.assertTrue(os.path.exists(linked_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(linked_file))
    stat_info = os.stat(linked_file)
    for entry_path, entry in full_index.get_index().items():
        self.assertEqual(entry_path, os.path.join('data', 'imghires.jpg'))
        self.assertEqual(entry['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
        self.assertEqual(entry['status'], 'u')
        self.assertEqual(entry['ctime'], stat_info.st_ctime)
        self.assertEqual(entry['mtime'], stat_info.st_mtime)
    # Cache entry + workspace link share the same inode.
    self.assertTrue(stat_info.st_nlink == 2)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
def test_get_update_cache(self):
    """_update_cache must materialize the blob at its cache key path with intact content."""
    objects_dir = os.path.join(self.tmp_dir, 'objectsfs')
    key = MultihashFS(objects_dir).put(HDATA_IMG_1)
    cache = Cache(os.path.join(self.tmp_dir, 'cachefs'), '', '')
    bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    config = get_sample_config_spec(bucket, testprofile, testregion)
    repo = LocalRepository(config, objects_dir)
    repo._update_cache(cache, key)
    cached_file = cache.get_keypath(key)
    self.assertTrue(os.path.exists(cached_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cached_file))
def test_update(self):
    """Cache.update() must hard-link manifest entries from the data dir into the cache.

    Verifies that the linked file gains an extra hard link (st_nlink > 1) and
    that the blob becomes visible through Cache.exists().
    """
    blob_key = 'zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u'
    mlgit_dir = os.path.join(self.tmp_dir, '.ml-git')
    objectpath = os.path.join(mlgit_dir, 'objects-test')
    manifest = os.path.join(self.tmp_dir, 'manifest.yaml')
    yaml_save({blob_key: {'think-hires.jpg'}}, manifest)
    data = os.path.join(self.test_dir, 'data')
    c = Cache(objectpath, data, manifest)
    c.update()
    # BUG FIX: the original used os.path.join(self.test_dir, data, ...) even
    # though `data` is already prefixed with self.test_dir. os.path.join only
    # yields the right path because an absolute `data` discards the earlier
    # component; with a relative test_dir the path would be silently doubled.
    file_path = os.path.join(data, 'think-hires.jpg')
    set_write_read(file_path)
    st = os.stat(file_path)
    self.assertTrue(st.st_nlink > 1)
    self.assertTrue(c.exists(blob_key))
def check_and_update(self, key, value, hfs, filepath, fullpath, cache):
    """Compare an index entry against the file on disk and update its status if modified.

    Returns ``None`` when the file is unchanged (or when the entry does not
    match ``filepath``), otherwise classifies the modification: for strict
    specs (or flexible specs modified without being unlocked) the file is
    flagged corrupted and the stale cached blob is unlinked; the full index is
    updated either way. Returns the new content hash, or ``None`` when the
    entry was invalidated.
    """
    st = os.stat(fullpath)
    if key == filepath and value['ctime'] == st.st_ctime and value['mtime'] == st.st_mtime:
        log.debug('File [%s] already exists in ml-git repository' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        return None
    # BUG FIX: the original condition read
    #     key == filepath and value['ctime'] != st.st_ctime or value['mtime'] != st.st_mtime
    # and, because `and` binds tighter than `or`, fired on any mtime mismatch
    # even when key != filepath. Parenthesize to match the intent mirrored by
    # the unchanged-file branch above.
    elif key == filepath and (value['ctime'] != st.st_ctime or value['mtime'] != st.st_mtime):
        log.debug('File [%s] was modified' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        scid = hfs.get_scid(fullpath)
        if value['hash'] != scid:
            status = Status.a.name
            prev_hash = value['hash']
            scid_ret = scid
            is_flexible = self._mutability == Mutability.FLEXIBLE.value
            is_strict = self._mutability == Mutability.STRICT.value
            # Modified (mtime changed) without having been explicitly unlocked.
            not_unlocked = value['mtime'] != st.st_mtime and 'untime' not in value
            bare_mode = os.path.exists(os.path.join(self._path, 'metadata', self._spec, 'bare'))
            if (is_flexible and not_unlocked) or is_strict:
                status = Status.c.name
                prev_hash = None
                scid_ret = None
                # Drop the stale cached blob so it cannot be reused.
                file_path = Cache(cache).get_keypath(value['hash'])
                if os.path.exists(file_path):
                    os.unlink(file_path)
            elif bare_mode and self._mutability == Mutability.MUTABLE.value:
                print('\n')
                log.warn('The file %s already exists in the repository. If you commit, the'
                         ' file will be overwritten.' % filepath, class_name=MULTI_HASH_CLASS_NAME)
            self.update_full_index(posix_path(filepath), fullpath, status, scid, prev_hash)
            return scid_ret
    return None