def test_get(self):
    """After adding a directory, a file must be retrievable by its multihash."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', '')
    retrieved = index.get(
        'zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u',
        self.tmp_dir,
        'think-hires.jpg')
    self.assertEqual(singlefile.get('index'), retrieved)
def test_add_idmpotent(self):
    # NOTE(review): name is likely a typo for "idempotent"; kept to preserve
    # the discovered test name.
    """Adding the same directory twice must leave a single set of manifest entries."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', '')
    index.add('data', '')
    manifest_file = os.path.join(
        self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
    self.assertEqual(yaml_load(manifest_file), singlefile['manifest'])
def test_add_manifest(self):
    """Files already listed in a supplied manifest must not be stored again."""
    manifest_file = os.path.join(self.tmp_dir, 'MANIFEST.yaml')
    yaml_save(singlefile['manifest'], manifest_file)
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', manifest_file)
    # the manifest's files must not appear under the index 'files' area
    unexpected = os.path.join(
        self.tmp_dir, 'files', 'dataset-spec', 'MANIFEST.yaml')
    self.assertFalse(os.path.exists(unexpected))
def _get_blobs_hashes(self, index_path, objects_path, repo_type):
    """Collect blob hashes from every entity index found under ``index_path``.

    Walks ``<index_path>/metadata`` treating each directory name as an entity
    spec. Entities that fail validation (e.g. deleted ones) are skipped with
    a debug log entry rather than aborting the whole scan.
    """
    collected = []
    metadata_root = os.path.join(index_path, 'metadata')
    for _root, dirs, _files in os.walk(metadata_root):
        for spec in dirs:
            try:
                self._check_is_valid_entity(repo_type, spec)
                entity_index = MultihashIndex(spec, index_path, objects_path)
                collected.extend(entity_index.get_hashes_list())
            except Exception:
                log.debug(output_messages['INFO_ENTITY_DELETED'] % spec,
                          class_name=REPOSITORY_CLASS_NAME)
    return collected
def test_put(self):
    """update_index must make a newly-added file's hash visible in the index."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', self.tmp_dir)
    manifest = index.get_index()
    self.assertTrue(
        manifest.exists('zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u'))
    index.add('image.jpg', self.tmp_dir)
    index.update_index(
        'zdj7WemKEtQMVL81UU6PSuYaoxvBQ6CiUMq1fMvoXBhPUsCK2', 'image.jpg')
    self.assertTrue(
        manifest.exists('zdj7WemKEtQMVL81UU6PSuYaoxvBQ6CiUMq1fMvoXBhPUsCK2'))
def test_add_full_index(self):
    """INDEX.yaml must land under metadata/<spec> with hash and status per file."""
    manifest_file = os.path.join(self.tmp_dir, 'MANIFEST.yaml')
    yaml_save(singlefile['manifest'], manifest_file)
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', manifest_file)
    full_index = yaml_load(
        os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
    self.assertTrue(len(full_index) > 0)
    for file_name, entry in full_index.items():
        self.assertEqual(file_name, 'think-hires.jpg')
        self.assertEqual(
            entry['hash'],
            'zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u')
        self.assertEqual(entry['status'], 'a')
    # INDEX.yaml must not be written directly under the tmp dir
    self.assertFalse(os.path.exists(
        os.path.join(self.tmp_dir, 'dataset-spec', 'INDEX.yaml')))
def test_add(self):
    """Adding a directory must produce the expected manifest and full index."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    # TODO: there is incorrect behavior here. During unit test runs, the link
    # count can be > 1 in some cases incorrectly, so the file doesn't get added
    # to the index. This looks like a design issue for index.py add_file in
    # general; for now the unit tests don't trust this data and add the file
    # anyway via a trust_links parameter that defaults to True and cascades
    # through the calls.
    index.add('data', '')
    manifest_file = os.path.join(
        self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
    self.assertEqual(yaml_load(manifest_file), singlefile['manifest'])
    full_index = yaml_load(
        os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
    for _name, entry in full_index.items():
        self.assertEqual(entry['hash'], singlefile['datastore'])
def test_push(self):
    """End-to-end: add to index, commit objects, push, verify keys exist in S3."""
    mlgit_dir = os.path.join(self.tmp_dir, '.ml-git')
    index_path = os.path.join(mlgit_dir, 'index-test')
    metadata_path = os.path.join(mlgit_dir, 'metadata-test')
    objects_path = os.path.join(mlgit_dir, 'objects-test')
    spec_dir = os.path.join(metadata_path, 'vision-computing/images/dataset-ex')
    ensure_path_exists(spec_dir)
    ensure_path_exists(index_path)
    shutil.copy('hdata/dataset-ex.spec', spec_dir + '/dataset-ex.spec')
    shutil.copy('hdata/config.yaml', mlgit_dir + '/config.yaml')
    manifest_file = os.path.join(spec_dir, 'MANIFEST.yaml')
    yaml_save(
        {'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
        manifest_file)
    # adds chunks to ml-git Index
    index = MultihashIndex(spec_dir, index_path, objects_path)
    index.add('data-test-push/', manifest_file)
    full_index = yaml_load(os.path.join(spec_dir, 'INDEX.yaml'))
    self.assertTrue(len(full_index) > 0)
    self.assertTrue(os.path.exists(index_path))
    objects = Objects(spec_dir, objects_path)
    objects.commit_index(index_path, self.tmp_dir)
    self.assertTrue(os.path.exists(objects_path))
    config = yaml_load('hdata/config.yaml')
    repository = LocalRepository(config, objects_path)
    repository.push(objects_path, spec_dir + '/dataset-ex.spec')
    # fake credentials: the bucket is expected to be mocked in this suite
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    for key in index.get_index():
        self.assertIsNotNone(s3.Object(testbucketname, key))
def fsck(self):
    """Scan the local object store and the index for corrupted files.

    Prints one summary line per store plus a grand total. Returns silently
    if the repository root cannot be resolved.
    """
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
    except RootPathException:
        return
    objects = Objects('', objects_path)
    corrupted_objects = objects.fsck()
    index = MultihashIndex('', index_path, objects_path)
    corrupted_index = index.fsck()
    objects_count = len(corrupted_objects)
    index_count = len(corrupted_index)
    print('[%d] corrupted file(s) in Local Repository: %s'
          % (objects_count, corrupted_objects))
    print('[%d] corrupted file(s) in Index: %s'
          % (index_count, corrupted_index))
    print('Total of corrupted files: %d' % (objects_count + index_count))
def test_push(self):
    """Push must succeed (return 0) and leave exactly one entry in the full index."""
    index_path = os.path.join(self.tmp_dir, 'index-test')
    metadata_path = os.path.join(self.tmp_dir, 'metadata-test')
    objects_path = os.path.join(self.tmp_dir, 'objects-test')
    spec_dir = os.path.join(metadata_path, 'vision-computing/images/dataset-ex')
    ensure_path_exists(index_path)
    ensure_path_exists(spec_dir)
    shutil.copy('hdata/dataset-ex.spec', spec_dir + '/dataset-ex.spec')
    manifest_file = os.path.join(spec_dir, 'MANIFEST.yaml')
    yaml_save(files_mock, manifest_file)
    index = MultihashIndex(spec_dir, index_path, objects_path)
    index.add('data-test-push-1/', manifest_file)
    full_index = FullIndex(spec_dir, index_path)
    self.assertTrue(os.path.exists(index_path))
    config = yaml_load('hdata/config.yaml')
    objects = Objects(spec_dir, objects_path)
    objects.commit_index(index_path, self.tmp_dir)
    self.assertTrue(os.path.exists(objects_path))
    repository = LocalRepository(config, objects_path)
    self.assertTrue(
        repository.push(objects_path, spec_dir + '/dataset-ex.spec') == 0)
    self.assertTrue(len(full_index.get_index()) == 1)
def test_add2(self):
    """A second add of a different directory must extend manifest and index."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    index.add('data', '')
    manifest_file = os.path.join(
        self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
    self.assertEqual(yaml_load(manifest_file), singlefile['manifest'])
    index_file = os.path.join(
        self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml')
    full_index = yaml_load(index_file)
    for _name, entry in full_index.items():
        self.assertEqual(entry['hash'], singlefile['datastore'])
    index.add('data2', '')
    self.assertEqual(yaml_load(manifest_file), secondfile['manifest'])
    full_index = yaml_load(index_file)
    observed_hashes = [entry['hash'] for entry in full_index.values()]
    self.assertIn(secondfile['datastore'], observed_hashes)
def test_remove_hash(self):
    """remove_hash must purge each hash from the hashfs storage log."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    data_path = str(self.test_dir / 'data')
    index.add(data_path, '')
    index.add(str(self.test_dir / 'data2'), '')
    hfs = HashFS(self.tmp_dir, blocksize=1024 * 1024)
    objects = Objects('dataset-spec', self.tmp_dir)
    objects.commit_index(self.tmp_dir, data_path)
    log_file = os.path.join(self.tmp_dir, 'hashfs', 'log', STORAGE_LOG)
    # every committed hash is recorded in the storage log
    for hash_value in hash_list:
        with open(log_file) as log_handle:
            self.assertTrue(hash_value in log_handle.read())
    for hash_value in hash_list:
        hfs.remove_hash(hash_value)
    # after removal, none of them remain in the log
    for hash_value in hash_list:
        with open(log_file) as log_handle:
            self.assertFalse(hash_value in log_handle.read())
def reset(self, spec, reset_type, head):
    """Reset the entity `spec` to `head` using git-like semantics.

    reset_type: '--soft' (keep index), '--mixed' (drop index entries) or
    '--hard' (drop index entries and remove files from the workspace).
    head: target ref; HEAD with --soft/--mixed is a no-op by definition.
    Errors during setup or metadata reset are logged and abort silently.
    """
    log.info(output_messages['INFO_INITIALIZING_RESET'] % (reset_type, head),
             class_name=REPOSITORY_CLASS_NAME)
    # Resetting soft/mixed to the current HEAD changes nothing; bail out early.
    if (reset_type == '--soft' or reset_type == '--mixed') and head == HEAD:
        return
    try:
        repo_type = self.__repo_type
        metadata_path = get_metadata_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        object_path = get_objects_path(self.__config, repo_type)
        met = Metadata(spec, metadata_path, self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        idx = MultihashIndex(spec, index_path, object_path)
        fidx = FullIndex(spec, index_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    # get tag before reset
    tag = met.get_current_tag()
    categories_path = get_path_with_categories(str(tag))
    # current manifest file before reset
    manifest_path = os.path.join(metadata_path, categories_path, spec,
                                 MANIFEST_FILE)
    _manifest = Manifest(manifest_path).load()
    if head == HEAD_1:  # HEAD~1
        try:
            # reset the repo
            met.reset()
        except Exception:
            # metadata reset failed; leave refs/index untouched
            return
    # get tag after reset
    tag_after_reset = met.get_current_tag()
    sha = met.sha_from_tag(tag_after_reset)
    # update ml-git ref HEAD
    ref.update_head(str(tag_after_reset), sha)
    # get path to reset workspace in case of --hard
    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    # --hard needs a workspace path to remove files from; without one, abort.
    if reset_type == '--hard' and path is None:
        return
    # get manifest from metadata after reset; diff tells us which hashes and
    # file names changed between the old and new manifests
    _manifest_changed = Manifest(manifest_path)
    hash_files, file_names = _manifest_changed.get_diff(_manifest)
    idx_mf = idx.get_index().load()
    if reset_type == '--soft':
        # add in index/metadata/<entity-name>/MANIFEST
        idx.update_index_manifest(idx_mf)
        idx.update_index_manifest(hash_files)
        fidx.update_index_status(file_names, Status.a.name)
    else:  # --hard or --mixed
        # remove hash from index/hashsh/store.log
        file_names.update(*idx_mf.values())
        objs = MultihashFS(index_path)
        for key_hash in hash_files:
            objs.remove_hash(key_hash)
        idx.remove_manifest()
        fidx.remove_from_index_yaml(file_names)
        fidx.remove_uncommitted()
    if reset_type == '--hard':  # reset workspace
        remove_from_workspace(file_names, path, spec)
def add(self, spec, file_path, bump_version=False, run_fsck=False):
    """Add workspace data for entity `spec` to the ml-git index.

    file_path: restricts the add to specific files (passed through to the
    index). bump_version: increment the spec version file before indexing
    metadata. run_fsck: run a full fsck afterwards. Returns None on any
    validation or indexing failure (errors are logged, not raised).
    """
    repo_type = self.__repo_type
    # shared objects/cache change how file permissions are masked below
    is_shared_objects = 'objects_path' in self.__config[repo_type]
    is_shared_cache = 'cache_path' in self.__config[repo_type]
    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s'
            % get_yaml_str(
                get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None
    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(
            spec, repo_type)
        # a 'sampling' marker means this entity came from a partial checkout;
        # adding to it would desynchronize the sample from its source
        sampling_flag = os.path.exists(
            os.path.join(index_path, 'metadata', spec, 'sampling'))
        if sampling_flag:
            log.error(
                'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                class_name=REPOSITORY_CLASS_NAME)
            return
        if not mutability:
            return
        if not check_mutability:
            log.error('Spec mutability cannot be changed.',
                      class_name=REPOSITORY_CLASS_NAME)
            return
        if not self._has_new_data(repo, spec):
            return None
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if path is None:
        return
    spec_path = os.path.join(path, file)
    if not self._is_spec_valid(spec_path):
        return None
    # Check tag before anything to avoid creating unstable state
    log.debug('Repository: check if tag already exists',
              class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type,
                  class_name=REPOSITORY_CLASS_NAME)
        return
    try:
        # best-effort metadata refresh; failure here is non-fatal
        m.update()
    except Exception:
        pass
    # get version of current manifest file
    manifest = self._get_current_manifest_file(m, tag)
    try:
        # adds chunks to ml-git Index
        log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                 class_name=REPOSITORY_CLASS_NAME)
        with change_mask_for_routine(is_shared_objects):
            idx = MultihashIndex(spec, index_path, objects_path, mutability,
                                 cache_path)
            idx.add(path, manifest, file_path)
        # create hard links in ml-git Cache
        self.create_hard_links_in_cache(cache_path, index_path,
                                        is_shared_cache, mutability, path,
                                        spec)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None
    if bump_version and not increment_version_in_spec(
            spec_path, self.__repo_type):
        return None
    idx.add_metadata(path, file)
    self._check_corrupted_files(spec, repo)
    # Run file check
    if run_fsck:
        self.fsck()
def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
    """Commit the indexed chunks of entity `spec` to .ml-git/objects and tag it.

    specs: related spec files to include in the metadata commit.
    version: if given, write it into the spec file before committing.
    msg: optional commit message. Returns the new tag on success; returns
    None (or a None tuple on a missing spec path) on failure.
    NOTE(review): return arity is inconsistent across failure paths
    (None vs. (None, None, None) vs. bare return) — callers must tolerate both.
    """
    # Move chunks from index to .ml-git/objects
    repo_type = self.__repo_type
    try:
        index_path = get_index_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(
            spec, repo_type)
        if not mutability:
            return
        if not check_mutability:
            log.error('Spec mutability cannot be changed.',
                      class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    ref = Refs(refs_path, spec, repo_type)
    tag, sha = ref.branch()
    categories_path = get_path_with_categories(tag)
    manifest_path = os.path.join(metadata_path, categories_path, spec,
                                 MANIFEST_FILE)
    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if path is None:
        return None, None, None
    spec_path = os.path.join(path, file)
    idx = MultihashIndex(spec, index_path, objects_path)
    if version:
        # bump the spec version and re-index the updated spec metadata
        set_version_in_spec(version, spec_path, self.__repo_type)
        idx.add_metadata(path, file)
    # Check tag before anything to avoid creating unstable state
    log.debug('Check if tag already exists',
              class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type,
                  class_name=REPOSITORY_CLASS_NAME)
        return
    full_metadata_path, categories_sub_path, metadata = m.tag_exists(
        index_path)
    if metadata is None:
        return None
    log.debug('%s -> %s' % (index_path, objects_path),
              class_name=REPOSITORY_CLASS_NAME)
    # commit objects in index to ml-git objects
    o = Objects(spec, objects_path)
    changed_files, deleted_files = o.commit_index(index_path, path)
    # a 'bare' marker means the workspace holds no data files for this entity
    bare_mode = os.path.exists(
        os.path.join(index_path, 'metadata', spec, 'bare'))
    if not bare_mode:
        manifest = m.get_metadata_manifest(manifest_path)
        self._remove_deleted_files(idx, index_path, m, manifest, spec,
                                   deleted_files)
        m.remove_files_added_after_base_tag(manifest, path)
    else:
        tag, _ = ref.branch()
        self._checkout_ref(tag)
    # update metadata spec & README.md
    # option --dataset-spec --labels-spec
    tag, sha = m.commit_metadata(index_path, specs, msg, changed_files,
                                 mutability, path)
    # update ml-git ref spec HEAD == to new SHA-1 / tag
    if tag is None:
        return None
    ref = Refs(refs_path, spec, repo_type)
    ref.update_head(tag, sha)
    # Run file check
    if run_fsck:
        self.fsck()
    return tag