Exemplo n.º 1
0
    def test_validate_config_spec_hash(self):
        # Success case
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        self.assertTrue(validate_config_spec_hash(spec))

        # Same but with s3 instead of s3h
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec['store']['s3'] = spec['store'].pop('s3h')
        self.assertTrue(validate_config_spec_hash(spec))

        # None or empty cases
        self.assertFalse(validate_config_spec_hash(None))
        self.assertFalse(validate_config_spec_hash({}))

        # Missing elements
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec['store'].pop('s3h')
        self.assertFalse(validate_config_spec_hash(spec))
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec.pop('store')
        self.assertFalse(validate_config_spec_hash(spec))
Exemplo n.º 2
0
    def test_validate_config_spec_hash(self):
        # Success case
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        self.assertTrue(validate_config_spec_hash(spec))

        # Same but with s3 instead of s3h
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec[STORAGE_CONFIG_KEY][S3] = spec[STORAGE_CONFIG_KEY].pop(S3H)
        self.assertTrue(validate_config_spec_hash(spec))

        # None or empty cases
        self.assertFalse(validate_config_spec_hash(None))
        self.assertFalse(validate_config_spec_hash({}))

        # Missing elements
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec[STORAGE_CONFIG_KEY].pop(S3H)
        self.assertFalse(validate_config_spec_hash(spec))
        spec = get_sample_config_spec('somebucket', 'someprofile',
                                      'someregion')
        spec.pop(STORAGE_CONFIG_KEY)
        self.assertFalse(validate_config_spec_hash(spec))
Exemplo n.º 3
0
    def test_fetch(self):
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        config_spec = get_sample_config_spec(testbucketname, testprofile,
                                             testregion)
        dataset_spec = get_sample_spec(testbucketname)

        specpath = os.path.join(mdpath, 'vision-computing', 'images',
                                'dataset-ex')
        ensure_path_exists(specpath)
        yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))

        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(
            {
                'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh':
                {'imghires.jpg'}
            }, manifestpath)

        objectpath = os.path.join(self.tmp_dir, 'objects-test')
        spec = 'vision-computing__images__dataset-ex__5'

        r = LocalRepository(config_spec, objectpath)
        r.fetch(mdpath, spec, None)

        fs = set()
        for root, dirs, files in os.walk(objectpath):
            for file in files:
                fs.add(file)

        self.assertEqual(len(hs), len(fs))
        self.assertTrue(len(hs.difference(fs)) == 0)
Exemplo n.º 4
0
    def test_get_update_links_wspace(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        set_write_read(wspace_file)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        fi = fidx.get_index()
        for k, v in fi.items():
            self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
            self.assertEqual(v['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
            self.assertEqual(v['status'], 'u')
            self.assertEqual(v['ctime'], st.st_ctime)
            self.assertEqual(v['mtime'], st.st_mtime)
        self.assertTrue(st.st_nlink == 2)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
Exemplo n.º 5
0
    def unlock_file(self, spec, file_path):
        repo_type = self.__repo_type

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid.  It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:
            refs_path = get_refs_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()
            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return

        spec_path = os.path.join(path, file)
        spec_file = yaml_load(spec_path)

        try:
            mutability = spec_file[repo_type]['mutability']
            if mutability not in Mutability.list():
                log.error('Invalid mutability type.',
                          class_name=REPOSITORY_CLASS_NAME)
                return
        except Exception:
            log.info(
                'The spec does not have the \'mutability\' property set. Default: strict.',
                class_name=REPOSITORY_CLASS_NAME)
            return

        if mutability != Mutability.STRICT.value:
            try:
                local = LocalRepository(self.__config, objects_path, repo_type)
                local.unlock_file(path, file_path, index_path, objects_path,
                                  spec, cache_path)
            except Exception as e:
                log.error(e, class_name=REPOSITORY_CLASS_NAME)
                return
        else:
            log.error(
                'You cannot use this command for this entity because mutability cannot be strict.',
                class_name=REPOSITORY_CLASS_NAME)
Exemplo n.º 6
0
    def test_get_update_links_wspace_with_duplicates(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

        wspace_file = os.path.join(wspath, DATA_IMG_2)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        self.assertTrue(st.st_nlink == 3)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                                  DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

        wspath = os.path.join(self.tmp_dir, 'wspace')
        ensure_path_exists(wspath)
        to_be_removed = os.path.join(wspath, 'to_be_removed')
        with open(to_be_removed, 'w') as f:
            f.write('DEAD\n')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
        r._remove_unused_links_wspace(wspath, mfiles)
        self.assertFalse(os.path.exists(to_be_removed))
Exemplo n.º 7
0
    def test_get_update_cache(self):
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)

        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        self.assertTrue(os.path.exists(cache.get_keypath(key)))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(key)))
Exemplo n.º 8
0
    def add(self, spec, file_path, bump_version=False, run_fsck=False):
        repo_type = self.__repo_type

        is_shared_objects = 'objects_path' in self.__config[repo_type]
        is_shared_cache = 'cache_path' in self.__config[repo_type]

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid. It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:

            refs_path = get_refs_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)
            sampling_flag = os.path.exists(
                os.path.join(index_path, 'metadata', spec, 'sampling'))
            if sampling_flag:
                log.error(
                    'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                    class_name=REPOSITORY_CLASS_NAME)
                return

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return

            if not self._has_new_data(repo, spec):
                return None

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()

            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return
        spec_path = os.path.join(path, file)
        if not self._is_spec_valid(spec_path):
            return None

        # Check tag before anything to avoid creating unstable state
        log.debug('Repository: check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)

        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        try:
            m.update()
        except Exception:
            pass

        # get version of current manifest file
        manifest = self._get_current_manifest_file(m, tag)

        try:
            # adds chunks to ml-git Index
            log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                     class_name=REPOSITORY_CLASS_NAME)
            with change_mask_for_routine(is_shared_objects):
                idx = MultihashIndex(spec, index_path, objects_path,
                                     mutability, cache_path)
                idx.add(path, manifest, file_path)

            # create hard links in ml-git Cache
            self.create_hard_links_in_cache(cache_path, index_path,
                                            is_shared_cache, mutability, path,
                                            spec)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return None

        if bump_version and not increment_version_in_spec(
                spec_path, self.__repo_type):
            return None

        idx.add_metadata(path, file)

        self._check_corrupted_files(spec, repo)

        # Run file check
        if run_fsck:
            self.fsck()