Пример #1
0
    def test_spec_parse(self):

        tag = 'computer-vision__images__imagenet8__1'
        spec = 'imagenet8'
        categories = ['computer-vision', 'images', spec]
        version = '1'

        self.assertEqual((os.sep.join(categories), spec, version),
                         spec_parse(tag))
        self.assertRaises(SearchSpecException, lambda: spec_parse(''))
Пример #2
0
    def export(self, bucket, tag, retry):
        try:
            categories_path, spec_name, _ = spec_parse(tag)
            get_root_path()
            if not self._tag_exists(tag):
                return None, None
        except InvalidGitRepositoryError:
            log.error('You are not in an initialized ml-git repository.',
                      class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local = LocalRepository(
            self.__config, get_objects_path(self.__config, self.__repo_type),
            self.__repo_type)
        local.export_tag(get_metadata_path(self.__config, self.__repo_type),
                         tag, bucket, retry)

        self._checkout_ref()
Пример #3
0
 def _get_related_entity_info(spec_file, entity_type):
     related_entity = spec_file.get(entity_type, None)
     if related_entity:
         entity_tag = related_entity['tag']
         _, entity_name, version = spec_parse(entity_tag)
         return entity_tag, '{} - ({})'.format(entity_name, version)
     return None, None
Пример #4
0
def checkout(entity,
             tag,
             sampling=None,
             retries=2,
             force=False,
             dataset=False,
             labels=False,
             version=-1):
    """This command allows retrieving the data of a specific version of an ML entity.

    Example:
        checkout('datasets', 'computer-vision__images3__imagenet__1')

    Args:
        entity (str): The type of an ML entity. (datasets, labels or models)
        tag (str): An ml-git tag to identify a specific version of an ML entity.
        sampling (dict): group: <amount>:<group> The group sample option consists of amount and group used to
                                 download a sample.\n
                         range: <start:stop:step> The range sample option consists of start, stop and step used
                                to download a sample. The start parameter can be equal or greater than zero. The
                                stop parameter can be 'all', -1 or any integer above zero.\n
                         random: <amount:frequency> The random sample option consists of amount and frequency
                                used to download a sample.
                         seed: The seed is used to initialize the pseudorandom numbers.
        retries (int, optional): Number of retries to download the files from the storage [default: 2].
        force (bool, optional): Force checkout command to delete untracked/uncommitted files from the local repository [default: False].
        dataset (bool, optional): If exist a dataset related with the model or labels, this one must be downloaded [default: False].
        labels (bool, optional): If exist labels related with the model, they must be downloaded [default: False].

    Returns:
        str: Return the path where the data was checked out.

    """

    repo = get_repository_instance(entity)
    repo.update()
    if sampling is not None and not validate_sample(sampling):
        return None
    options = {}
    options['with_dataset'] = dataset
    options['with_labels'] = labels
    options['retry'] = retries
    options['force'] = force
    options['bare'] = False
    options['version'] = version
    repo.checkout(tag, sampling, options)

    spec_name = tag
    if re.search(RGX_TAG_FORMAT, tag):
        _, spec_name, _ = spec_parse(tag)
    spec_path, _ = search_spec_file(entity, spec_name)
    data_path = os.path.relpath(spec_path, get_root_path())
    if not os.path.exists(data_path):
        data_path = None
    return data_path
Пример #5
0
    def _checkout(self, tag, samples, options):
        dataset = options['with_dataset']
        labels = options['with_labels']
        retries = options['retry']
        force_get = options['force']
        bare = options['bare']
        version = options['version']
        repo_type = self.__repo_type
        try:
            cache_path = get_cache_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)

            if not re.search(RGX_TAG_FORMAT, tag):
                metadata_path = get_metadata_path(self.__config, repo_type)
                metadata = Metadata(tag, metadata_path, self.__config,
                                    repo_type)
                tag = metadata.get_tag(tag, version)
                if not tag:
                    return None, None
            elif not self._tag_exists(tag):
                return None, None
            categories_path, spec_name, _ = spec_parse(tag)
            root_path = get_root_path()
            ws_path = os.path.join(root_path,
                                   os.sep.join([repo_type, categories_path]))
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        ref = Refs(refs_path, spec_name, repo_type)
        cur_tag, _ = ref.branch()

        if cur_tag == tag:
            log.info('already at tag [%s]' % tag,
                     class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local_rep = LocalRepository(self.__config, objects_path, repo_type)
        # check if no data left untracked/uncommitted. otherwise, stop.
        if not force_get and local_rep.exist_local_changes(spec_name) is True:
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        dataset_tag, labels_tag = self._get_related_tags(
            categories_path, dataset, labels, metadata_path, repo_type,
            spec_name)

        fetch_success = self._fetch(tag, samples, retries, bare)
        if not fetch_success:
            objs = Objects('', objects_path)
            objs.fsck(remove_corrupted=True)
            self._checkout_ref()
            return None, None
        ensure_path_exists(ws_path)

        try:
            spec_index_path = os.path.join(
                get_index_metadata_path(self.__config, repo_type), spec_name)
        except Exception:
            return
        self._delete_spec_and_readme(spec_index_path, spec_name)

        try:
            r = LocalRepository(self.__config, objects_path, repo_type)
            r.checkout(cache_path, metadata_path, ws_path, tag, samples, bare)
        except OSError as e:
            self._checkout_ref()
            if e.errno == errno.ENOSPC:
                log.error(
                    'There is not enough space in the disk. Remove some files and try again.',
                    class_name=REPOSITORY_CLASS_NAME)
            else:
                log.error(
                    'An error occurred while creating the files into workspace: %s \n.'
                    % e,
                    class_name=REPOSITORY_CLASS_NAME)
                return None, None
        except Exception as e:
            self._checkout_ref()
            log.error(
                'An error occurred while creating the files into workspace: %s \n.'
                % e,
                class_name=REPOSITORY_CLASS_NAME)
            return None, None

        m = Metadata('', metadata_path, self.__config, repo_type)
        sha = m.sha_from_tag(tag)
        ref.update_head(tag, sha)

        # restore to master/head
        self._checkout_ref()
        return dataset_tag, labels_tag
Пример #6
0
    def _checkout(self,
                  tag,
                  samples,
                  retries=2,
                  force_get=False,
                  dataset=False,
                  labels=False,
                  bare=False):
        repo_type = self.__repo_type
        try:
            cache_path = get_cache_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            # find out actual workspace path to save data
            if not self._tag_exists(tag):
                return None, None
            categories_path, spec_name, _ = spec_parse(tag)
            dataset_tag = None
            labels_tag = None
            root_path = get_root_path()
            ws_path = os.path.join(root_path,
                                   os.sep.join([repo_type, categories_path]))
            ensure_path_exists(ws_path)
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        ref = Refs(refs_path, spec_name, repo_type)
        cur_tag, _ = ref.branch()

        if cur_tag == tag:
            log.info('already at tag [%s]' % tag,
                     class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local_rep = LocalRepository(self.__config, objects_path, repo_type)
        # check if no data left untracked/uncommitted. otherwise, stop.
        if not force_get and local_rep.exist_local_changes(spec_name) is True:
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        spec_path = os.path.join(metadata_path, categories_path,
                                 spec_name + '.spec')

        if dataset is True:
            dataset_tag = get_entity_tag(spec_path, repo_type, 'dataset')
        if labels is True:
            labels_tag = get_entity_tag(spec_path, repo_type, 'labels')

        fetch_success = self._fetch(tag, samples, retries, bare)

        if not fetch_success:
            objs = Objects('', objects_path)
            objs.fsck(remove_corrupted=True)
            self._checkout_ref('master')
            return None, None

        try:
            spec_index_path = os.path.join(
                get_index_metadata_path(self.__config, repo_type), spec_name)
        except Exception:
            return
        if os.path.exists(spec_index_path):
            if os.path.exists(
                    os.path.join(spec_index_path, spec_name + '.spec')):
                os.unlink(os.path.join(spec_index_path, spec_name + '.spec'))
            if os.path.exists(os.path.join(spec_index_path, 'README.md')):
                os.unlink(os.path.join(spec_index_path, 'README.md'))

        try:
            r = LocalRepository(self.__config, objects_path, repo_type)
            r.checkout(cache_path, metadata_path, objects_path, ws_path, tag,
                       samples, bare)
        except OSError as e:
            self._checkout_ref('master')
            if e.errno == errno.ENOSPC:
                log.error(
                    'There is not enough space in the disk. Remove some files and try again.',
                    class_name=REPOSITORY_CLASS_NAME)
            else:
                log.error(
                    'An error occurred while creating the files into workspace: %s \n.'
                    % e,
                    class_name=REPOSITORY_CLASS_NAME)
                return None, None
        except Exception as e:
            self._checkout_ref('master')
            log.error(
                'An error occurred while creating the files into workspace: %s \n.'
                % e,
                class_name=REPOSITORY_CLASS_NAME)
            return None, None

        m = Metadata('', metadata_path, self.__config, repo_type)
        sha = m.sha_from_tag(tag)
        ref.update_head(tag, sha)

        # restore to master/head
        self._checkout_ref('master')
        return dataset_tag, labels_tag
Пример #7
0
 def get_metadata_path(self, tag):
     _, specname, _ = spec_parse(tag)
     entity_dir = get_entity_dir(self.__repo_type, specname)
     return os.path.join(self.__path, entity_dir)