Example #1
    def _check_bucket(self):
        """
        Check bucket name is exist. If not exist, create new bucket
        If bucket and metadata sub folder exist, get metadata(attributes, compressor) from there.

        """
        _client = (Minio(self.endpoint,
                         access_key=self.access_key,
                         secret_key=self.secret_key,
                         secure=self.secure,
                         region=self.region)
                   if not check_nas(self.endpoint) else NAS(self.endpoint))
        if _client.bucket_exists(self.bucket_name):
            try:
                _metadata = _client.get_object(self.bucket_name,
                                               "metadata.json")
            except Exception:
                _client.remove_bucket(self.bucket_name)
                raise FileNotFoundError(
                    "metadata.json is not in bucket {},"
                    " so this bucket will be removed".format(
                        self.bucket_name))

            metadata_dict = json.loads(_metadata.read().decode("utf-8"))
            if self.endpoint != metadata_dict["endpoint"]:
                raise ValueError(
                    "The bucket was created with endpoint ({}), which does not"
                    " match the current endpoint ({})."
                    " This may cause a permission denied error.".format(
                        metadata_dict["endpoint"], self.endpoint))

            self.compressor = metadata_dict["compressor"]
            self.metadata = metadata_dict
        else:
            logger.info("{} {} is not exist!".format(self.optimizer_name,
                                                     str(self.additional)))
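
For reference, `_check_bucket` expects a `metadata.json` object at the bucket root containing at least `endpoint` and `compressor` keys (Example #8 additionally reads an `attributes` list). A minimal sketch of writing such an object with minio-py's `put_object`, using illustrative values; in practice the saver side of matorage produces this file:

import io
import json

from minio import Minio

# Illustrative values only; the real metadata is written by the saver side.
metadata = {
    "endpoint": "127.0.0.1:9000",
    "compressor": {"complevel": 0, "complib": "zlib"},
}

client = Minio("127.0.0.1:9000", access_key="minio",
               secret_key="miniosecretkey", secure=False)

payload = json.dumps(metadata).encode("utf-8")
client.put_object("my-bucket", "metadata.json",
                  io.BytesIO(payload), length=len(payload))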
Example #2
    def __init__(
        self,
        config,
        num_worker_threads=4,
        clear=True,
        cache_folder_path="~/.matorage",
        index=False,
    ):
        self.config = config
        self.attribute = self._set_attribute()

        # Storage configuration
        self.num_worker_threads = num_worker_threads
        self.clear = clear
        self.index = index

        self._check_bucket()

        # Merge all metadata and load it into memory.
        self.merged_indexer, self.merged_filetype = self._merge_metadata()
        self.end_indices = list(self.merged_indexer.keys())

        self._clients = {}

        if not self.index:
            # Cache objects that have already been downloaded.
            if not check_nas(self.config.endpoint):
                self._caching(cache_folder_path=cache_folder_path)
            else:
                self._object_file_mapper = {}

            # Download all objects into a temporary folder.
            self._init_download()

            atexit.register(self._exit)
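
A hypothetical instantiation of this loader. The `DataConfig` and `Dataset` names below are assumptions inferred from the matorage package layout and this constructor's signature, not confirmed by the snippet itself:

# Hypothetical usage sketch: module paths, class names, and DataConfig
# arguments are assumptions, not confirmed API.
from matorage import DataConfig
from matorage.torch import Dataset

config = DataConfig(
    endpoint="127.0.0.1:9000",
    access_key="minio",
    secret_key="miniosecretkey",
    dataset_name="mnist",
)

dataset = Dataset(config, num_worker_threads=8, clear=True)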
Example #3
    def _create_client(self):
        return (Minio(
            endpoint=self.config.endpoint,
            access_key=self.config.access_key,
            secret_key=self.config.secret_key,
            secure=self.config.secure,
        ) if not check_nas(self.config.endpoint) else NAS(
            self.config.endpoint))
Example #4
    def _create_client(self):
        return (
            Minio(
                endpoint=self.endpoint,
                access_key=self.access_key,
                secret_key=self.secret_key,
                secure=self.secure,
                region=self.region,
            )
            if not check_nas(self.endpoint)
            else NAS(self.endpoint)
        )
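
Every example dispatches on `check_nas` to decide between an S3-compatible `Minio` client and a local `NAS` client. Its implementation is not shown here; a minimal sketch of the idea, assuming a NAS endpoint is simply a mounted filesystem path rather than a `host:port` address:

import os


def check_nas(endpoint):
    # Assumption: an S3-compatible endpoint looks like "host:port", while
    # a NAS endpoint is a plain filesystem path such as "/mnt/storage".
    # The real matorage implementation may differ.
    return os.path.isabs(os.path.expanduser(endpoint))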
Example #5
    def _exit(self):
        """
        Close all opened files and remove.

        """

        if self.clear and not check_nas(self.config.endpoint):
            for _local_file in list(self._object_file_mapper.values()):
                if os.path.exists(_local_file):
                    os.remove(_local_file)
            if os.path.exists(self.cache_path):
                os.remove(self.cache_path)
Example #6
    def _init_download(self):
        """
        Download all object from bucket with multi thread.
        cache to `_object_file_mapper` downloaded object paths.

        """
        _client = self._create_client()
        _downloader = Downloader(
            client=_client,
            bucket=self.config.bucket_name,
            num_worker_threads=self.num_worker_threads,
        )

        _remote_files = list(self.merged_indexer.values()) + list(
            self.merged_filetype)
        for _remote_file in _remote_files:
            if not check_nas(self.config.endpoint):
                _local_file = tempfile.mktemp(_remote_file)
                if _remote_file not in self._object_file_mapper:
                    self._object_file_mapper[_remote_file] = _local_file
                    _downloader.set_queue(local_file=_local_file,
                                          remote_file=_remote_file)
            else:
                if _remote_file not in self._object_file_mapper:
                    self._object_file_mapper[_remote_file] = os.path.join(
                        self.config.endpoint, self.config.bucket_name,
                        _remote_file)
        _downloader.join_queue()

        assert len(self._object_file_mapper) == (len(self.merged_indexer) +
                                                 len(self.merged_filetype))

        if not check_nas(self.config.endpoint) and not os.path.exists(
                self.cache_path):
            with open(self.cache_path, "w") as f:
                json.dump(self._object_file_mapper, f)
            logger.info("All {} {} datasets are downloaded done.".format(
                self.config.dataset_name, str(self.config.additional)))
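
`Downloader` is used above only through `set_queue` and `join_queue`. A minimal sketch of such a threaded downloader, assuming the client exposes minio-py's `fget_object(bucket, object_name, file_path)`; the real matorage class may differ in detail:

import queue
import threading


class Downloader:
    # Minimal sketch matching the usage above; error handling and retries
    # are omitted.
    def __init__(self, client, bucket, num_worker_threads=4):
        self._client = client
        self._bucket = bucket
        self._queue = queue.Queue()
        for _ in range(num_worker_threads):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self):
        while True:
            local_file, remote_file = self._queue.get()
            try:
                # fget_object downloads an object to a local file path.
                self._client.fget_object(self._bucket, remote_file,
                                         local_file)
            finally:
                self._queue.task_done()

    def set_queue(self, local_file, remote_file):
        self._queue.put((local_file, remote_file))

    def join_queue(self):
        # Block until every queued download has finished.
        self._queue.join()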
Example #7
    def __init__(
        self,
        config,
        multipart_upload_size=5 * _MB,
        num_worker_threads=4,
        inmemory=False,
        refresh=False,
    ):

        self.config = config

        # Storage configuration
        self.multipart_upload_size = multipart_upload_size
        self.num_worker_threads = num_worker_threads

        # HDF5 configuration
        self.inmemory = inmemory

        self.filter = tb.Filters(**config.compressor)

        self._filelist = []
        self._file, self._earray = self._get_newfile()

        self._disconnected = False

        self._client = (Minio(
            endpoint=self.config.endpoint,
            access_key=self.config.access_key,
            secret_key=self.config.secret_key,
            secure=self.config.secure,
            region=self.config.region,
        ) if not check_nas(self.config.endpoint) else NAS(
            self.config.endpoint))
        self._check_and_create_bucket(refresh=refresh)

        self._uploader = Uploader(
            client=self._client,
            bucket=self.config.bucket_name,
            num_worker_threads=self.num_worker_threads,
            multipart_upload_size=self.multipart_upload_size,
            inmemory=self.inmemory,
        )

        atexit.register(self._exit)
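
`tb.Filters(**config.compressor)` forwards the stored compressor settings to PyTables. `Filters` accepts `complevel` (0-9) and `complib` (e.g. "zlib", "blosc", "lzo"), which is consistent with the `compressor` dictionary read back from `metadata.json` in the `_check_bucket` examples. A sketch with illustrative values:

import tables as tb

# Illustrative compressor settings; matorage persists this dictionary in
# metadata.json and replays it here.
compressor = {"complevel": 9, "complib": "blosc"}
filters = tb.Filters(**compressor)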
Example #8
    def _check_bucket(self):
        """
        Check bucket name is exist. If not exist, create new bucket
        If bucket and metadata sub folder exist, get metadata(attributes, compressor) from there.

        Returns:
            :obj: `None`:
        """
        _client = (Minio(
            self.endpoint,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=self.secure,
        ) if not check_nas(self.endpoint) else NAS(self.endpoint))
        if _client.bucket_exists(self.bucket_name):
            objects = _client.list_objects(self.bucket_name,
                                           prefix="metadata/")
            _metadata = None
            for obj in objects:
                _metadata = _client.get_object(self.bucket_name,
                                               obj.object_name)
                break
            if not _metadata:
                return

            metadata_dict = json.loads(_metadata.read().decode("utf-8"))
            if self.endpoint != metadata_dict["endpoint"]:
                raise ValueError(
                    "The bucket was created with endpoint ({}), which does not"
                    " match the current endpoint ({})."
                    " This may cause a permission denied error.".format(
                        metadata_dict["endpoint"], self.endpoint))

            self.compressor = metadata_dict["compressor"]
            self.attributes = [
                DataAttribute(**item) for item in metadata_dict["attributes"]
            ]
        else:
            logger.warn("{} {} is not exist!".format(self.dataset_name,
                                                     str(self.additional)))
Example #9
    def __init__(self, config, num_worker_threads=4, multipart_upload_size=5 * _MB):
        self.config = config
        self.num_worker_threads = num_worker_threads
        self.multipart_upload_size = multipart_upload_size

        self._client = (
            Minio(
                endpoint=self.config.endpoint,
                access_key=self.config.access_key,
                secret_key=self.config.secret_key,
                secure=self.config.secure,
            )
            if not check_nas(self.config.endpoint)
            else NAS(self.config.endpoint)
        )

        self._uploader = Uploader(
            client=self._client,
            bucket=self.config.bucket_name,
            num_worker_threads=self.num_worker_threads,
            multipart_upload_size=self.multipart_upload_size,
            inmemory=True,
        )