class ObjectStorageApi(object):
    """
    The Object Storage API.

    High level API that wraps `AccountClient`, `ContainerClient` and
    `DirectoryClient` classes.

    Every method that takes a `kwargs` argument accepts at least
    the following keywords:

        - `headers`: `dict` of extra headers to pass to the proxy
        - `connection_timeout`: `float`
        - `read_timeout`: `float`
        - `write_timeout`: `float`
    """

    TIMEOUT_KEYS = ('connection_timeout', 'read_timeout', 'write_timeout')

    def __init__(self, namespace, logger=None, **kwargs):
        """
        Initialize the object storage API.

        :param namespace: name of the namespace to interact with
        :type namespace: `str`

        :keyword connection_timeout: connection timeout towards rawx services
        :type connection_timeout: `float` seconds
        :keyword read_timeout: timeout for rawx responses and data reads from
            the caller (when uploading)
        :type read_timeout: `float` seconds
        :keyword write_timeout: timeout for rawx write requests
        :type write_timeout: `float` seconds
        :keyword pool_manager: a pooled connection manager that will be used
            for all HTTP based APIs (except rawx)
        :type pool_manager: `urllib3.PoolManager`
        """
        self.namespace = namespace
        conf = {"namespace": self.namespace}
        self.logger = logger or get_logger(conf)
        self.timeouts = {tok: float_value(tov, None)
                         for tok, tov in kwargs.items()
                         if tok in self.__class__.TIMEOUT_KEYS}

        from oio.account.client import AccountClient
        from oio.container.client import ContainerClient
        from oio.directory.client import DirectoryClient
        self.directory = DirectoryClient(conf, logger=self.logger, **kwargs)
        self.container = ContainerClient(conf, logger=self.logger, **kwargs)

        # In AccountClient, "endpoint" is the account service, not the proxy
        acct_kwargs = kwargs.copy()
        acct_kwargs["proxy_endpoint"] = acct_kwargs.pop("endpoint", None)
        self.account = AccountClient(conf, logger=self.logger, **acct_kwargs)

    def _patch_timeouts(self, kwargs):
        """
        Insert timeout settings from this class's constructor into `kwargs`,
        if they are not already there.
        """
        for tok, tov in self.timeouts.items():
            if tok not in kwargs:
                kwargs[tok] = tov

    def account_create(self, account, **kwargs):
        """
        Create an account.

        :param account: name of the account to create
        :type account: `str`
        :returns: `True` if the account has been created
        """
        return self.account.account_create(account, **kwargs)

    @handle_account_not_found
    def account_delete(self, account, **kwargs):
        """
        Delete an account.

        :param account: name of the account to delete
        :type account: `str`
        """
        self.account.account_delete(account, **kwargs)

    @handle_account_not_found
    def account_show(self, account, **kwargs):
        """
        Get information about an account.
        """
        return self.account.account_show(account, **kwargs)

    def account_list(self, **kwargs):
        """
        List known accounts.

        Notice that account creation is asynchronous, and an autocreated
        account may appear in the listing only after several seconds.
""" return self.account.account_list(**kwargs) @handle_account_not_found def account_update(self, account, metadata, to_delete=None, **kwargs): warnings.warn("You'd better use account_set_properties()", DeprecationWarning, stacklevel=2) self.account.account_update(account, metadata, to_delete, **kwargs) @handle_account_not_found def account_set_properties(self, account, properties, **kwargs): self.account.account_update(account, properties, None, **kwargs) @handle_account_not_found def account_del_properties(self, account, properties, **kwargs): self.account.account_update(account, None, properties, **kwargs) def container_create(self, account, container, properties=None, **kwargs): """ Create a container. :param account: account in which to create the container :type account: `str` :param container: name of the container :type container: `str` :param properties: properties to set on the container :type properties: `dict` :returns: True if the container has been created, False if it already exists """ return self.container.container_create(account, container, properties=properties, **kwargs) @handle_container_not_found @ensure_headers @ensure_request_id def container_touch(self, account, container, **kwargs): """ Trigger a notification about the container state. :param account: account from which to delete the container :type account: `str` :param container: name of the container :type container: `str` """ self.container.container_touch(account, container, **kwargs) def container_create_many(self, account, containers, properties=None, **kwargs): """ Create Many containers :param account: account in which to create the containers :type account: `str` :param containers: names of the containers :type containers: `list` :param properties: properties to set on the containers :type properties: `dict` """ return self.container.container_create_many(account, containers, properties=properties, **kwargs) @handle_container_not_found def container_delete(self, account, container, **kwargs): """ Delete a container. :param account: account from which to delete the container :type account: `str` :param container: name of the container :type container: `str` """ self.container.container_delete(account, container, **kwargs) @handle_account_not_found def container_list(self, account, limit=None, marker=None, end_marker=None, prefix=None, delimiter=None, **kwargs): """ Get the list of containers of an account. :param account: account from which to get the container list :type account: `str` :keyword limit: maximum number of results to return :type limit: `int` :keyword marker: name of the container from where to start the listing :type marker: `str` :keyword end_marker: :keyword prefix: :keyword delimiter: :return: the list of containers of an account :rtype: `list` of items (`list`) with 4 fields: name, number of objects, number of bytes, and 1 if the item is a prefix or 0 if the item is actually a container """ resp = self.account.container_list(account, limit=limit, marker=marker, end_marker=end_marker, prefix=prefix, delimiter=delimiter, **kwargs) return resp["listing"] @handle_container_not_found def container_show(self, account, container, **kwargs): """ Get information about a container (user properties). :param account: account in which the container is :type account: `str` :param container: name of the container :type container: `str` :returns: a `dict` with "properties" containing a `dict` of user properties. 
""" return self.container.container_show(account, container, **kwargs) @handle_container_not_found def container_snapshot(self, account, container, dst_account, dst_container, batch=100, **kwargs): """ Create a copy of the container (only the content of the database) :param account: account in which the target is :type account: `str` :param container: name of the target :type container: `str` :param dst_account: account in which the snapshot will be. :type dst_account: `str` :param dst_container: name of the snapshot :type dst_container: `str` """ try: self.container.container_freeze(account, container) self.container.container_snapshot( account, container, dst_account, dst_container) resp = self.object_list(dst_account, dst_container) obj_gen = resp['objects'] target_beans = [] copy_beans = [] for obj in obj_gen: data = self.object_locate( account, container, obj["name"]) chunks = [chunk['url'] for chunk in data[1]] copies = self._generate_copy(chunks) fullpath = self._generate_fullpath( dst_account, dst_container, obj['name'], obj['version']) self._send_copy(chunks, copies, fullpath[0]) t_beans, c_beans = self._prepare_update_meta2( data[1], copies, dst_account, dst_container, obj['content']) target_beans.extend(t_beans) copy_beans.extend(c_beans) if len(target_beans) > batch: self.container.container_raw_update( target_beans, copy_beans, dst_account, dst_container, frozen=True) target_beans = [] copy_beans = [] if target_beans: self.container.container_raw_update( target_beans, copy_beans, dst_account, dst_container, frozen=True) finally: self.container.container_enable(account, container) @handle_container_not_found def container_get_properties(self, account, container, properties=None, **kwargs): """ Get information about a container (user and system properties). :param account: account in which the container is :type account: `str` :param container: name of the container :type container: `str` :param properties: *ignored* :returns: a `dict` with "properties" and "system" entries, containing respectively a `dict` of user properties and a `dict` of system properties. """ return self.container.container_get_properties(account, container, properties=properties, **kwargs) @handle_container_not_found def container_set_properties(self, account, container, properties=None, clear=False, **kwargs): """ Set properties on a container. :param account: name of the account :type account: `str` :param container: name of the container where to set properties :type container: `str` :param properties: a dictionary of properties :type properties: `dict` :param clear: :type clear: `bool` :keyword system: dictionary of system properties to set """ return self.container.container_set_properties( account, container, properties, clear=clear, **kwargs) @handle_container_not_found def container_del_properties(self, account, container, properties, **kwargs): """ Delete properties of a container. 
        :param account: name of the account
        :type account: `str`
        :param container: name of the container to deal with
        :type container: `str`
        :param properties: a list of property keys
        :type properties: `list`
        """
        return self.container.container_del_properties(
            account, container, properties, **kwargs)

    def container_update(self, account, container, metadata, clear=False,
                         **kwargs):
        warnings.warn("You'd better use container_set_properties()",
                      DeprecationWarning)
        if not metadata:
            self.container_del_properties(
                account, container, [], **kwargs)
        else:
            self.container_set_properties(
                account, container, metadata, clear, **kwargs)

    @handle_container_not_found
    @ensure_headers
    @ensure_request_id
    def object_create(self, account, container, file_or_path=None, data=None,
                      etag=None, obj_name=None, mime_type=None,
                      metadata=None, policy=None, key_file=None,
                      append=False, properties=None, **kwargs):
        """
        Create an object or append data to an object in *container* of
        *account* with data taken from either *data* (`str` or `generator`)
        or *file_or_path* (path to a file or file-like object).
        The object will be named after *obj_name* if specified, or after
        the base name of *file_or_path*.

        :param account: name of the account where to create the object
        :type account: `str`
        :param container: name of the container where to create the object
        :type container: `str`
        :param file_or_path: file-like object or path to a file from which
            to read object data
        :type file_or_path: `str` or file-like object
        :param data: object data (if `file_or_path` is not set)
        :type data: `str` or `generator`
        :keyword etag: entity tag of the object
        :type etag: `str`
        :keyword obj_name: name of the object to create. If not set, will use
            the base name of `file_or_path`.
        :keyword mime_type: MIME type of the object
        :type mime_type: `str`
        :keyword properties: a dictionary of properties
        :type properties: `dict`
        :keyword policy: name of the storage policy
        :type policy: `str`
        :keyword key_file:
        :param append: if set, data will be appended to the existing object
            (or the object will be created if it does not exist)
        :type append: `bool`
        :returns: `list` of chunks, size and hash of what has been uploaded
        """
        if (data, file_or_path) == (None, None):
            raise exc.MissingData()
        src = data if data is not None else file_or_path
        if src is file_or_path:
            if isinstance(file_or_path, basestring):
                if not os.path.exists(file_or_path):
                    raise exc.FileNotFound("File '%s' not found."
                                           % file_or_path)
                file_name = os.path.basename(file_or_path)
            else:
                try:
                    file_name = os.path.basename(file_or_path.name)
                except AttributeError:
                    file_name = None
            obj_name = obj_name or file_name
        elif isgenerator(src):
            file_or_path = GeneratorIO(src)
            src = file_or_path
        if not obj_name:
            raise exc.MissingName(
                "No name for the object has been specified")

        sysmeta = {'mime_type': mime_type, 'etag': etag}
        if metadata:
            warnings.warn(
                "You'd better use 'properties' instead of 'metadata'",
                DeprecationWarning, stacklevel=4)
            if not properties:
                properties = metadata
            else:
                properties.update(metadata)

        if src is data:
            return self._object_create(
                account, container, obj_name, BytesIO(data), sysmeta,
                properties=properties, policy=policy,
                key_file=key_file, append=append, **kwargs)
        elif hasattr(file_or_path, "read"):
            return self._object_create(
                account, container, obj_name, src, sysmeta,
                properties=properties, policy=policy, key_file=key_file,
                append=append, **kwargs)
        else:
            with open(file_or_path, "rb") as f:
                return self._object_create(
                    account, container, obj_name, f, sysmeta,
                    properties=properties, policy=policy,
                    key_file=key_file, append=append, **kwargs)

    @ensure_headers
    @ensure_request_id
    def object_touch(self, account, container, obj, version=None, **kwargs):
        """
        Trigger a notification about an object
        (as if it just had been created).

        :param account: name of the account where the object is stored
        :type account: `str`
        :param container: name of the container where the object is stored
        :type container: `str`
        :param obj: name of the object to touch
        """
        self.container.content_touch(account, container, obj,
                                     version=version, **kwargs)

    def object_drain(self, account, container, obj, version=None, **kwargs):
        """
        Remove all the chunks of a content, but keep all the metadata.

        :param account: name of the account where the object is present
        :type account: `str`
        :param container: name of the container where the object is present
        :type container: `str`
        :param obj: name of the object to drain
        """
        self.container.content_drain(account, container, obj,
                                     version=version, **kwargs)

    @handle_object_not_found
    @ensure_headers
    @ensure_request_id
    def object_delete(self, account, container, obj, version=None, **kwargs):
        """
        Delete an object from a container. If versioning is enabled and no
        version is specified, the object will be marked as deleted but not
        actually deleted.

        :param account: name of the account the object belongs to
        :type account: `str`
        :param container: name of the container the object belongs to
        :type container: `str`
        :param obj: name of the object to delete
        :param version: version of the object to delete
        :returns: True on success
        """
        return self.container.content_delete(account, container, obj,
                                             version=version, **kwargs)

    @ensure_headers
    @ensure_request_id
    def object_delete_many(self, account, container, objs, **kwargs):
        return self.container.content_delete_many(
            account, container, objs, **kwargs)

    @handle_object_not_found
    @ensure_headers
    @ensure_request_id
    def object_truncate(self, account, container, obj,
                        version=None, size=None, **kwargs):
        """
        Truncate an object at the specified size. Only shrinking is
        supported. A download may occur if the size is not on a chunk
        boundary.

        :param account: name of the account in which the object is stored
        :param container: name of the container in which the object is stored
        :param obj: name of the object to query
        :param version: version of the object to query
        :param size: new size of the object
        """
        # code copied from object_fetch (should be factorized!)
        meta, raw_chunks = self.object_locate(
            account, container, obj, version=version, **kwargs)
        chunk_method = meta['chunk_method']
        storage_method = STORAGE_METHODS.load(chunk_method)
        chunks = _sort_chunks(raw_chunks, storage_method.ec)
        for pos in sorted(chunks.keys()):
            chunk = chunks[pos][0]
            if (size >= chunk['offset']
                    and size <= chunk['offset'] + chunk['size']):
                break
        else:
            raise exc.OioException(
                "No chunk found at position %d" % size)

        if chunk['offset'] != size:
            # retrieve partial chunk
            ret = self.object_fetch(account, container, obj,
                                    version=version,
                                    ranges=[(chunk['offset'], size - 1)])
            # TODO implement a proper object_update
            pos = int(chunk['pos'].split('.')[0])
            self.object_create(account, container, obj_name=obj,
                               data=ret[1], meta_pos=pos,
                               content_id=meta['id'])

        return self.container.content_truncate(account, container, obj,
                                               version=version, size=size,
                                               **kwargs)

    @handle_container_not_found
    def object_list(self, account, container, limit=None, marker=None,
                    delimiter=None, prefix=None, end_marker=None,
                    properties=False, versions=False, deleted=False,
                    **kwargs):
        """
        Lists objects inside a container.

        :param properties: if True, list object properties along with objects
        :param versions: if True, list all versions of objects
        :param deleted: if True, list also the deleted objects
        :returns: a dict which contains
           * 'objects': the list of objects
           * 'prefixes': common prefixes (only if delimiter and prefix are
             set)
           * 'properties': a dict of container properties
           * 'system': a dict of system metadata
        """
        _, resp_body = self.container.content_list(
            account, container, limit=limit, marker=marker,
            end_marker=end_marker, prefix=prefix, delimiter=delimiter,
            properties=properties, versions=versions, deleted=deleted,
            **kwargs)

        for obj in resp_body['objects']:
            mtype = obj.get('mime-type')
            if mtype is not None:
                obj['mime_type'] = mtype
                del obj['mime-type']
            version = obj.get('ver')
            if version is not None:
                obj['version'] = version
                del obj['ver']

        return resp_body

    @handle_object_not_found
    def object_locate(self, account, container, obj, version=None, **kwargs):
        """
        Get a description of the object along with the list of its chunks.
        :param account: name of the account in which the object is stored
        :param container: name of the container in which the object is stored
        :param obj: name of the object to query
        :param version: version of the object to query
        :returns: a tuple with object metadata `dict` as first element
            and chunk `list` as second element
        """
        obj_meta, chunks = self.container.content_locate(
            account, container, obj, version=version, **kwargs)
        return obj_meta, chunks

    def object_analyze(self, *args, **kwargs):
        """
        :deprecated: use `object_locate`
        """
        warnings.warn("You'd better use object_locate()",
                      DeprecationWarning)
        return self.object_locate(*args, **kwargs)

    @ensure_headers
    @ensure_request_id
    def object_fetch(self, account, container, obj, version=None,
                     ranges=None, key_file=None, **kwargs):
        meta, raw_chunks = self.object_locate(
            account, container, obj, version=version, **kwargs)
        chunk_method = meta['chunk_method']
        storage_method = STORAGE_METHODS.load(chunk_method)
        chunks = _sort_chunks(raw_chunks, storage_method.ec)
        meta['container_id'] = cid_from_name(account, container).upper()
        meta['ns'] = self.namespace
        self._patch_timeouts(kwargs)
        if storage_method.ec:
            stream = fetch_stream_ec(chunks, ranges, storage_method,
                                     **kwargs)
        elif storage_method.backblaze:
            stream = self._fetch_stream_backblaze(meta, chunks, ranges,
                                                  storage_method, key_file,
                                                  **kwargs)
        else:
            stream = fetch_stream(chunks, ranges, storage_method, **kwargs)
        return meta, stream

    @handle_object_not_found
    def object_get_properties(self, account, container, obj, **kwargs):
        return self.container.content_get_properties(account, container,
                                                     obj, **kwargs)

    @handle_object_not_found
    def object_show(self, account, container, obj, version=None, **kwargs):
        """
        Get a description of the content along with its user properties.

        :param account: name of the account in which the object is stored
        :param container: name of the container in which the object is stored
        :param obj: name of the object to query
        :returns: a `dict` describing the object
        .. python::

            {'hash': '6BF60C17CC15EEA108024903B481738F',
             'ctime': '1481031763',
             'deleted': 'False',
             'properties': {
                 u'projet': u'OpenIO-SDS'},
             'length': '43518',
             'hash_method': 'md5',
             'chunk_method': 'ec/algo=liberasurecode_rs_vand,k=6,m=3',
             'version': '1481031762951972',
             'policy': 'EC',
             'id': '20BF2194FD420500CD4729AE0B5CBC07',
             'mime_type': 'application/octet-stream',
             'name': 'Makefile'}
        """
        return self.container.content_show(account, container, obj,
                                           version=version, **kwargs)

    def object_update(self, account, container, obj, metadata, version=None,
                      clear=False, **kwargs):
        warnings.warn("You'd better use object_set_properties()",
                      DeprecationWarning, stacklevel=2)
        if clear:
            self.object_del_properties(
                account, container, obj, [], version=version, **kwargs)
        if metadata:
            self.object_set_properties(
                account, container, obj, metadata, version=version,
                **kwargs)

    @handle_object_not_found
    def object_set_properties(self, account, container, obj, properties,
                              version=None, **kwargs):
        return self.container.content_set_properties(
            account, container, obj, properties={'properties': properties},
            version=version, **kwargs)

    @handle_object_not_found
    def object_del_properties(self, account, container, obj, properties,
                              version=None, **kwargs):
        return self.container.content_del_properties(
            account, container, obj, properties=properties,
            version=version, **kwargs)

    def _content_preparer(self, account, container, obj_name,
                          policy=None, **kwargs):
        # TODO: optimize by asking more than one metachunk at a time
        obj_meta, first_body = self.container.content_prepare(
            account, container, obj_name, size=1, stgpol=policy,
            autocreate=True, **kwargs)
        storage_method = STORAGE_METHODS.load(obj_meta['chunk_method'])

        def _fix_mc_pos(chunks, mc_pos):
            for chunk in chunks:
                raw_pos = chunk["pos"].split(".")
                if storage_method.ec:
                    chunk['num'] = int(raw_pos[1])
                    chunk["pos"] = "%d.%d" % (mc_pos, chunk['num'])
                else:
                    chunk["pos"] = str(mc_pos)

        def _metachunk_preparer():
            mc_pos = kwargs.get('meta_pos', 0)
            _fix_mc_pos(first_body, mc_pos)
            yield first_body
            while True:
                mc_pos += 1
                _, next_body = self.container.content_prepare(
                    account, container, obj_name, 1, stgpol=policy,
                    autocreate=True, **kwargs)
                _fix_mc_pos(next_body, mc_pos)
                yield next_body

        return obj_meta, _metachunk_preparer

    def _generate_fullpath(self, account, container_name, path, version):
        return ['{0}/{1}/{2}/{3}'.format(quote_plus(account),
                                         quote_plus(container_name),
                                         quote_plus(path),
                                         version)]

    def _object_create(self, account, container, obj_name, source,
                       sysmeta, properties=None, policy=None,
                       key_file=None, **kwargs):
        self._patch_timeouts(kwargs)
        obj_meta, chunk_prep = self._content_preparer(
            account, container, obj_name,
            policy=policy, **kwargs)
        obj_meta.update(sysmeta)
        obj_meta['content_path'] = obj_name
        obj_meta['container_id'] = cid_from_name(account, container).upper()
        obj_meta['ns'] = self.namespace
        obj_meta['full_path'] = self._generate_fullpath(
            account, container, obj_name, obj_meta['version'])
        obj_meta['oio_version'] = (obj_meta.get('oio_version') or
                                   OIO_VERSION)

        # XXX content_id is necessary to update an existing object
        kwargs['content_id'] = kwargs.get('content_id', obj_meta['id'])

        storage_method = STORAGE_METHODS.load(obj_meta['chunk_method'])
        if storage_method.ec:
            handler = ECWriteHandler(
                source, obj_meta, chunk_prep, storage_method, **kwargs)
        elif storage_method.backblaze:
            backblaze_info = self._b2_credentials(storage_method, key_file)
            handler = BackblazeWriteHandler(
                source, obj_meta, chunk_prep, storage_method,
                backblaze_info, **kwargs)
        else:
            handler = ReplicatedWriteHandler(
                source, obj_meta, chunk_prep, storage_method, **kwargs)

        final_chunks, bytes_transferred, content_checksum = handler.stream()

        etag = obj_meta.get('etag')
        if etag and etag.lower() != content_checksum.lower():
            raise exc.EtagMismatch(
                "given etag %s != computed %s" % (etag, content_checksum))
        obj_meta['etag'] = content_checksum

        data = {'chunks': final_chunks, 'properties': properties or {}}
        # FIXME: we may just pass **obj_meta
        self.container.content_create(
            account, container, obj_name, size=bytes_transferred,
            checksum=content_checksum, data=data,
            stgpol=obj_meta['policy'],
            version=obj_meta['version'], mime_type=obj_meta['mime_type'],
            chunk_method=obj_meta['chunk_method'],
            **kwargs)
        return final_chunks, bytes_transferred, content_checksum

    def _b2_credentials(self, storage_method, key_file):
        key_file = key_file or '/etc/oio/sds/b2-appkey.conf'
        try:
            return BackblazeUtils.get_credentials(storage_method, key_file)
        except BackblazeUtilsException as err:
            raise exc.ConfigurationException(str(err))

    def _fetch_stream_backblaze(self, meta, chunks, ranges,
                                storage_method, key_file, **kwargs):
        backblaze_info = self._b2_credentials(storage_method, key_file)
        total_bytes = 0
        current_offset = 0
        size = None
        offset = 0
        for pos in range(len(chunks)):
            if ranges:
                offset = ranges[pos][0]
                size = ranges[pos][1]
            if size is None:
                size = int(meta["length"])
            chunk_size = int(chunks[pos][0]["size"])
            if total_bytes >= size:
                break
            if current_offset + chunk_size > offset:
                if current_offset < offset:
                    _offset = offset - current_offset
                else:
                    _offset = 0
                if chunk_size + total_bytes > size:
                    _size = size - total_bytes
                else:
                    _size = chunk_size
                handler = BackblazeChunkDownloadHandler(
                    meta, chunks[pos], _offset, _size,
                    backblaze_info=backblaze_info)
                stream = handler.get_stream()
                if not stream:
                    raise exc.OioException("Error while downloading")
                total_bytes += len(stream)
                yield stream
            current_offset += chunk_size

    @handle_container_not_found
    def container_refresh(self, account, container, attempts=3, **kwargs):
        for i in range(attempts):
            try:
                self.account.container_reset(account, container,
                                             time.time())
            except exc.Conflict:
                if i >= attempts - 1:
                    raise
        try:
            self.container.container_touch(account, container)
        except exc.ClientException as e:
            if e.status != 406 and e.status != 431:
                raise
            # CODE_USER_NOTFOUND or CODE_CONTAINER_NOTFOUND
            metadata = dict()
            metadata["dtime"] = time.time()
            self.account.container_update(account, container, metadata)

    @handle_account_not_found
    def account_refresh(self, account, **kwargs):
        self.account.account_refresh(account)

        containers = self.container_list(account)
        for container in containers:
            try:
                self.container_refresh(account, container[0])
            except exc.NoSuchContainer:
                # container removed in the meantime
                pass

        while containers:
            marker = containers[-1][0]
            containers = self.container_list(account, marker=marker)
            if containers:
                for container in containers:
                    try:
                        self.container_refresh(account, container[0])
                    except exc.NoSuchContainer:
                        # container removed in the meantime
                        pass

    def all_accounts_refresh(self, **kwargs):
        accounts = self.account_list()
        for account in accounts:
            try:
                self.account_refresh(account)
            except exc.NoSuchAccount:
                # account removed in the meantime
                pass

    @handle_account_not_found
    def account_flush(self, account):
        self.account.account_flush(account)

    def _random_buffer(self, dictionary, n):
        return ''.join(random.choice(dictionary) for _ in range(n))

    def _generate_copy(self, chunks, random_hex=60):
        # random_hex is the number of hexadecimal characters to generate
        # for the copy path
        copies = []
        for c in chunks:
            tmp = ''.join([c[:-random_hex],
                           self._random_buffer('0123456789ABCDEF',
                                               random_hex)])
            copies.append(tmp)
        return copies

    def _send_copy(self, targets, copies, fullpath):
        headers = {"x-oio-chunk-meta-full-path": fullpath}
        if not hasattr(self, "blob_client"):
            from oio.blob.client import BlobClient
            self.blob_client = BlobClient()
        for t, c in zip(targets, copies):
            self.blob_client.chunk_link(t, c, headers=headers).status

    def _prepare_update_meta2(self, targets, copies, account, container,
                              content):
        targets_beans = []
        copies_beans = []
        for t, c in zip(targets, copies):
            targets_beans.append(self._meta2bean(t['url'], t, content))
            copies_beans.append(self._meta2bean(c, t, content))
        return targets_beans, copies_beans

    def _meta2bean(self, url, meta, content):
        return {"type": "chunk",
                "id": url,
                "hash": meta['hash'],
                "size": int(meta["size"]),
                "pos": meta["pos"],
                "content": content}
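
# A minimal usage sketch of ObjectStorageApi (not part of the original
# module), assuming a reachable namespace named "OPENIO"; the account,
# container and object names below are illustrative only.
def _example_object_storage_api():
    api = ObjectStorageApi("OPENIO", connection_timeout=2.0,
                           read_timeout=30.0)
    api.container_create("my_account", "my_container",
                         properties={"color": "blue"})
    # data= accepts a str or a generator; file_or_path accepts a path
    # or a file-like object instead
    chunks, size, checksum = api.object_create(
        "my_account", "my_container", obj_name="hello.txt", data="Hello!")
    # object_fetch returns the metadata and a stream of data chunks
    meta, stream = api.object_fetch("my_account", "my_container",
                                    "hello.txt")
    body = "".join(stream)
    return meta, body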
class BlobMoverWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.last_usage_check = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.usage_target = int_value(
            conf.get('usage_target'), 0)
        self.usage_check_interval = int_value(
            conf.get('usage_check_interval'), 3600)
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.blob_client = BlobClient()
        self.container_client = ContainerClient(conf)

    def mover_pass(self):
        self.namespace, self.address = check_volume(self.volume)

        start_time = report_time = time.time()

        total_errors = 0
        mover_time = 0

        paths = paths_gen(self.volume)

        for path in paths:
            loop_time = time.time()

            now = time.time()
            if now - self.last_usage_check >= self.usage_check_interval:
                used, total = statfs(self.volume)
                usage = (float(used) / total) * 100
                if usage <= self.usage_target:
                    self.logger.info(
                        'current usage %.2f%%: target reached (%.2f%%)',
                        usage, self.usage_target)
                    self.last_usage_check = now
                    break

            self.safe_chunk_move(path)
            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)
            self.total_chunks_processed += 1
            now = time.time()
            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(mover_time).2f '
                    '%(mover_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'mover_time': mover_time,
                        'mover_rate': mover_time / (now - start_time)
                    })
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            mover_time += (now - loop_time)

        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(mover_time).2f '
            '%(mover_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'mover_time': mover_time,
                'mover_rate': mover_time / elapsed
            })

    def safe_chunk_move(self, path):
        try:
            self.chunk_move(path)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while moving chunk %s: %s', path, e)
        self.passes += 1

    def load_chunk_metadata(self, path):
        with open(path) as f:
            return read_chunk_metadata(f)

    def chunk_move(self, path):
        meta = self.load_chunk_metadata(path)
        content_cid = meta['content_cid']
        content_path = meta['content_path']

        chunk_url = 'http://%s/%s' % (self.address, meta['chunk_id'])

        try:
            _, data = self.container_client.content_show(
                cid=content_cid, path=content_path)
        except exc.NotFound:
            raise exc.OrphanChunk('Content not found')
        current_chunk = None
        notin = []
        for c in data:
            if c['pos'] == meta['chunk_pos']:
                notin.append(c)
        # iterate over a copy: notin is mutated while being scanned
        for c in list(notin):
            if c['url'] == chunk_url:
                current_chunk = c
                notin.remove(c)
        if not current_chunk:
            raise exc.OrphanChunk('Chunk not found in content')
        spare_data = {'notin': notin, 'broken': [current_chunk], 'size': 0}
        spare_resp = self.container_client.content_spare(
            cid=content_cid, path=content_path, data=spare_data)

        new_chunk = spare_resp['chunks'][0]
        self.blob_client.chunk_copy(current_chunk['url'], new_chunk['id'])

        old = [{'type': 'chunk',
                'id': current_chunk['url'],
                'hash': meta['chunk_hash'],
                'size': int(meta['chunk_size'])}]
        new = [{'type': 'chunk',
                'id': new_chunk['id'],
                'hash': meta['chunk_hash'],
                'size': int(meta['chunk_size'])}]
        update_data = {'old': old, 'new': new}

        self.container_client.container_raw_update(
            cid=content_cid, data=update_data)

        self.blob_client.chunk_delete(current_chunk['url'])

        self.logger.info('moved chunk %s to %s',
                         current_chunk['url'], new_chunk['id'])
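
# Hedged sketch (not in the original module): how a mover daemon might
# drive BlobMoverWorker. The volume path is made up; 'namespace' and the
# other keys are the ones actually read in __init__ above.
def _example_mover_pass():
    conf = {'namespace': 'OPENIO',
            'usage_target': 50,        # stop once volume usage <= 50%
            'chunks_per_second': 30}
    # passing logger=None is fine: __init__ falls back to get_logger(conf)
    worker = BlobMoverWorker(conf, None, '/var/lib/oio/rawx-1')
    worker.mover_pass()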
class Content(object):
    def __init__(self, conf, container_id, metadata, chunks,
                 storage_method):
        self.conf = conf
        self.container_id = container_id
        self.metadata = metadata
        self.chunks = ChunksHelper(chunks)
        self.storage_method = storage_method
        self.logger = get_logger(self.conf)
        self.cs_client = ConscienceClient(conf)
        self.blob_client = BlobClient()
        self.container_client = ContainerClient(self.conf)
        self.content_id = self.metadata["id"]
        self.stgpol = self.metadata["policy"]
        self.path = self.metadata["name"]
        self.length = int(self.metadata["length"])
        self.version = self.metadata["version"]
        self.checksum = self.metadata["hash"]
        self.mime_type = self.metadata["mime_type"]
        self.chunk_method = self.metadata["chunk_method"]

    def _get_spare_chunk(self, chunks_notin, chunks_broken):
        spare_data = {
            "notin": ChunksHelper(chunks_notin, False).raw(),
            "broken": ChunksHelper(chunks_broken, False).raw()
        }
        try:
            spare_resp = self.container_client.content_spare(
                cid=self.container_id, content=self.content_id,
                data=spare_data, stgpol=self.stgpol)
        except ClientException as e:
            raise exc.SpareChunkException("No spare chunk (%s)" % e.message)

        url_list = []
        for c in spare_resp["chunks"]:
            url_list.append(c["id"])

        return url_list

    def _update_spare_chunk(self, current_chunk, new_url):
        old = [{'type': 'chunk',
                'id': current_chunk.url,
                'hash': current_chunk.checksum,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        new = [{'type': 'chunk',
                'id': new_url,
                'hash': current_chunk.checksum,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        update_data = {'old': old, 'new': new}
        self.container_client.container_raw_update(
            cid=self.container_id, data=update_data)

    def _create_object(self):
        self.container_client.content_create(
            cid=self.container_id, path=self.path,
            content_id=self.content_id, stgpol=self.stgpol,
            size=self.length, checksum=self.checksum,
            version=self.version, chunk_method=self.chunk_method,
            mime_type=self.mime_type, data=self.chunks.raw())

    def rebuild_chunk(self, chunk_id, allow_same_rawx=False):
        raise NotImplementedError()

    def create(self, stream):
        raise NotImplementedError()

    def fetch(self):
        raise NotImplementedError()

    def delete(self):
        self.container_client.content_delete(cid=self.container_id,
                                             path=self.path)

    def move_chunk(self, chunk_id):
        current_chunk = self.chunks.filter(id=chunk_id).one()
        if current_chunk is None:
            raise OrphanChunk("Chunk not found in content")

        other_chunks = self.chunks.filter(
            metapos=current_chunk.metapos).exclude(id=chunk_id).all()

        spare_urls = self._get_spare_chunk(other_chunks, [current_chunk])

        self.logger.debug("copy chunk from %s to %s",
                          current_chunk.url, spare_urls[0])
        self.blob_client.chunk_copy(current_chunk.url, spare_urls[0])

        self._update_spare_chunk(current_chunk, spare_urls[0])

        try:
            self.blob_client.chunk_delete(current_chunk.url)
        except Exception:
            self.logger.warn("Failed to delete chunk %s", current_chunk.url)

        current_chunk.url = spare_urls[0]

        return current_chunk.raw()
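
# Hedged illustration (not in the original module): the shape of the
# payload _update_spare_chunk() above sends to container_raw_update().
# All values are made up; the keys mirror the bean dicts built above.
EXAMPLE_RAW_UPDATE_DATA = {
    'old': [{'type': 'chunk',
             'id': 'http://192.168.1.1:6010/0123ABCD',  # current location
             'hash': '6BF60C17CC15EEA108024903B481738F',
             'size': 1048576,
             'pos': '0.2',
             'content': '20BF2194FD420500CD4729AE0B5CBC07'}],
    'new': [{'type': 'chunk',
             'id': 'http://192.168.1.2:6010/4567EF01',  # spare location
             'hash': '6BF60C17CC15EEA108024903B481738F',  # hash unchanged
             'size': 1048576,
             'pos': '0.2',
             'content': '20BF2194FD420500CD4729AE0B5CBC07'}],
}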
class Content(object):
    def __init__(self, conf, container_id, metadata, chunks, stgpol_args):
        self.conf = conf
        self.container_id = container_id
        self.metadata = metadata
        self.chunks = ChunksHelper(chunks)
        self.stgpol_args = stgpol_args
        self.logger = get_logger(self.conf)
        self.cs_client = ConscienceClient(conf)
        self.container_client = ContainerClient(self.conf)
        self.blob_client = BlobClient()
        self.session = requests.Session()
        self.content_id = self.metadata["id"]
        self.stgpol_name = self.metadata["policy"]
        self.path = self.metadata["name"]
        self.length = int(self.metadata["length"])
        self.version = self.metadata["version"]
        self.hash = self.metadata["hash"]
        self.mime_type = self.metadata["mime-type"]
        self.chunk_method = self.metadata["chunk-method"]

    def _meta2_get_spare_chunk(self, chunks_notin, chunks_broken):
        spare_data = {
            "notin": ChunksHelper(chunks_notin, False).raw(),
            "broken": ChunksHelper(chunks_broken, False).raw()
        }
        try:
            spare_resp = self.container_client.content_spare(
                cid=self.container_id, content=self.content_id,
                data=spare_data, stgpol=self.stgpol_name)
        except ClientException as e:
            raise exc.SpareChunkException("No spare chunk (%s)" % e.message)

        url_list = []
        for c in spare_resp["chunks"]:
            url_list.append(c["id"])

        return url_list

    def _meta2_update_spare_chunk(self, current_chunk, new_url):
        old = [{'type': 'chunk',
                'id': current_chunk.url,
                'hash': current_chunk.hash,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        new = [{'type': 'chunk',
                'id': new_url,
                'hash': current_chunk.hash,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        update_data = {'old': old, 'new': new}
        self.container_client.container_raw_update(
            cid=self.container_id, data=update_data)

    def _meta2_create_object(self):
        self.container_client.content_create(
            cid=self.container_id, path=self.path,
            content_id=self.content_id, stgpol=self.stgpol_name,
            size=self.length, checksum=self.hash,
            version=self.version, chunk_method=self.chunk_method,
            mime_type=self.mime_type, data=self.chunks.raw())

    def rebuild_chunk(self, chunk_id):
        raise NotImplementedError()

    def upload(self, stream):
        try:
            self._upload(stream)
        except Exception as e:
            for chunk in self.chunks:
                try:
                    self.blob_client.chunk_delete(chunk.url)
                except:
                    pass
            raise e

    def _upload(self, stream):
        raise NotImplementedError()

    def download(self):
        raise NotImplementedError()
class Content(object):
    def __init__(self, conf, container_id, metadata, chunks, stgpol_args):
        self.conf = conf
        self.container_id = container_id
        self.metadata = metadata
        self.chunks = ChunksHelper(chunks)
        self.stgpol_args = stgpol_args
        self.logger = get_logger(self.conf)
        self.cs_client = ConscienceClient(conf)
        self.container_client = ContainerClient(self.conf)
        self.blob_client = BlobClient()
        self.session = requests.Session()
        self.content_id = self.metadata["id"]
        self.stgpol_name = self.metadata["policy"]
        self.path = self.metadata["name"]
        self.length = int(self.metadata["length"])
        self.version = self.metadata["version"]
        self.hash = self.metadata["hash"]
        self.mime_type = self.metadata["mime-type"]
        self.chunk_method = self.metadata["chunk-method"]

    def _meta2_get_spare_chunk(self, chunks_notin, chunks_broken):
        spare_data = {
            "notin": ChunksHelper(chunks_notin, False).raw(),
            "broken": ChunksHelper(chunks_broken, False).raw()
        }
        try:
            spare_resp = self.container_client.content_spare(
                cid=self.container_id, content=self.content_id,
                data=spare_data, stgpol=self.stgpol_name)
        except ClientException as e:
            raise exc.SpareChunkException("No spare chunk (%s)" % e.message)

        url_list = []
        for c in spare_resp["chunks"]:
            url_list.append(c["id"])

        return url_list

    def _meta2_update_spare_chunk(self, current_chunk, new_url):
        old = [{'type': 'chunk',
                'id': current_chunk.url,
                'hash': current_chunk.hash,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        new = [{'type': 'chunk',
                'id': new_url,
                'hash': current_chunk.hash,
                'size': current_chunk.size,
                'pos': current_chunk.pos,
                'content': self.content_id}]
        update_data = {'old': old, 'new': new}
        self.container_client.container_raw_update(
            cid=self.container_id, data=update_data)

    def _meta2_create_object(self):
        self.container_client.content_create(
            cid=self.container_id, path=self.path,
            content_id=self.content_id, stgpol=self.stgpol_name,
            size=self.length, checksum=self.hash,
            version=self.version, chunk_method=self.chunk_method,
            mime_type=self.mime_type, data=self.chunks.raw())

    def rebuild_chunk(self, chunk_id):
        raise NotImplementedError()

    def upload(self, stream):
        try:
            self._upload(stream)
        except:
            # Keep the stack trace
            exc_info = sys.exc_info()
            for chunk in self.chunks:
                try:
                    self.blob_client.chunk_delete(chunk.url)
                except:
                    self.logger.warn("Failed to delete %s", chunk.url)
            # Raise with the original stack trace
            raise exc_info[0], exc_info[1], exc_info[2]

    def _upload(self, stream):
        raise NotImplementedError()

    def download(self):
        raise NotImplementedError()

    def delete(self):
        self.container_client.content_delete(cid=self.container_id,
                                             path=self.path)

    def move_chunk(self, chunk_id):
        current_chunk = self.chunks.filter(id=chunk_id).one()
        if current_chunk is None:
            raise OrphanChunk("Chunk not found in content")

        other_chunks = self.chunks.filter(
            metapos=current_chunk.metapos).exclude(id=chunk_id).all()

        spare_urls = self._meta2_get_spare_chunk(other_chunks,
                                                 [current_chunk])

        self.logger.debug("copy chunk from %s to %s",
                          current_chunk.url, spare_urls[0])
        self.blob_client.chunk_copy(current_chunk.url, spare_urls[0])

        self._meta2_update_spare_chunk(current_chunk, spare_urls[0])

        try:
            self.blob_client.chunk_delete(current_chunk.url)
        except Exception:
            self.logger.warn("Failed to delete chunk %s", current_chunk.url)

        current_chunk.url = spare_urls[0]

        return current_chunk.raw()
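
# Hedged sketch (not in the original module): what a minimal concrete
# subclass of the abstract Content above could look like. The single-pass
# chunk iteration and the blob_client.chunk_put call are assumptions made
# for illustration, not the actual plain/EC implementations.
class _ExampleContent(Content):
    def _upload(self, stream):
        # Push the whole stream to every chunk location, then register
        # the chunk beans in meta2; upload() above handles cleanup and
        # re-raising if anything here fails.
        payload = stream.read()
        for chunk in self.chunks:
            self.blob_client.chunk_put(chunk.url, chunk.raw(), payload)
        self._meta2_create_object()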
class BlobRegistratorWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger
        self.volume = volume
        self.namespace = self.conf["namespace"]
        self.volume_ns, self.volume_id = check_volume(self.volume)
        c = dict()
        c['namespace'] = self.namespace
        self.client = ContainerClient(c, logger=self.logger)
        self.report_interval = conf.get(
            "report_period", default_report_interval)

        actions = {
            'update': BlobRegistratorWorker._update_chunk,
            'insert': BlobRegistratorWorker._insert_chunk,
            'check': BlobRegistratorWorker._check_chunk,
        }
        self.action = actions[conf.get("action", "check")]

    def pass_with_lock(self):
        with lock_volume(self.volume):
            return self.pass_without_lock()

    def pass_without_lock(self):
        last_report = now()
        count, success, fail = 0, 0, 0
        if self.namespace != self.volume_ns:
            self.logger.warn("Forcing the NS to [%s] (previously [%s])",
                             self.namespace, self.volume_ns)

        self.logger.info("START %s", self.volume)

        paths = paths_gen(self.volume)
        for path in paths:
            # Action
            try:
                with open(path) as f:
                    meta = read_chunk_metadata(f)
                    self.action(self, path, f, meta)
                    success = success + 1
            except NotFound as e:
                fail = fail + 1
                self.logger.info("ORPHAN %s/%s in %s/%s %s",
                                 meta['content_id'], meta['chunk_id'],
                                 meta['container_id'],
                                 meta['content_path'], str(e))
            except Conflict as e:
                fail = fail + 1
                self.logger.info("ALREADY %s/%s in %s/%s %s",
                                 meta['content_id'], meta['chunk_id'],
                                 meta['container_id'],
                                 meta['content_path'], str(e))
            except Exception as e:
                fail = fail + 1
                self.logger.warn("ERROR %s/%s in %s/%s %s",
                                 meta['content_id'], meta['chunk_id'],
                                 meta['container_id'],
                                 meta['content_path'], str(e))
            count = count + 1

            # TODO(jfs): do the throttling

            # periodical reporting
            t = now()
            if t - last_report > self.report_interval:
                self.logger.info("STEP %d ok %d ko %d",
                                 count, success, fail)

        self.logger.info("FINAL %s %d ok %d ko %d",
                         self.volume, count, success, fail)

    def _check_chunk(self, path, f, meta):
        raise Exception("CHECK not yet implemented")

    def _insert_chunk(self, path, f, meta):
        cid = meta['container_id']
        chunkid = basename(path)
        bean = meta2bean(self.volume_id, meta)
        self.client.container_raw_insert(bean, cid=cid)
        self.logger.info("inserted %s/%s in %s/%s",
                         meta['content_id'], chunkid,
                         cid, meta['content_path'])

    def _update_chunk(self, path, f, meta):
        cid = meta['container_id']
        chunkid = basename(path)
        if str(meta['chunk_pos']).startswith('0'):
            if not self.conf['first']:
                self.logger.info("skip %s/%s from %s/%s",
                                 meta['content_id'], chunkid,
                                 cid, meta['content_path'])
                return
        pre = meta2bean(self.volume_id, meta)
        post = meta2bean(self.volume_id, meta)
        self.client.container_raw_update(pre, post, cid=cid)
        self.logger.info("updated %s/%s in %s/%s",
                         meta['content_id'], chunkid,
                         cid, meta['content_path'])
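
# Hedged sketch (not in the original module): one registrator pass over
# a rawx volume. 'action' selects among the handlers mapped in __init__;
# 'first' is read by _update_chunk. The volume path is made up.
def _example_registrator_pass():
    conf = {'namespace': 'OPENIO', 'action': 'insert', 'first': False}
    logger = get_logger(conf)
    worker = BlobRegistratorWorker(conf, logger, '/var/lib/oio/rawx-1')
    worker.pass_with_lock()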
class BlobRegistrator(object):
    DEFAULT_CHUNKS_PER_SECOND = 30
    DEFAULT_REPORT_INTERVAL = 3600
    BEAN_TYPES = ('alias', 'header', 'chunk')

    def __init__(self, conf, logger, volume, container_ids):
        self.conf = conf
        self.logger = logger
        self.volume = volume
        self.volume_ns, self.volume_id = check_volume(self.volume)
        self.container_ids = container_ids or list()
        self.container_ids = [container_id.upper()
                              for container_id in self.container_ids]
        self.namespace = self.conf['namespace']
        if self.namespace != self.volume_ns:
            raise ValueError(
                'Namespace (%s) mismatch with volume namespace (%s)'
                % (self.namespace, self.volume_ns))

        # action
        self.action_name = self.conf['action'].lower()
        if self.action_name == 'insert':
            self.action = self._insert_bean
        elif self.action_name == 'update':
            self.action = self._update_bean
        elif self.action_name == 'check':
            self.action = self._check_bean
        else:
            raise ValueError('Unknown action (%s)' % self.action_name)

        # speed
        self.chunks_run_time = 0
        self.max_chunks_per_second = int_value(
            self.conf.get('chunks_per_second'),
            self.DEFAULT_CHUNKS_PER_SECOND)

        # counters
        self.chunks_processed = 0
        self.chunk_errors = 0
        self.beans_processed = dict()
        self.bean_successes = dict()
        self.bean_already_exists = dict()
        self.bean_orphans = dict()
        self.bean_errors = dict()
        for bean_type in self.BEAN_TYPES:
            self.beans_processed[bean_type] = 0
            self.bean_successes[bean_type] = 0
            self.bean_already_exists[bean_type] = 0
            self.bean_orphans[bean_type] = 0
            self.bean_errors[bean_type] = 0

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(
            conf.get('report_interval'), self.DEFAULT_REPORT_INTERVAL)

        self.client = ContainerClient({'namespace': self.namespace},
                                      logger=self.logger)
        self.ctime = int(time.time())

    def _beans_from_meta(self, meta):
        return [{
            'type': 'alias',
            'name': meta['content_path'],
            'version': int(meta['content_version']),
            'ctime': self.ctime,
            'mtime': self.ctime,
            'deleted': False,
            'header': meta['content_id']
        }, {
            'type': 'header',
            'id': meta['content_id'],
            'size': 0,
            'ctime': self.ctime,
            'mtime': self.ctime,
            'policy': meta['content_policy'],
            'chunk-method': meta['content_chunkmethod'],
            'mime-type': 'application/octet-stream'
        }, {
            'type': 'chunk',
            'id': 'http://' + self.volume_id + '/' + meta['chunk_id'],
            'hash': meta.get('metachunk_hash') or meta['chunk_hash'],
            'size': int(meta['chunk_size']),
            'ctime': self.ctime,
            'pos': meta['chunk_pos'],
            'content': meta['content_id']
        }]

    def _check_bean(self, meta, bean):
        raise Exception("CHECK not yet implemented")

    def _insert_bean(self, meta, bean):
        self.client.container_raw_insert(bean, cid=meta['container_id'])

    def _update_bean(self, meta, bean):
        self.client.container_raw_update(
            [bean], [bean], cid=meta['container_id'])

    def _get_report(self, status, end_time):
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        report = (
            '%(status)s volume=%(volume)s '
            'start_time=%(start_time)s %(total_time).2fs '
            'last_report=%(last_report)s %(time_since_last_report).2fs '
            'chunks_processed=%(chunks_processed)d '
            '%(chunks_processed_rate).2f/s '
            'chunk_errors=%(chunk_errors)d '
            '%(chunk_errors_rate).2f%% ' % {
                'status': status,
                'volume': self.volume_id,
                'start_time': datetime.fromtimestamp(
                    int(self.start_time)).isoformat(),
                'total_time': total_time,
                'last_report': datetime.fromtimestamp(
                    int(self.last_report)).isoformat(),
                'time_since_last_report': time_since_last_report,
                'chunks_processed': self.chunks_processed,
                'chunks_processed_rate':
                    self.chunks_processed / total_time,
                'chunk_errors': self.chunk_errors,
                'chunk_errors_rate':
                    100 * self.chunk_errors /
                    float(self.chunks_processed or 1),
            })
        for bean_type in self.BEAN_TYPES:
            report = (
                '%(report)s '
                'bean_%(bean_type)s_processed=%(beans_processed)d '
                '%(beans_processed_rate).2f/s '
                'bean_%(bean_type)s_successes=%(bean_successes)d '
                '%(bean_successes_rate).2f%% '
                'bean_%(bean_type)s_already_exists=%(bean_already_exists)d '
                '%(bean_already_exists_rate).2f%% '
                'bean_%(bean_type)s_orphans=%(bean_orphans)d '
                '%(bean_orphans_rate).2f%% '
                'bean_%(bean_type)s_errors=%(bean_errors)d '
                '%(bean_errors_rate).2f%%' % {
                    'report': report,
                    'bean_type': bean_type,
                    'beans_processed': self.beans_processed[bean_type],
                    'beans_processed_rate':
                        self.beans_processed[bean_type] / total_time,
                    'bean_successes': self.bean_successes[bean_type],
                    'bean_successes_rate':
                        100 * self.bean_successes[bean_type] /
                        float(self.beans_processed[bean_type] or 1),
                    'bean_already_exists':
                        self.bean_already_exists[bean_type],
                    'bean_already_exists_rate':
                        100 * self.bean_already_exists[bean_type] /
                        float(self.beans_processed[bean_type] or 1),
                    'bean_orphans': self.bean_orphans[bean_type],
                    'bean_orphans_rate':
                        100 * self.bean_orphans[bean_type] /
                        float(self.beans_processed[bean_type] or 1),
                    'bean_errors': self.bean_errors[bean_type],
                    'bean_errors_rate':
                        100 * self.bean_errors[bean_type] /
                        float(self.beans_processed[bean_type] or 1)
                })
        return report

    def log_report(self, status, force=False):
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            self.logger.info(self._get_report(status, end_time))
            self.last_report = end_time

    def pass_volume(self):
        self.start_time = self.last_report = time.time()
        self.log_report('START', force=True)

        paths = paths_gen(self.volume)
        for path in paths:
            try:
                self.pass_chunk_file(path)
                self.chunks_processed += 1
            except Exception as exc:
                self.logger.error(
                    'Failed to pass chunk file (chunk_file=%s): %s',
                    path, exc)
                self.chunk_errors += 1

            self.log_report('RUN')
            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)

        self.log_report('DONE', force=True)
        return self.chunk_errors == 0 \
            and all(errors == 0 for errors in self.bean_errors.values())

    def pass_chunk_file(self, path):
        chunk_id = path.rsplit('/', 1)[-1]
        if len(chunk_id) != STRLEN_CHUNKID:
            if chunk_id.endswith(CHUNK_SUFFIX_PENDING):
                self.logger.info('Skipping pending chunk %s', path)
            else:
                self.logger.warn('WARN Not a chunk %s', path)
            return
        for char in chunk_id:
            if char not in hexdigits:
                self.logger.warn('WARN Not a chunk %s', path)
                return

        with open(path) as f:
            meta, _ = read_chunk_metadata(f, chunk_id)
            if self.container_ids \
                    and meta['container_id'] in self.container_ids:
                self.logger.debug(
                    'Skipping chunk file (container_id=%s content_path=%s '
                    'content_version=%s content_id=%s chunk_id=%s '
                    'chunk_pos=%s)',
                    meta['container_id'], meta['content_path'],
                    meta['content_version'], meta['content_id'],
                    meta['chunk_id'], meta['chunk_pos'])
                return
            beans = self._beans_from_meta(meta)
            for bean in beans:
                try:
                    self.pass_bean(meta, bean)
                except Exception as exc:
                    self.logger.error(
                        'Failed to pass chunk file (container_id=%s '
                        'content_path=%s content_version=%s content_id=%s '
                        'chunk_id=%s chunk_pos=%s): %s',
                        meta['container_id'], meta['content_path'],
                        meta['content_version'], meta['content_id'],
                        meta['chunk_id'], meta['chunk_pos'], exc)
                    self.bean_errors[bean['type']] = \
                        self.bean_errors[bean['type']] + 1

    def pass_bean(self, meta, bean):
        try:
            self.beans_processed[bean['type']] = \
                self.beans_processed[bean['type']] + 1
            self.action(meta, bean)
            self.logger.debug(
                'Passed %s (container_id=%s content_path=%s '
                'content_version=%s content_id=%s chunk_id=%s '
                'chunk_pos=%s)',
                bean['type'], meta['container_id'], meta['content_path'],
                meta['content_version'], meta['content_id'],
                meta['chunk_id'], meta['chunk_pos'])
            self.bean_successes[bean['type']] = \
                self.bean_successes[bean['type']] + 1
        except Conflict as exc:
            self.logger.info(
                'Already exists %s (container_id=%s content_path=%s '
                'content_version=%s content_id=%s chunk_id=%s '
                'chunk_pos=%s): %s',
                bean['type'], meta['container_id'], meta['content_path'],
                meta['content_version'], meta['content_id'],
                meta['chunk_id'], meta['chunk_pos'], exc)
            self.bean_already_exists[bean['type']] = \
                self.bean_already_exists[bean['type']] + 1
        except NotFound as exc:
            self.logger.info(
                'Orphan %s (container_id=%s content_path=%s '
                'content_version=%s content_id=%s chunk_id=%s '
                'chunk_pos=%s): %s',
                bean['type'], meta['container_id'], meta['content_path'],
                meta['content_version'], meta['content_id'],
                meta['chunk_id'], meta['chunk_pos'], exc)
            self.bean_orphans[bean['type']] = \
                self.bean_orphans[bean['type']] + 1
        except Exception as exc:
            self.logger.error(
                'Failed to pass %s (container_id=%s content_path=%s '
                'content_version=%s content_id=%s chunk_id=%s '
                'chunk_pos=%s): %s',
                bean['type'], meta['container_id'], meta['content_path'],
                meta['content_version'], meta['content_id'],
                meta['chunk_id'], meta['chunk_pos'], exc)
            self.bean_errors[bean['type']] = \
                self.bean_errors[bean['type']] + 1
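
# Hedged illustration (not in the original module): for a single chunk
# file, _beans_from_meta() above yields one alias, one header and one
# chunk bean, e.g. (all values made up):
#
#   [{'type': 'alias', 'name': 'Makefile', 'version': 1481031762951972,
#     'ctime': 1481031763, 'mtime': 1481031763, 'deleted': False,
#     'header': '20BF2194FD420500CD4729AE0B5CBC07'},
#    {'type': 'header', 'id': '20BF2194FD420500CD4729AE0B5CBC07',
#     'size': 0, 'ctime': 1481031763, 'mtime': 1481031763, 'policy': 'EC',
#     'chunk-method': 'ec/algo=liberasurecode_rs_vand,k=6,m=3',
#     'mime-type': 'application/octet-stream'},
#    {'type': 'chunk', 'id': 'http://rawx-1/0123ABCD',
#     'hash': '6BF60C17CC15EEA108024903B481738F', 'size': 1048576,
#     'ctime': 1481031763, 'pos': '0.2',
#     'content': '20BF2194FD420500CD4729AE0B5CBC07'}]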

class BlobRebuilderWorker(object):

    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(
            conf.get('dry_run', False))
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.rdir_fetch_limit = int_value(
            conf.get('rdir_fetch_limit'), 100)
        self.blob_client = BlobClient()
        self.container_client = ContainerClient(conf)
        self.rdir_client = RdirClient(conf)

    def rebuilder_pass_with_lock(self):
        """Take the volume's rdir admin lock, then run a rebuild pass."""
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()
        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id,
                                          chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second
            )
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(rebuilder_time).2f '
                    '%(rebuilder_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate':
                            rebuilder_time / (now - start_time)
                    }
                )
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(rebuilder_time).2f '
            '%(rebuilder_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': rebuilder_time / elapsed
            }
        )

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('[dryrun] Rebuilding '
                         'container %s, content %s, chunk %s',
                         container_id, content_id, chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)
        self.passes += 1

    def _meta2_get_chunks_at_pos(self, container_id, content_id, chunk_id):
        current_chunk_url = 'http://%s/%s' % (self.volume, chunk_id)

        try:
            data = self.container_client.content_show(
                cid=container_id, content=content_id)
        except exc.NotFound:
            raise exc.OrphanChunk('Content not found')

        current_chunk = None
        for c in data:
            if c['url'] == current_chunk_url:
                current_chunk = c
                break
        if not current_chunk:
            raise exc.OrphanChunk('Chunk not found in content')

        duplicate_chunks = []
        for c in data:
            if c['pos'] == current_chunk['pos'] \
                    and c['url'] != current_chunk['url']:
                duplicate_chunks.append(c)
        if len(duplicate_chunks) == 0:
            raise exc.UnrecoverableContent('No copy of missing chunk')

        return current_chunk, duplicate_chunks

    def _meta2_get_spare_chunk(self, container_id, content_id, notin,
                               broken):
        spare_data = {'notin': notin, 'broken': [broken], 'size': 0}
        try:
            spare_resp = self.container_client.content_spare(
                cid=container_id, content=content_id, data=spare_data)
        except ClientException as e:
            raise exc.SpareChunkException('No spare chunk (%s)' % e.message)

        return spare_resp['chunks'][0]

    def _meta2_replace_chunk(self, container_id, content_id,
                             current_chunk, new_chunk):
        old = [{'type': 'chunk',
                'id': current_chunk['url'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        new = [{'type': 'chunk',
                'id': new_chunk['id'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        update_data = {'old': old, 'new': new}
        self.container_client.container_raw_update(
            cid=container_id, data=update_data)

    # TODO: rain (erasure coding) support
    def chunk_rebuild(self, container_id, content_id, chunk_id):
        """
        Find a valid copy of the missing chunk, upload it to a spare
        location, then update the meta2 reference and the rdir index.
        """
        current_chunk, duplicate_chunks = self._meta2_get_chunks_at_pos(
            container_id, content_id, chunk_id)

        spare_chunk = self._meta2_get_spare_chunk(
            container_id, content_id, duplicate_chunks, current_chunk)

        uploaded = False
        for src in duplicate_chunks:
            try:
                self.blob_client.chunk_copy(src['url'], spare_chunk['id'])
                self.logger.debug('Copied chunk from %s to %s',
                                  src['url'], spare_chunk['id'])
                uploaded = True
                break
            except Exception as e:
                self.logger.debug('Failed to copy chunk from %s to %s: %s',
                                  src['url'], spare_chunk['id'], type(e))
        if not uploaded:
            raise exc.UnrecoverableContent('No copy available '
                                           'of missing chunk')

        self._meta2_replace_chunk(container_id, content_id,
                                  current_chunk, spare_chunk)

        self.rdir_client.chunk_push(self.volume, container_id, content_id,
                                    chunk_id, rtime=int(time.time()))

        self.bytes_processed += current_chunk['size']
        self.total_bytes_processed += current_chunk['size']
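
# What follows is a usage sketch, not part of the worker itself: it shows
# how a rebuild pass is typically driven under the rdir admin lock, so two
# rebuilders cannot process the same volume at once. The namespace, volume
# identifier and tuning values are assumptions made for the example.
def _example_rebuild_volume():
    conf = {'namespace': 'OPENIO',       # assumed namespace name
            'dry_run': True,             # only log, never copy chunks
            'rdir_fetch_limit': 100,     # chunks fetched per rdir request
            'report_interval': 300}      # seconds between progress reports
    # The volume is identified the way rdir knows it (assumed value here).
    worker = BlobRebuilderWorker(conf, None, '127.0.0.1:6004')
    worker.rebuilder_pass_with_lock()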