def chunk_audit(self, path):
    with open(path) as f:
        try:
            meta = read_chunk_metadata(f)
        except exc.MissingAttribute as e:
            raise exc.FaultyChunk('Missing extended attribute %s' % e)
        size = int(meta['chunk_size'])
        md5_checksum = meta['chunk_hash'].lower()
        reader = ChunkReader(f, size, md5_checksum)
        with closing(reader):
            for buf in reader:
                buf_len = len(buf)
                self.bytes_running_time = ratelimit(
                    self.bytes_running_time,
                    self.max_bytes_per_second,
                    increment=buf_len)
                self.bytes_processed += buf_len
                self.total_bytes_processed += buf_len

        try:
            content_cid = meta['content_cid']
            content_path = meta['content_path']
            content_attr, data = self.container_client.content_show(
                cid=content_cid, path=content_path)

            # Check chunk data
            chunks_nb = 0
            chunk_data = None
            for c in data:
                if c['url'].endswith(meta['chunk_id']):
                    chunks_nb += 1  # FIXME: won't work with DUP
                    chunk_data = c
            if not chunk_data:
                raise exc.OrphanChunk('Not found in content')

            if chunk_data['size'] != int(meta['chunk_size']):
                raise exc.FaultyChunk('Invalid chunk size found')

            if chunk_data['hash'] != meta['chunk_hash']:
                raise exc.FaultyChunk('Invalid chunk hash found')

            if chunk_data['pos'] != meta['chunk_pos']:
                raise exc.FaultyChunk('Invalid chunk position found')

            # Check content data
            if content_attr['length'] != meta['content_size']:
                raise exc.FaultyChunk('Invalid content size found')

            if chunks_nb != int(meta['content_chunksnb']):
                self.logger.warn('Invalid number of chunks found')
                # TODO: really count chunks and enable the exception
                # raise exc.FaultyChunk('Invalid number of chunks found')

        except exc.NotFound:
            raise exc.OrphanChunk('Chunk not found in container')
def chunk_file_audit(self, chunk_file, chunk_id):
    try:
        meta, _ = read_chunk_metadata(chunk_file, chunk_id)
    except exc.MissingAttribute as err:
        raise exc.FaultyChunk(err)
    size = int(meta['chunk_size'])
    md5_checksum = meta['chunk_hash'].lower()
    reader = ChunkReader(chunk_file, size, md5_checksum,
                         compression=meta.get("compression", ""))
    with closing(reader):
        for buf in reader:
            buf_len = len(buf)
            self.bytes_running_time = ratelimit(
                self.bytes_running_time,
                self.max_bytes_per_second,
                increment=buf_len)
            self.bytes_processed += buf_len
            self.total_bytes_processed += buf_len

    try:
        container_id = meta['container_id']
        content_id = meta['content_id']
        _obj_meta, data = self.container_client.content_locate(
            cid=container_id, content=content_id, properties=False)

        # Check chunk data
        chunk_data = None
        metachunks = set()
        for c in data:
            if c['url'].endswith(meta['chunk_id']):
                metachunks.add(c['pos'].split('.', 2)[0])
                chunk_data = c
        if not chunk_data:
            raise exc.OrphanChunk('Not found in content')

        metachunk_size = meta.get('metachunk_size')
        if metachunk_size is not None \
                and chunk_data['size'] != int(metachunk_size):
            raise exc.FaultyChunk('Invalid metachunk size found')

        metachunk_hash = meta.get('metachunk_hash')
        if metachunk_hash is not None \
                and chunk_data['hash'] != meta['metachunk_hash']:
            raise exc.FaultyChunk('Invalid metachunk hash found')

        if chunk_data['pos'] != meta['chunk_pos']:
            raise exc.FaultyChunk('Invalid chunk position found')
    except exc.NotFound:
        raise exc.OrphanChunk('Chunk not found in container')
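# Hypothetical driver for chunk_file_audit() (not part of the original
# module): open one chunk file and classify the outcome. The three-character
# hash-directory layout of the volume is an assumption for illustration.
import os

def audit_one_chunk(auditor, volume_path, chunk_id):
    chunk_path = os.path.join(volume_path, chunk_id[:3], chunk_id)
    with open(chunk_path, 'rb') as chunk_file:
        try:
            auditor.chunk_file_audit(chunk_file, chunk_id)
        except exc.FaultyChunk as err:
            auditor.logger.error('Faulty chunk %s: %s', chunk_id, err)
        except exc.OrphanChunk as err:
            auditor.logger.error('Orphan chunk %s: %s', chunk_id, err)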
def extract_headers_meta(headers, check=True):
    """
    Extract chunk metadata from a dictionary of rawx response headers.

    :param headers: a dictionary of headers, as returned by a
        HEAD or GET request to a rawx service.
    :keyword check: if True (the default), raise FaultyChunk if one or
        several mandatory response headers are missing.
    :returns: a dictionary of chunk metadata.
    """
    meta = {}
    missing = list()
    for mkey, hkey in CHUNK_HEADERS.items():
        try:
            if mkey == 'full_path':
                meta[mkey] = headers[hkey]
            else:
                meta[mkey] = unquote(headers[hkey])
        except KeyError:
            if check and mkey not in CHUNK_XATTR_KEYS_OPTIONAL:
                missing.append(exc.MissingAttribute(mkey))
    if check and missing:
        raise exc.FaultyChunk(*missing)
    mtime = meta.get('chunk_mtime')
    if mtime:
        meta['chunk_mtime'] = mktime(parsedate(mtime))
    return meta
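# Minimal usage sketch for extract_headers_meta(). The 'X-oio-chunk-meta-*'
# header names follow the convention used by rawx services, but the exact
# CHUNK_HEADERS mapping is defined elsewhere, so treat these keys as
# assumptions; check=False tolerates the many mandatory headers missing
# from this small sample.
sample_headers = {
    'X-oio-chunk-meta-chunk-size': '1048576',
    'X-oio-chunk-meta-chunk-hash': 'd41d8cd98f00b204e9800998ecf8427e',
}
meta = extract_headers_meta(sample_headers, check=False)
print(meta.get('chunk_size'), meta.get('chunk_hash'))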
def chunk_audit(self, path):
    with open(path) as f:
        try:
            meta = read_chunk_metadata(f)
        except exc.MissingAttribute as e:
            raise exc.FaultyChunk('Missing extended attribute %s' % e)
        size = int(meta['chunk_size'])
        md5_checksum = meta['chunk_hash'].lower()
        reader = ChunkReader(f, size, md5_checksum)
        with closing(reader):
            for buf in reader:
                buf_len = len(buf)
                self.bytes_running_time = ratelimit(
                    self.bytes_running_time,
                    self.max_bytes_per_second,
                    increment=buf_len)
                self.bytes_processed += buf_len
                self.total_bytes_processed += buf_len

        try:
            container_id = meta['container_id']
            content_path = meta['content_path']
            content_attr, data = self.container_client.content_show(
                cid=container_id, path=content_path)

            # Check chunk data
            chunk_data = None
            metachunks = set()
            for c in data:
                if c['url'].endswith(meta['chunk_id']):
                    metachunks.add(c['pos'].split('.', 2)[0])
                    chunk_data = c
            if not chunk_data:
                raise exc.OrphanChunk('Not found in content')

            if chunk_data['size'] != int(meta['chunk_size']):
                raise exc.FaultyChunk('Invalid chunk size found')

            if chunk_data['hash'] != meta['chunk_hash']:
                raise exc.FaultyChunk('Invalid chunk hash found')

            if chunk_data['pos'] != meta['chunk_pos']:
                raise exc.FaultyChunk('Invalid chunk position found')
        except exc.NotFound:
            raise exc.OrphanChunk('Chunk not found in container')
def close(self):
    if self.fp:
        self.md5_read = self.iter_md5.hexdigest()
        if self.bytes_read != self.size:
            raise exc.FaultyChunk('Invalid size for chunk')
        if self.md5_read != self.md5_checksum:
            raise exc.CorruptedChunk('checksum does not match %s != %s' % (
                self.md5_read, self.md5_checksum))
def update_index(self, path):
    with open(path) as f:
        try:
            meta = read_chunk_metadata(f)
        except exc.MissingAttribute as e:
            raise exc.FaultyChunk('Missing extended attribute %s' % e)
        data = {'mtime': int(time.time())}
        self.index_client.chunk_push(self.volume_id,
                                     meta['container_id'],
                                     meta['content_id'],
                                     meta['chunk_id'],
                                     **data)
def close(self):
    """
    Perform checks on what has been read before closing,
    if no error has occurred yet.
    """
    if self.fp and not self.error:
        md5_read = self.iter_md5.hexdigest()
        if self.bytes_read != self.size:
            raise exc.FaultyChunk('Invalid size: expected %d, got %d' % (
                self.size, self.bytes_read))
        if md5_read != self.md5_checksum:
            raise exc.CorruptedChunk('checksum does not match %s != %s' % (
                md5_read, self.md5_checksum))
def __init__(self, fp, size, md5_checksum, compression=None):
    self.fp = fp
    self.decompressor = None
    self.error = None
    # The auditor passes an empty string when the chunk carries no
    # "compression" attribute, so treat '' the same as None and 'off'.
    if compression not in (None, '', 'off'):
        if compression == 'zlib':
            self.decompressor = zlib.decompressobj(0)
        else:
            msg = "Compression method not managed: %s" % compression
            self.error = exc.FaultyChunk(msg)
            raise self.error
    self.size = size
    self.md5_checksum = md5_checksum
    self.bytes_read = 0
    self.iter_md5 = None
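# Usage sketch for ChunkReader (names taken from the snippets above): read
# a chunk file to the end, then rely on close(), wrapped by
# contextlib.closing, to verify the byte count and MD5 checksum.
# `chunk_path`, `expected_size` and `expected_md5` are placeholders for
# values normally taken from the chunk's metadata.
from contextlib import closing

def verify_chunk(chunk_path, expected_size, expected_md5):
    with open(chunk_path, 'rb') as fp:
        reader = ChunkReader(fp, expected_size, expected_md5.lower())
        with closing(reader):
            for _buf in reader:
                # Iterating drains the file and feeds the MD5 accumulator;
                # close() raises FaultyChunk on a size mismatch and
                # CorruptedChunk on a checksum mismatch.
                pass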
def read_chunk_metadata(fd, chunk_id, check_chunk_id=True):
    chunk_id = chunk_id.upper()
    raw_meta = read_user_xattr(fd)
    raw_meta_copy = None
    meta = {}
    meta['links'] = dict()
    attr_vers = 0.0
    raw_chunk_id = container_id = path = version = content_id = None
    missing = list()
    for k, v in raw_meta.iteritems():
        # New chunks have a version
        if k == chunk_xattr_keys['oio_version']:
            attr_vers = float(v)
        # Chunks with version >= 4.2 have a "full_path"
        elif k.startswith(CHUNK_XATTR_CONTENT_FULLPATH_PREFIX):
            parsed_chunk_id = k[len(CHUNK_XATTR_CONTENT_FULLPATH_PREFIX):]
            if parsed_chunk_id == chunk_id:
                raw_chunk_id = parsed_chunk_id
                meta['full_path'] = v
                account, container, path, version, content_id = \
                    decode_fullpath(v)
                container_id = cid_from_name(account, container)
            else:
                meta['links'][parsed_chunk_id] = v
    if raw_chunk_id:
        raw_meta_copy = raw_meta.copy()
        raw_meta[chunk_xattr_keys['chunk_id']] = raw_chunk_id
        raw_meta[chunk_xattr_keys['container_id']] = container_id
        raw_meta[chunk_xattr_keys['content_path']] = path
        raw_meta[chunk_xattr_keys['content_version']] = version
        raw_meta[chunk_xattr_keys['content_id']] = content_id
    if attr_vers >= 4.2 and 'full_path' not in meta:
        # TODO(FVE): in that case, do not warn about other attributes
        # that could be deduced from this one.
        missing.append(
            exc.MissingAttribute(CHUNK_XATTR_CONTENT_FULLPATH_PREFIX +
                                 chunk_id))
    for k, v in chunk_xattr_keys.iteritems():
        if v not in raw_meta:
            if k not in chunk_xattr_keys_optional:
                missing.append(exc.MissingAttribute(v))
        else:
            meta[k] = raw_meta[v]
    if missing:
        raise exc.FaultyChunk(*missing)
    if check_chunk_id and meta['chunk_id'] != chunk_id:
        raise exc.MissingAttribute(chunk_xattr_keys['chunk_id'])
    return meta, raw_meta_copy if raw_meta_copy else raw_meta
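# Usage sketch for read_chunk_metadata(): the chunk id is the uppercase
# hexadecimal name of the chunk file. The path and id below are
# placeholders, not real values.
chunk_id = '0123456789ABCDEF0123456789ABCDEF'
with open('/var/lib/oio/vol1/012/' + chunk_id, 'rb') as fd:
    meta, raw_meta = read_chunk_metadata(fd, chunk_id)
    print(meta['container_id'], meta['content_path'], meta['chunk_pos'])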
def update_index(self, path, chunk_id):
    with open(path) as file_:
        try:
            meta, _ = read_chunk_metadata(file_, chunk_id)
        except exc.MissingAttribute as err:
            raise exc.FaultyChunk(err)
        data = {'mtime': int(time.time())}
        headers = {REQID_HEADER: request_id('blob-indexer-')}
        self.index_client.chunk_push(self.volume_id,
                                     meta['container_id'],
                                     meta['content_id'],
                                     meta['chunk_id'],
                                     headers=headers, **data)
def update_index(self, path):
    with open(path) as f:
        try:
            meta = read_chunk_metadata(f)
        except exc.MissingAttribute as e:
            raise exc.FaultyChunk('Missing extended attribute %s' % e)
        data = {
            'content_version': meta['content_version'],
            'content_nbchunks': meta['content_chunksnb'],
            'content_path': meta['content_path'],
            'content_size': meta['content_size'],
            'chunk_hash': meta['chunk_hash'],
            'chunk_position': meta['chunk_pos'],
            'chunk_size': meta['chunk_size'],
            'mtime': int(time.time())
        }
        self.index_client.chunk_push(self.volume_id,
                                     meta['content_cid'],
                                     meta['content_id'],
                                     meta['chunk_id'],
                                     **data)
def update_index(self, path, chunk_id):
    with open(path) as f:
        try:
            meta = None
            if self.convert_chunks and self.converter:
                _, meta = self.converter.convert_chunk(f, chunk_id)
            if meta is None:
                meta, _ = read_chunk_metadata(f, chunk_id)
        except exc.MissingAttribute as e:
            raise exc.FaultyChunk('Missing extended attribute %s' % e)
        data = {'mtime': int(time.time())}
        headers = {'X-oio-req-id': 'blob-indexer-' + request_id()[:-13]}
        self.index_client.chunk_push(self.volume_id,
                                     meta['container_id'],
                                     meta['content_id'],
                                     meta['chunk_id'],
                                     headers=headers, **data)
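# Hypothetical crawl pass for the indexer (not in the original module):
# walk a volume and push every chunk file through update_index(). A real
# indexer would paginate and rate-limit; this sketch only shows the call,
# and assumes file names are chunk ids.
import os

def index_pass(indexer, volume_path):
    for root, _dirs, files in os.walk(volume_path):
        for name in files:
            try:
                indexer.update_index(os.path.join(root, name), name)
            except exc.FaultyChunk as err:
                indexer.logger.error('Failed to index %s: %s', name, err)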