    def test_retain_copy(self, session_mock):
        settings = {
            'aws_bucket': self.aws_bucket,
            'aws_identity': 'identity',
            'aws_secret': 'credential',
            'account': 'account',
            'container': 'container',
            'retain_local': False
        }
        sync = SyncContainer(self.scratch_space, settings)
        sync.provider = mock.Mock()
        swift_client = mock.Mock()
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        sync.handle(row, swift_client)

        _, _, swift_ts = decode_timestamps(row['created_at'])
        swift_ts.offset += 1
        sync.provider.upload_object.assert_called_once_with(
            row['name'], 99, swift_client)
        swift_client.delete_object.assert_called_once_with(
            settings['account'], settings['container'], row['name'],
            headers={'X-Timestamp': Timestamp(swift_ts).internal})
def parse_raw_obj(obj_info):
    """
    Translate a reconciler container listing entry to a dictionary
    containing the parts of the misplaced object queue entry.

    :param obj_info: an entry in a container listing with the
                     required keys: name, content_type, and hash

    :returns: a queue entry dict with the keys: q_policy_index, account,
              container, obj, q_op, q_ts, q_record, and path
    """
    raw_obj_name = obj_info['name'].encode('utf-8')

    policy_index, obj_name = raw_obj_name.split(':', 1)
    q_policy_index = int(policy_index)
    account, container, obj = split_path(obj_name, 3, 3, rest_with_last=True)
    try:
        q_op = {
            'application/x-put': 'PUT',
            'application/x-delete': 'DELETE',
        }[obj_info['content_type']]
    except KeyError:
        raise ValueError('invalid operation type %r' %
                         obj_info.get('content_type', None))

    return {
        'q_policy_index': q_policy_index,
        'account': account,
        'container': container,
        'obj': obj,
        'q_op': q_op,
        'q_ts': decode_timestamps(obj_info['hash'])[0],
        'q_record': last_modified_date_to_timestamp(
            obj_info['last_modified']),
        'path': '/%s/%s/%s' % (account, container, obj)
    }
    def test_retain_copy(self, session_mock):
        settings = {
            'aws_bucket': self.aws_bucket,
            'aws_identity': 'identity',
            'aws_secret': 'credential',
            'account': 'account',
            'container': 'container',
            'retain_local': False
        }
        sync = SyncContainer(self.scratch_space, settings,
                             self.stats_factory)
        sync.provider = mock.Mock()
        sync.provider.upload_object.return_value = SyncS3.UploadStatus.PUT
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {}
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        sync.handle(row, swift_client)

        _, _, swift_ts = decode_timestamps(row['created_at'])
        sync.provider.upload_object.assert_called_once_with(
            row, swift_client, mock.ANY)
        sync.provider.delete_local_object.assert_called_once_with(
            swift_client, row, swift_ts, False)
        sync.stats_reporter.increment.assert_called_once_with(
            'copied_objects', 1)
    def handle(self, row, swift_client):
        if row['deleted']:
            if self.propagate_delete:
                self.provider.delete_object(row['name'], swift_client)
        else:
            # The metadata timestamp should always be the latest timestamp
            _, _, meta_ts = decode_timestamps(row['created_at'])
            if time.time() <= self.copy_after + meta_ts.timestamp:
                raise RetryError('Object is not yet eligible for archive')
            self.provider.upload_object(row['name'],
                                        row['storage_policy_index'],
                                        swift_client)
            if not self.retain_local:
                # NOTE: We rely on the DELETE object X-Timestamp header to
                # mitigate races where the object may be overwritten. We
                # increment the offset to ensure that we never remove new
                # customer data.
                self.logger.debug("Creating a new TS: %f %f" % (
                    meta_ts.offset, meta_ts.timestamp))
                delete_ts = Timestamp(meta_ts, offset=meta_ts.offset + 1)
                try:
                    swift_client.delete_object(
                        self._account, self._container, row['name'],
                        headers={'X-Timestamp': delete_ts.internal})
                except UnexpectedResponse as e:
                    if '409 Conflict' in e.message:
                        pass
    def handle(self, row, swift_client):
        if self.exclude_regex.match(row['name']) is not None:
            self.logger.debug('Skipping excluded object: %s/%s' % (
                self._container, row['name'].decode('utf-8')))
            return

        if row['deleted']:
            if self.propagate_delete:
                self.provider.delete_object(row['name'])
                self.stats_reporter.increment('deleted_objects', 1)
        else:
            # The metadata timestamp should always be the latest timestamp
            _, _, meta_ts = decode_timestamps(row['created_at'])
            if time.time() <= self.copy_after + meta_ts.timestamp:
                raise RetryError('Object is not yet eligible for archive')
            status = self.provider.upload_object(
                row, swift_client,
                lambda bytes_uploaded: self.stats_reporter.increment(
                    'bytes', bytes_uploaded))
            if status == BaseSync.UploadStatus.PUT:
                self.stats_reporter.increment('copied_objects', 1)
            uploaded_statuses = [
                BaseSync.UploadStatus.PUT,
                BaseSync.UploadStatus.POST,
                # NOOP means the object already exists
                BaseSync.UploadStatus.NOOP
            ]
            if not self.retain_local and status in uploaded_statuses:
                self.provider.delete_local_object(
                    swift_client, row, meta_ts, self.retain_local_segments)
    def _transform_record(self, record):
        """
        Decode the created_at timestamp into separate data, content-type and
        meta timestamps and replace the created_at timestamp with the
        metadata timestamp i.e. the last-modified time.
        """
        t_data, t_ctype, t_meta = decode_timestamps(record[1])
        return (record[0], t_meta.internal) + record[2:]
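# A minimal sketch (not part of the original sources) of the 'created_at'
# round trip that the snippets above and below rely on: encode_timestamps()
# packs the data, content-type and metadata timestamps into one string, and
# decode_timestamps() recovers all three. The timestamp values below are
# hypothetical.
from swift.common.utils import Timestamp, encode_timestamps, decode_timestamps

t_data = Timestamp(1500000000.0)   # original PUT
t_ctype = Timestamp(1500000001.0)  # later content-type update
t_meta = Timestamp(1500000002.0)   # latest POST (metadata update)

created_at = encode_timestamps(t_data, t_ctype, t_meta)
ts_data, ts_ctype, ts_meta = decode_timestamps(created_at)
assert ts_meta == t_meta  # the metadata timestamp is always the latest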
    def test_fail_upload_segment(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        base.logger = mock.Mock()
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {
            'x-static-large-object': 'true'
        }

        def _get_object(account, container, key, **kwargs):
            manifest = [{
                'name': '/container_segments/part1',
                'hash': 'deadbeef'
            }, {
                'name': '/container_segments/part2',
                'hash': 'deadbeef2'
            }]
            body = json.dumps(manifest)
            headers = {'etag': hashlib.md5(body).hexdigest()}
            return 200, headers, StringIO.StringIO(body)

        def _delete_object(acc, cont, obj, acceptable_statuses):
            if obj == 'part1':
                raise UnexpectedResponse('foo', None)

        swift_client.get_object.side_effect = _get_object
        swift_client.delete_object.side_effect = _delete_object

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)

        # manifest should not be deleted
        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'], 'container_segments',
                      'part1', acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'], 'container_segments',
                      'part2', acceptable_statuses=(2, 404, 409))
        ])
        base.logger.warning.assert_called_once_with(
            'Failed to delete segment %s/%s/%s: %s',
            'account', 'container_segments', 'part1', 'foo')
        base.logger.error.assert_called_once_with(
            'Failed to delete %s segments of %s/%s', 1, 'container', 'foo')
    def _get_new_rows(self, broker, start_row, nodes, node_id, verifying):
        rows = []
        if verifying:
            cutoff = time.time() - self._verification_slack
        for row in broker.get_items_since(start_row, self.items_chunk):
            hnum = num_from_row(row)
            if not verifying and hnum % nodes != node_id:
                continue
            ts = decode_timestamps(row['created_at'])[2].timestamp
            if verifying and ts > cutoff:
                break
            rows.append(row)
        return rows
def get_reconciler_container_name(obj_timestamp):
    """
    Get the name of a container into which a misplaced object should be
    enqueued. The name is the object's last modified time rounded down to
    the nearest hour.

    :param obj_timestamp: a string representation of the object's
        'created_at' time from its container db row.
    :return: a container name
    """
    # Use last modified time of object to determine reconciler container name
    _junk, _junk, ts_meta = decode_timestamps(obj_timestamp)
    return str(int(ts_meta) // MISPLACED_OBJECTS_CONTAINER_DIVISOR *
               MISPLACED_OBJECTS_CONTAINER_DIVISOR)
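# A minimal sketch (not from the original sources) of the hourly bucketing
# performed by get_reconciler_container_name() above. The created_at value is
# hypothetical, and the divisor is re-declared here only for illustration
# (upstream it is defined in swift.container.reconciler).
from swift.common.utils import decode_timestamps

MISPLACED_OBJECTS_CONTAINER_DIVISOR = 3600  # one reconciler container per hour

created_at = '1500001700.00000'
_, _, ts_meta = decode_timestamps(created_at)
name = str(int(ts_meta) // MISPLACED_OBJECTS_CONTAINER_DIVISOR *
           MISPLACED_OBJECTS_CONTAINER_DIVISOR)
assert name == '1500001200'  # last-modified time rounded down to the hour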
    def test_retain_copy_slo(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {
            'x-static-large-object': 'true'
        }

        def _get_object(account, container, key, **kwargs):
            manifest = [{
                'name': '/container_segments/part1',
                'hash': 'deadbeef'
            }, {
                'name': '/container_segments/part2',
                'hash': 'deadbeef2'
            }]
            body = json.dumps(manifest)
            headers = {'etag': hashlib.md5(body).hexdigest()}
            return 200, headers, StringIO.StringIO(body)

        swift_client.get_object.side_effect = _get_object

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)

        swift_ts.offset += 1
        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'], 'container_segments',
                      'part1', acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'], 'container_segments',
                      'part2', acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'], self.settings['container'],
                      row['name'], acceptable_statuses=(2, 404, 409),
                      headers={'X-Timestamp': Timestamp(swift_ts).internal})
        ])
    def test_retain_copy_dlo(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.NonCallableMock()
        swift_client.get_object_metadata.return_value = {
            'x-object-manifest': 'container_segments/segment_'
        }
        swift_client.make_request.side_effect = (
            mock.Mock(body=json.dumps([{
                'name': 'segments_%d' % (i + 1),
                'hash': 'deadbeef'
            } for i in range(2)]), status_int=200),
            mock.Mock(body='[]', status_int=200))

        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)

        swift_ts.offset += 1
        swift_client.delete_object.assert_has_calls([
            mock.call(self.settings['account'], 'container_segments',
                      'segments_1', acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'], 'container_segments',
                      'segments_2', acceptable_statuses=(2, 404, 409)),
            mock.call(self.settings['account'], self.settings['container'],
                      row['name'], acceptable_statuses=(2, 404, 409),
                      headers={'X-Timestamp': Timestamp(swift_ts).internal})
        ])
    def test_retain_copy(self, factory_mock):
        factory_mock.return_value = mock.Mock()
        base = base_sync.BaseSync(self.settings, max_conns=1)
        swift_client = mock.Mock()
        swift_client.get_object_metadata.return_value = {}
        row = {
            'deleted': 0,
            'created_at': str(time.time() - 5),
            'name': 'foo',
            'storage_policy_index': 99
        }
        _, _, swift_ts = decode_timestamps(row['created_at'])
        base.delete_local_object(swift_client, row, swift_ts, False)

        swift_ts.offset += 1
        swift_client.delete_object.assert_called_once_with(
            self.settings['account'], self.settings['container'],
            row['name'], acceptable_statuses=(2, 404, 409),
            headers={'X-Timestamp': Timestamp(swift_ts).internal})
    def container_sync_row(self, row, sync_to, user_key, broker, info,
                           realm, realm_key):
        """
        Sends the update the row indicates to the sync_to container.
        Update can be either delete or put.

        :param row: The updated row in the local database triggering the
                    sync update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    self._update_sync_to_headers(row['name'], sync_to,
                                                 user_key, realm, realm_key,
                                                 'DELETE', headers)
                    delete_object(sync_to, name=row['name'], headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.container_stats['deletes'] += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or
                # POST
                if self._object_in_remote_container(row['name'], sync_to,
                                                    user_key, realm,
                                                    realm_key, ts_meta):
                    return True
                exc = None
                # look up for the newest one; the symlink=get query-string has
                # no effect unless symlinks are enabled in the internal client
                # in which case it ensures that symlink objects retain their
                # symlink property when sync'd.
                headers_out = {
                    'X-Newest': True,
                    'X-Backend-Storage-Policy-Index':
                        str(info['storage_policy_index'])
                }
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4),
                                              params={'symlink': 'get'})
                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'),
                        {'account': info['account'],
                         'container': info['container'],
                         'object': row['name']})
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = normalize_etag(headers['etag'])
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                self._update_sync_to_headers(row['name'], sync_to, user_key,
                                             realm, realm_key, 'PUT', headers)
                put_object(sync_to, name=row['name'], headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(), logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.container_stats['puts'] += 1
                self.container_stats['bytes'] += row['size']
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to})
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to, 'obj_name': row['name']})
            else:
                self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'),
                                      {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'),
                                  {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
    def _upload_object(self, src_container, dst_container, key,
                       internal_client, segment=False, policy_index=None,
                       timestamp=None, stats_cb=None):
        req_hdrs = {}
        if policy_index is not None:
            req_hdrs['X-Backend-Storage-Policy-Index'] = policy_index

        try:
            with self.client_pool.get_client() as swift_client:
                remote_meta = swift_client.head_object(
                    dst_container, key, headers=self._client_headers())
        except swiftclient.exceptions.ClientException as e:
            if e.http_status == 404:
                remote_meta = None
            else:
                raise

        try:
            metadata = internal_client.get_object_metadata(
                self.account, src_container, key, headers=req_hdrs)
        except UnexpectedResponse as e:
            if '404 Not Found' in e.message:
                return self.UploadStatus.NOT_FOUND
            raise

        if not segment:
            _, _, internal_timestamp = decode_timestamps(timestamp)
            if float(metadata['x-timestamp']) < \
                    float(internal_timestamp.internal):
                raise RetryError('Stale object %s' % key)

        if not segment and not match_item(metadata, self.selection_criteria):
            self.logger.debug(
                'Not archiving %s as metadata does not match: %s %s' % (
                    key, metadata, self.selection_criteria))
            return self.UploadStatus.SKIPPED_METADATA

        if check_slo(metadata):
            if segment:
                self.logger.warning(
                    'Nested SLOs are not currently supported. Failing to '
                    'upload: %s/%s/%s' % (self.account, src_container, key))
                return self.UploadStatus.SKIPPED_NESTED_SLO

            if remote_meta and self._check_slo_uploaded(
                    key, remote_meta, internal_client, req_hdrs):
                if not self._is_meta_synced(metadata, remote_meta):
                    # TODO: Update segments' X-Delete-At headers if
                    # remote_delete_after is applied/updated/removed.
                    self.update_metadata(key, metadata,
                                         remote_metadata=remote_meta,
                                         bucket=dst_container)
                    return self.UploadStatus.POST
                return self.UploadStatus.NOOP
            return self._upload_slo(key, internal_client, req_hdrs,
                                    stats_cb=stats_cb)

        dlo_prefix = get_dlo_prefix(metadata)
        if not segment and dlo_prefix:
            # TODO: we should be able to consolidate checking of uploaded
            # objects before getting into the specifics of uploading large
            # objects or regular objects.
            if remote_meta and self._check_dlo_uploaded(metadata, remote_meta,
                                                        internal_client):
                if not self._is_meta_synced(metadata, remote_meta):
                    self.update_metadata(key, metadata,
                                         remote_metadata=remote_meta,
                                         bucket=dst_container)
                    # TODO: Update segments' X-Delete-At headers if
                    # remote_delete_after is applied/updated/removed.
                    return self.UploadStatus.POST
                return self.UploadStatus.NOOP
            return self._upload_dlo(key, internal_client, metadata, req_hdrs,
                                    stats_cb=stats_cb)

        if remote_meta and metadata['etag'] == remote_meta['etag']:
            if not self._is_meta_synced(metadata, remote_meta,
                                        segment=segment):
                self.update_metadata(key, metadata,
                                     remote_metadata=remote_meta,
                                     bucket=dst_container, segment=segment)
                return self.UploadStatus.POST
            return self.UploadStatus.NOOP

        body = FileWrapper(internal_client, self.account, src_container, key,
                           req_hdrs, stats_cb=stats_cb)
        headers = self._get_user_headers(body.get_headers(), segment=segment)
        self.logger.debug('Uploading %s with meta: %r' % (key, headers))

        try:
            resp = self.put_object(
                key, self._client_headers(headers), body,
                bucket=dst_container, etag=body.get_headers()['etag'],
                content_length=len(body))
            if not resp.success:
                resp.reraise()
        finally:
            body.close()
        return self.UploadStatus.PUT
def update_new_item_from_existing(new_item, existing):
    """
    Compare the data and meta related timestamps of a new object item with
    the timestamps of an existing object record, and update the new item
    with data and/or meta related attributes from the existing record if
    their timestamps are newer.

    The multiple timestamps are encoded into a single string for storing
    in the 'created_at' column of the objects db table.

    :param new_item: A dict of object update attributes
    :param existing: A dict of existing object attributes
    :return: True if any attributes of the new item dict were found to be
             newer than the existing and therefore not updated, otherwise
             False implying that the updated item is equal to the existing.
    """

    # item[created_at] may be updated so keep a copy of the original
    # value in case we process this item again
    new_item.setdefault('data_timestamp', new_item['created_at'])

    # content-type and metadata timestamps may be encoded in
    # item[created_at], or may be set explicitly.
    item_ts_data, item_ts_ctype, item_ts_meta = decode_timestamps(
        new_item['data_timestamp'])

    if new_item.get('ctype_timestamp'):
        item_ts_ctype = Timestamp(new_item.get('ctype_timestamp'))
        item_ts_meta = item_ts_ctype
    if new_item.get('meta_timestamp'):
        item_ts_meta = Timestamp(new_item.get('meta_timestamp'))

    if not existing:
        # encode new_item timestamps into one string for db record
        new_item['created_at'] = encode_timestamps(
            item_ts_data, item_ts_ctype, item_ts_meta)
        return True

    # decode existing timestamp into separate data, content-type and
    # metadata timestamps
    rec_ts_data, rec_ts_ctype, rec_ts_meta = decode_timestamps(
        existing['created_at'])

    # Extract any swift_bytes values from the content_type values. This is
    # necessary because the swift_bytes value to persist should be that at the
    # most recent data timestamp whereas the content-type value to persist is
    # that at the most recent content-type timestamp. The two values happen to
    # be stored in the same database column for historical reasons.
    for item in (new_item, existing):
        content_type, swift_bytes = extract_swift_bytes(item['content_type'])
        item['content_type'] = content_type
        item['swift_bytes'] = swift_bytes

    newer_than_existing = [True, True, True]
    if rec_ts_data >= item_ts_data:
        # apply data attributes from existing record
        new_item.update([(k, existing[k])
                         for k in ('size', 'etag', 'deleted', 'swift_bytes')])
        item_ts_data = rec_ts_data
        newer_than_existing[0] = False
    if rec_ts_ctype >= item_ts_ctype:
        # apply content-type attribute from existing record
        new_item['content_type'] = existing['content_type']
        item_ts_ctype = rec_ts_ctype
        newer_than_existing[1] = False
    if rec_ts_meta >= item_ts_meta:
        # apply metadata timestamp from existing record
        item_ts_meta = rec_ts_meta
        newer_than_existing[2] = False

    # encode updated timestamps into one string for db record
    new_item['created_at'] = encode_timestamps(
        item_ts_data, item_ts_ctype, item_ts_meta)

    # append the most recent swift_bytes onto the most recent content_type in
    # new_item and restore existing to its original state
    for item in (new_item, existing):
        if item['swift_bytes']:
            item['content_type'] += ';swift_bytes=%s' % item['swift_bytes']
        del item['swift_bytes']

    return any(newer_than_existing)
    def container_sync_row(self, row, sync_to, user_key, broker, info,
                           realm, realm_key):
        """
        Sends the update the row indicates to the sync_to container.

        :param row: The updated row in the local database triggering the
                    sync update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    if realm and realm_key:
                        nonce = uuid.uuid4().hex
                        path = urlparse(sync_to).path + '/' + quote(
                            row['name'])
                        sig = self.realms_conf.get_sig(
                            'DELETE', path, headers['x-timestamp'], nonce,
                            realm_key, user_key)
                        headers['x-container-sync-auth'] = '%s %s %s' % (
                            realm, nonce, sig)
                    else:
                        headers['x-container-sync-key'] = user_key
                    delete_object(sync_to, name=row['name'], headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or
                # POST
                part, nodes = \
                    self.get_object_ring(info['storage_policy_index']). \
                    get_nodes(info['account'], info['container'],
                              row['name'])
                shuffle(nodes)
                exc = None
                # look up for the newest one
                headers_out = {
                    'X-Newest': True,
                    'X-Backend-Storage-Policy-Index':
                        str(info['storage_policy_index'])
                }
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4))
                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'),
                        {'account': info['account'],
                         'container': info['container'],
                         'object': row['name']})
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = headers['etag'].strip('"')
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                if realm and realm_key:
                    nonce = uuid.uuid4().hex
                    path = urlparse(sync_to).path + '/' + quote(row['name'])
                    sig = self.realms_conf.get_sig(
                        'PUT', path, headers['x-timestamp'], nonce, realm_key,
                        user_key)
                    headers['x-container-sync-auth'] = '%s %s %s' % (
                        realm, nonce, sig)
                else:
                    headers['x-container-sync-key'] = user_key
                put_object(sync_to, name=row['name'], headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(), logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to})
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to, 'obj_name': row['name']})
            else:
                self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'),
                                      {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(_('ERROR Syncing %(db_file)s %(row)s'),
                                  {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
    def container_sync_row(self, row, sync_to, user_key, broker, info,
                           realm, realm_key):
        """
        Sends the update the row indicates to the sync_to container.

        :param row: The updated row in the local database triggering the
                    sync update.
        :param sync_to: The URL to the remote container.
        :param user_key: The X-Container-Sync-Key to use when sending requests
                         to the other container.
        :param broker: The local container database broker.
        :param info: The get_info result from the local container database
                     broker.
        :param realm: The realm from self.realms_conf, if there is one.
            If None, fallback to using the older allowed_sync_hosts
            way of syncing.
        :param realm_key: The realm key from self.realms_conf, if there
            is one. If None, fallback to using the older
            allowed_sync_hosts way of syncing.
        :returns: True on success
        """
        try:
            start_time = time()
            # extract last modified time from the created_at value
            ts_data, ts_ctype, ts_meta = decode_timestamps(
                row['created_at'])
            if row['deleted']:
                # when sync'ing a deleted object, use ts_data - this is the
                # timestamp of the source tombstone
                try:
                    headers = {'x-timestamp': ts_data.internal}
                    if realm and realm_key:
                        nonce = uuid.uuid4().hex
                        path = urlparse(sync_to).path + '/' + quote(
                            row['name'])
                        sig = self.realms_conf.get_sig(
                            'DELETE', path, headers['x-timestamp'], nonce,
                            realm_key, user_key)
                        headers['x-container-sync-auth'] = '%s %s %s' % (
                            realm, nonce, sig)
                    else:
                        headers['x-container-sync-key'] = user_key
                    delete_object(sync_to, name=row['name'], headers=headers,
                                  proxy=self.select_http_proxy(),
                                  logger=self.logger,
                                  timeout=self.conn_timeout)
                except ClientException as err:
                    if err.http_status != HTTP_NOT_FOUND:
                        raise
                self.container_deletes += 1
                self.logger.increment('deletes')
                self.logger.timing_since('deletes.timing', start_time)
            else:
                # when sync'ing a live object, use ts_meta - this is the time
                # at which the source object was last modified by a PUT or
                # POST
                exc = None
                # look up for the newest one
                headers_out = {'X-Newest': True,
                               'X-Backend-Storage-Policy-Index':
                               str(info['storage_policy_index'])}
                try:
                    source_obj_status, headers, body = \
                        self.swift.get_object(info['account'],
                                              info['container'], row['name'],
                                              headers=headers_out,
                                              acceptable_statuses=(2, 4))
                except (Exception, UnexpectedResponse, Timeout) as err:
                    headers = {}
                    body = None
                    exc = err
                timestamp = Timestamp(headers.get('x-timestamp', 0))
                if timestamp < ts_meta:
                    if exc:
                        raise exc
                    raise Exception(
                        _('Unknown exception trying to GET: '
                          '%(account)r %(container)r %(object)r'),
                        {'account': info['account'],
                         'container': info['container'],
                         'object': row['name']})
                for key in ('date', 'last-modified'):
                    if key in headers:
                        del headers[key]
                if 'etag' in headers:
                    headers['etag'] = headers['etag'].strip('"')
                if 'content-type' in headers:
                    headers['content-type'] = clean_content_type(
                        headers['content-type'])
                if realm and realm_key:
                    nonce = uuid.uuid4().hex
                    path = urlparse(sync_to).path + '/' + quote(row['name'])
                    sig = self.realms_conf.get_sig(
                        'PUT', path, headers['x-timestamp'], nonce, realm_key,
                        user_key)
                    headers['x-container-sync-auth'] = '%s %s %s' % (
                        realm, nonce, sig)
                else:
                    headers['x-container-sync-key'] = user_key
                put_object(sync_to, name=row['name'], headers=headers,
                           contents=FileLikeIter(body),
                           proxy=self.select_http_proxy(), logger=self.logger,
                           timeout=self.conn_timeout)
                self.container_puts += 1
                self.logger.increment('puts')
                self.logger.timing_since('puts.timing', start_time)
        except ClientException as err:
            if err.http_status == HTTP_UNAUTHORIZED:
                self.logger.info(
                    _('Unauth %(sync_from)r => %(sync_to)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to})
            elif err.http_status == HTTP_NOT_FOUND:
                self.logger.info(
                    _('Not found %(sync_from)r => %(sync_to)r \
                      - object %(obj_name)r'),
                    {'sync_from': '%s/%s' %
                        (quote(info['account']), quote(info['container'])),
                     'sync_to': sync_to, 'obj_name': row['name']})
            else:
                self.logger.exception(
                    _('ERROR Syncing %(db_file)s %(row)s'),
                    {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        except (Exception, Timeout) as err:
            self.logger.exception(
                _('ERROR Syncing %(db_file)s %(row)s'),
                {'db_file': str(broker), 'row': row})
            self.container_failures += 1
            self.logger.increment('failures')
            return False
        return True
    def upload_slo(self, row, s3_meta, internal_client, upload_stats_cb=None):
        # Converts an SLO into a multipart upload. We use the segments as
        # is, for the part sizes.
        # NOTE: If the SLO segment is < 5MB and is not the last segment, the
        # UploadPart call will fail. We need to stitch segments together in
        # that case.
        #
        # For Google Cloud Storage, we will convert the SLO into a single
        # object put, assuming the SLO is < 5TB. If the SLO is > 5TB, we have
        # to fail the upload. With GCS _compose_, we could support larger
        # objects, but defer this work for the time being.
        swift_req_hdrs = {
            'X-Backend-Storage-Policy-Index': row['storage_policy_index']}
        swift_key = row['name']
        status, headers, body = internal_client.get_object(
            self.account, self.container, swift_key, headers=swift_req_hdrs)
        if status != 200:
            body.close()
            raise RuntimeError('Failed to get the manifest')
        manifest = json.loads(''.join(body))
        body.close()

        _, _, metadata_timestamp = decode_timestamps(row['created_at'])
        if float(headers['x-timestamp']) < metadata_timestamp.timestamp:
            raise RetryError('Stale object %s' % row['name'])

        self.logger.debug("JSON manifest: %s" % str(manifest))
        s3_key = self.get_s3_name(swift_key)

        if not self._validate_slo_manifest(manifest):
            # We do not raise an exception here -- we should not retry these
            # errors and they will be logged.
            # TODO: When we report statistics, we need to account for
            # permanent failures.
            self.logger.error('Failed to validate the SLO manifest for %s' %
                              self._full_name(swift_key))
            return self.UploadStatus.INVALID_SLO

        if self._google():
            if s3_meta:
                slo_etag = s3_meta['Metadata'].get(SLO_ETAG_FIELD, None)
                if slo_etag == headers['etag']:
                    if self.is_object_meta_synced(s3_meta, headers):
                        return self.UploadStatus.NOOP
                    self.update_metadata(swift_key, headers)
                    return self.UploadStatus.POST
            self._upload_google_slo(manifest, headers, s3_key,
                                    internal_client, upload_stats_cb)
        else:
            expected_etag = get_slo_etag(manifest)

            if s3_meta and self.check_etag(expected_etag, s3_meta['ETag']):
                if self.is_object_meta_synced(s3_meta, headers):
                    return self.UploadStatus.NOOP
                elif not self.in_glacier(s3_meta):
                    self.update_slo_metadata(headers, manifest, s3_key,
                                             swift_req_hdrs, internal_client)
                    return self.UploadStatus.POST
            self._upload_slo(manifest, headers, s3_key, internal_client,
                             upload_stats_cb)

        with self.client_pool.get_client() as s3_client:
            # We upload the manifest so that we can restore the object in
            # Swift and have it match the S3 multipart ETag. To avoid name
            # length issues, we hash the object name and append the suffix
            params = dict(
                Bucket=self.aws_bucket,
                Key=self.get_manifest_name(s3_key),
                Body=json.dumps(manifest),
                ContentLength=len(json.dumps(manifest)),
                ContentType='application/json')
            if self._is_amazon() and self.encryption:
                params['ServerSideEncryption'] = 'AES256'
            s3_client.put_object(**params)
        return self.UploadStatus.PUT
def _get_last_modified_date(row):
    ts, content, meta = decode_timestamps(row['created_at'])
    # NOTE: the meta timestamp will always be latest, as it will be updated
    # when content type is updated
    return meta
    def upload_object(self, row, internal_client, upload_stats_cb=None):
        swift_key = row['name']
        s3_key = self.get_s3_name(swift_key)
        try:
            with self.client_pool.get_client() as s3_client:
                s3_meta = s3_client.head_object(Bucket=self.aws_bucket,
                                                Key=s3_key)
        except botocore.exceptions.ClientError as e:
            resp_meta = e.response.get('ResponseMetadata', {})
            if resp_meta.get('HTTPStatusCode', 0) == 404:
                s3_meta = None
            else:
                raise e
        swift_req_hdrs = {
            'X-Backend-Storage-Policy-Index': row['storage_policy_index']}

        try:
            metadata = internal_client.get_object_metadata(
                self.account, self.container, swift_key,
                headers=swift_req_hdrs)
        except UnexpectedResponse as e:
            if '404 Not Found' in e.message:
                return self.UploadStatus.NOT_FOUND
            raise
        _, _, metadata_timestamp = decode_timestamps(row['created_at'])
        if float(metadata['x-timestamp']) < metadata_timestamp.timestamp:
            raise RetryError('Stale object %s' % row['name'])

        if not match_item(metadata, self.selection_criteria):
            self.logger.debug(
                'Not archiving %s as metadata does not match: %s %s' % (
                    swift_key, metadata, self.selection_criteria))
            return self.UploadStatus.SKIPPED_METADATA

        self.logger.debug("Metadata: %s" % str(metadata))
        if check_slo(metadata):
            return self.upload_slo(row, s3_meta, internal_client,
                                   upload_stats_cb)

        if s3_meta and self.check_etag(metadata['etag'], s3_meta['ETag']):
            if self.is_object_meta_synced(s3_meta, metadata):
                return self.UploadStatus.NOOP
            elif not self.in_glacier(s3_meta):
                self.update_metadata(swift_key, metadata)
                return self.UploadStatus.POST

        with self.client_pool.get_client() as s3_client:
            wrapper_stream = FileWrapper(internal_client, self.account,
                                         self.container, swift_key,
                                         swift_req_hdrs,
                                         stats_cb=upload_stats_cb)
            self.logger.debug('Uploading %s with meta: %r' % (
                s3_key, wrapper_stream.get_s3_headers()))
            params = dict(
                Bucket=self.aws_bucket,
                Key=s3_key,
                Body=wrapper_stream,
                Metadata=wrapper_stream.get_s3_headers(),
                ContentLength=len(wrapper_stream),
                ContentMD5=base64.b64encode(
                    wrapper_stream.get_headers()['etag'].decode('hex')),
                ContentType=metadata['content-type']
            )
            if self._is_amazon() and self.encryption:
                params['ServerSideEncryption'] = 'AES256'
            try:
                s3_client.put_object(**params)
            finally:
                wrapper_stream.close()
        return self.UploadStatus.PUT