def select_package_stats(s3_client, bucket, manifest_key) -> Optional[dict]:
    """use s3 select to generate file stats for package"""
    logger_ = get_quilt_logger()
    try:
        raw_stats = query_manifest_content(
            s3_client,
            bucket=bucket,
            key=manifest_key,
            sql_stmt=SELECT_PACKAGE_STATS
        ).read()

        if raw_stats:
            stats = json.loads(raw_stats)
            assert isinstance(stats['total_bytes'], int)
            assert isinstance(stats['total_files'], int)
            return stats

    except (
            AssertionError,
            botocore.exceptions.ClientError,
            json.JSONDecodeError,
            KeyError,
    ):
        logger_.exception("Unable to compute package stats via S3 select")

    return None
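# NOTE: illustrative sketch only -- query_manifest_content is defined elsewhere in this
# package. Assuming the manifest is stored as JSON Lines, a minimal version built on the
# real boto3 select_object_content API might look like the following; the serialization
# settings are assumptions, not the shipped code.
import io


def query_manifest_content_sketch(s3_client, *, bucket, key, sql_stmt):
    """run an S3 Select query against a manifest and return a file-like object"""
    response = s3_client.select_object_content(
        Bucket=bucket,
        Key=key,
        ExpressionType="SQL",
        Expression=sql_stmt,
        InputSerialization={"JSON": {"Type": "LINES"}, "CompressionType": "NONE"},
        OutputSerialization={"JSON": {"RecordDelimiter": "\n"}},
    )
    buffer = io.BytesIO()
    # the response payload is an event stream; only 'Records' events carry data
    for event in response["Payload"]:
        if "Records" in event:
            buffer.write(event["Records"]["Payload"])
    buffer.seek(0)
    return buffer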
def _append_document(self, doc):
    """append well-formed documents (used for retry or by append())"""
    logger_ = get_quilt_logger()
    if doc.get("content"):
        # document text dominates memory footprint; OK to neglect the
        # small fixed size for the JSON metadata
        self.size += min(doc["size"], ELASTIC_LIMIT_BYTES)
    logger_.debug("Appending document %s", doc)
    self.queue.append(doc)
def append(
        self,
        *,
        bucket: str,
        key: str,
        etag: str,
        last_modified: str,
        size: int,
        text: str,
        event_type: str,
        ext: str,
        version_id,
):
    """format event as a document and then queue the document"""
    logger_ = get_quilt_logger()
    if not bucket or not key:
        raise ValueError(f"bucket={bucket} or key={key} required but missing")
    is_delete_marker = False
    if event_type.startswith(EVENT_PREFIX["Created"]):
        _op_type = "index"
    elif event_type.startswith(EVENT_PREFIX["Removed"]):
        _op_type = "delete"
        if event_type.endswith("DeleteMarkerCreated"):
            is_delete_marker = True
            # we index (not delete) delete markers to sync state with S3
            _op_type = "index"
    else:
        logger_.error("Skipping unrecognized event type %s", event_type)
        return
    # On types and fields, see
    # https://www.elastic.co/guide/en/elasticsearch/reference/master/mapping.html
    # Set common properties on the document
    # BE CAREFUL changing these values, as type changes or missing fields
    # can cause exceptions from ES
    # ensure the same versionId and primary keys (_id) as given by
    # list-object-versions in the enterprise bulk_scanner
    version_id = version_id or "null"
    # core properties for all document types;
    # see https://elasticsearch-py.readthedocs.io/en/6.3.1/helpers.html
    body = {
        "_index": bucket,
        "_op_type": _op_type,  # determines if action is upsert (index) or delete
        "_id": get_id(key, version_id),
        "etag": etag,
        "key": key,
        "last_modified": last_modified,
        "size": size,
        "delete_marker": is_delete_marker,
        "version_id": version_id,
        "content": text,  # field for full-text search
        "event": event_type,
        "ext": ext,
        "updated": datetime.utcnow().isoformat(),
    }
    self.append_document(body)
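# NOTE: get_id is defined elsewhere in this module; the document _id only needs to be
# stable and unique per (key, version). A minimal sketch of the idea -- the exact
# format is an assumption, not the shipped implementation:
def get_id_sketch(key: str, version_id: str) -> str:
    """construct a deterministic Elasticsearch _id for a given object version"""
    return f"{key}:{version_id}"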
def get_time_remaining(context):
    """returns time remaining in seconds before lambda context is shut down"""
    logger_ = get_quilt_logger()
    time_remaining = floor(context.get_remaining_time_in_millis()/1000)
    if time_remaining < 30:
        logger_.warning(
            "Lambda function has %s sec remaining. Reduce batch size?", time_remaining
        )
    return time_remaining
def _filter_and_delete_packages(self, elastic):
    """handle package hard delete"""
    logger_ = get_quilt_logger()
    true_docs = []
    for doc in self.queue:
        pointer_file = doc.get("pointer_file")
        # handle hard package delete outside of the bulk operation
        if doc["_op_type"] == "delete" and pointer_file and not doc.get("delete_marker"):
            index = doc.get("_index")
            assert index.endswith(PACKAGE_INDEX_SUFFIX), f"Refuse to delete non-package: {doc}"
            handle = doc.get("handle")
            assert handle, "Cannot delete package without handle"
            # no try/except because failure to delete, or trying to delete things
            # that aren't in ES, does not throw
            deletes = elastic.delete_by_query(
                index=index,
                body={
                    "query": {
                        "bool": {
                            "must": [
                                # use match (not term) because some of these fields are analyzed
                                {"match": {"handle": handle}},
                                {"match": {"pointer_file": pointer_file}},
                                {"match": {"delete_marker": False}},
                            ]
                        }
                    }
                },
                # we delete synchronously, so don't let it linger too long
                timeout='20s'
            )
            logger_.debug("Deleted %s stamped %s: %s", handle, pointer_file, deletes)
            if not deletes.get("deleted"):
                logger_.warning("Unable to delete: %s", doc)
        # send everything else to bulk()
        else:
            logger_.debug("Not filtering docs: %s", doc)
            true_docs.append(doc)
    # the queue is now everything we didn't delete by query above
    self.queue = true_docs
def retry_s3(
        operation,
        bucket,
        key,
        size=None,
        limit=None,
        *,
        etag,
        version_id,
        s3_client
):
    """retry a head or get operation to S3; stop before we run out of time.
    retry is necessary since, due to eventual consistency, we may not always get
    the required version of the object.
    """
    logger_ = get_quilt_logger()
    if operation == "head":
        function_ = s3_client.head_object
    elif operation == "get":
        function_ = s3_client.get_object
    else:
        raise ValueError(f"unexpected operation: {operation}")
    # Keyword arguments to function_
    arguments = {
        "Bucket": bucket,
        "Key": key
    }
    if operation == 'get' and size and limit:
        # can only request range if file is not empty
        arguments['Range'] = f"bytes=0-{min(size, limit) - 1}"
    if version_id:
        arguments['VersionId'] = version_id
    elif etag:
        arguments['IfMatch'] = etag
    logger_.debug("Entering @retry: %s, %s", operation, arguments)

    @retry(  # debug
        reraise=True,
        stop=stop_after_attempt(MAX_RETRY),
        wait=wait_exponential(multiplier=2, min=4, max=10),
        retry=(retry_if_exception(should_retry_exception))
    )
    def call():
        """local function so we can set stop_after_delay dynamically"""
        # TODO: remove all this, stop_after_delay is not dynamically loaded anymore
        return function_(**arguments)

    return call()
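# NOTE: should_retry_exception is defined elsewhere in this module. A minimal sketch
# of the idea -- retry transient S3 errors, give up immediately on permanent 40X
# responses -- might look like this; the exact error codes are an assumption, not the
# shipped list:
def should_retry_exception_sketch(exception) -> bool:
    """return True if an S3 call that raised `exception` is worth retrying"""
    if isinstance(exception, botocore.exceptions.ClientError):
        code = str(exception.response.get("Error", {}).get("Code", ""))
        # missing or forbidden objects won't get better on retry
        return code not in ("403", "404", "AccessDenied", "NoSuchKey")
    # non-ClientError exceptions are not retried in this sketch
    return False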
def append_document(self, doc):
    """append well-formed documents (used for retry or by append())"""
    logger_ = get_quilt_logger()
    # This should be removed when we migrate to recent ES versions, see
    # https://www.elastic.co/guide/en/elasticsearch/reference/6.7/removal-of-types.html
    doc["_type"] = "_doc"
    # document text dominates memory footprint; OK to neglect the
    # small fixed size for the JSON metadata
    self.size += len(doc.get("content") or "")
    logger_.debug("Appending document %s", doc)
    self.queue.append(doc)
    if self.size >= QUEUE_LIMIT_BYTES:
        self.send_all()
def do_index(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        text: str = '',
        size: int = 0,
        version_id: Optional[str] = None,
):
    """wrap dual indexing of packages and objects"""
    logger_ = get_quilt_logger()
    # index as object (always)
    logger_.debug("%s to indexing queue (%s)", key, event_type)
    doc_queue.append(
        event_type,
        DocTypes.OBJECT,
        bucket=bucket,
        ext=ext,
        etag=etag,
        key=key,
        last_modified=last_modified,
        size=size,
        text=text,
        version_id=version_id
    )
    # maybe index as package
    if index_if_package(
            s3_client,
            doc_queue,
            event_type,
            bucket=bucket,
            etag=etag,
            ext=ext,
            key=key,
            last_modified=last_modified,
            size=size,
            version_id=version_id,
    ):
        logger_.debug("%s indexed as package (%s)", key, event_type)
def bulk_send(elastic, list_):
    """make a bulk() call to elastic"""
    logger_ = get_quilt_logger()
    logger_.debug("bulk_send(): %s", list_)
    return bulk(
        elastic,
        list_,
        # Some magic numbers to reduce memory pressure
        # e.g. see https://github.com/wagtail/wagtail/issues/4554
        chunk_size=100,  # max number of documents sent in one chunk
        # The stated default is max_chunk_bytes=10485760, but with default
        # ES will still return an exception stating that the very
        # same request size limit has been exceeded
        max_chunk_bytes=CHUNK_LIMIT_BYTES,
        # number of retries for 429 (too many requests only)
        # all other errors handled by our code
        max_retries=RETRY_429,
        # we'll process errors on our own
        raise_on_error=False,
        raise_on_exception=False
    )
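# NOTE: send_all is referenced throughout but defined elsewhere on DocumentQueue. A
# minimal sketch of how it might drain the queue through bulk_send() -- the connection
# setup (ES_HOST environment variable, no request signing) is an assumption for
# illustration, not the shipped configuration:
import os

from elasticsearch import Elasticsearch, RequestsHttpConnection


def send_all_sketch(self):
    """flush the document queue in one or more bulk() calls"""
    if not self.queue:
        return
    elastic = Elasticsearch(
        hosts=[{"host": os.environ["ES_HOST"], "port": 443}],
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=30,  # assumed; the real code uses its own timeout constant
    )
    # hard package deletes go through delete_by_query; everything else stays queued
    self._filter_and_delete_packages(elastic)
    # bulk() returns (success_count, errors) because raise_on_error=False
    _, errors = bulk_send(elastic, self.queue)
    for error in errors:
        # 429 retries are handled inside bulk(); anything left over is just logged
        get_quilt_logger().error("Elasticsearch error: %s", error)
    # reset the queue
    self.queue = []
    self.size = 0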
def shape_event(event: dict):
    """check event schema, return None if schema check fails"""
    logger_ = get_quilt_logger()
    try:
        validate(
            instance=event,
            schema=EVENT_SCHEMA,
            # format_checker= required for format: date-time validation
            # (we also need strict-rfc3339 in requirements.txt)
            format_checker=draft7_format_checker,
        )
    except ValidationError as error:
        logger_.error("Invalid event format: %s\n%s", error, event)
        return None
    # be a good citizen and don't modify params
    return {
        **event,
        'eventName': map_event_name(event),
    }
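# NOTE: EVENT_SCHEMA is defined elsewhere in this module. An illustrative draft-07
# JSON Schema covering just the fields the handler below reads; the real schema is
# stricter, and the property list and required set here are assumptions:
EVENT_SCHEMA_SKETCH = {
    "type": "object",
    "properties": {
        "eventName": {"type": "string"},
        "eventTime": {"type": "string", "format": "date-time"},
        "s3": {
            "type": "object",
            "properties": {
                "bucket": {
                    "type": "object",
                    "properties": {"name": {"type": "string"}},
                    "required": ["name"],
                },
                "object": {
                    "type": "object",
                    "properties": {
                        "key": {"type": "string"},
                        "eTag": {"type": "string"},
                        "versionId": {"type": "string"},
                    },
                    "required": ["key"],
                },
            },
            "required": ["bucket", "object"],
        },
    },
    "required": ["eventName", "eventTime", "s3"],
}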
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            if body_message.get("Event") == TEST_EVENT:
                logger_.debug("Skipping S3 Test Event")
                # Consume and ignore this event, which is an initial message from
                # SQS; see https://forums.aws.amazon.com/thread.jspa?threadID=84331
                continue
            print("Unexpected message['body']. No 'Records' key.", message)
            raise Exception("Unexpected message['body']. No 'Records' key.")
        batch_processor = DocumentQueue(context)
        events = body_message.get("Records", [])
        s3_client = make_s3_client()
        # event is a single S3 event
        for event_ in events:
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    continue
                bucket = unquote(event_["s3"]["bucket"]["name"])
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId")
                version_id = unquote(version_id) if version_id else None
                # Skip delete markers when versioning is on
                if version_id and event_name == "ObjectRemoved:DeleteMarkerCreated":
                    continue
                # ObjectRemoved:Delete does not include "eTag"
                etag = unquote(event_["s3"]["object"].get("eTag", ""))
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    logger_.debug("Object delete to queue")
                    batch_processor.append(
                        event_name,
                        DocTypes.OBJECT,
                        bucket=bucket,
                        ext=ext,
                        etag=etag,
                        key=key,
                        last_modified=now_like_boto3(),
                        text="",
                        version_id=version_id
                    )
                    continue
                try:
                    logger_.debug("Get object head")
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                            logger_.error("Fatal head_object, skipping event: %s", event_)
                            continue
                size = head["ContentLength"]
                last_modified = head["LastModified"]
                did_index = index_if_manifest(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    version_id=version_id
                )
                logger_.debug("Logged as manifest? %s", did_index)
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct. re-raise below.
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    content_exception = exc
                    logger_.error("Content extraction failed %s %s %s", bucket, key, exc)
                batch_processor.append(
                    event_name,
                    DocTypes.OBJECT,
                    bucket=bucket,
                    key=key,
                    ext=ext,
                    etag=etag,
                    version_id=version_id,
                    last_modified=last_modified,
                    size=size,
                    text=text
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Got exception but retrying: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
        # flush the queue
        batch_processor.send_all()
    # note: if there are multiple content exceptions in the batch, this will
    # only raise the most recent one;
    # re-raise so that get_contents() failures end up in the DLQ
    if content_exception:
        logger_.critical("Failed batch due to %s", content_exception)
        raise content_exception
def index_if_manifest(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest files as package documents in ES
    Returns:
        - True if manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)
        # this is probably the latest pointer, skip it. manifest already indexed.
        return False
    else:
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False

    package_hash = get_plain_text(
        bucket,
        key,
        size,
        None,
        etag=etag,
        s3_client=s3_client,
        version_id=version_id,
    ).strip()
    manifest_key = f"{MANIFEST_PREFIX_V1}{package_hash}"
    first = select_manifest_meta(s3_client, bucket, manifest_key)
    stats = select_package_stats(s3_client, bucket, manifest_key)
    if not first:
        logger_.error("S3 select failed %s %s", bucket, manifest_key)
        return False
    try:
        first_dict = json.loads(first)
        doc_queue.append(
            event_type,
            DocTypes.PACKAGE,
            bucket=bucket,
            etag=etag,
            ext=ext,
            handle=handle,
            key=manifest_key,
            last_modified=last_modified,
            package_hash=package_hash,
            package_stats=stats,
            pointer_file=pointer_file,
            comment=str(first_dict.get("message", "")),
            metadata=json.dumps(first_dict.get("user_meta", {})),
        )
        return True
    except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
        print(
            f"{exc}\n"
            f"\tFailed to select first line of manifest s3://{bucket}/{key}."
            f"\tGot {first}."
        )
        return False
        'meta': {
            'type': 'object',
        },
    },
    'required': ['logical_key', 'physical_key'],
}

s3 = boto3.client('s3')
lambda_ = boto3.client('lambda')

# Monkey patch quilt3 S3ClientProvider, so it builds a client using user credentials.
user_boto_session = None
quilt3.data_transfer.S3ClientProvider.get_boto_session = staticmethod(
    lambda: user_boto_session)

logger = get_quilt_logger()


def calculate_pkg_hashes(boto_session, pkg):
    entries = []
    for lk, entry in pkg.walk():
        if entry.hash is not None:
            continue
        if entry.size > S3_HASH_LAMBDA_MAX_FILE_SIZE_BYTES:
            raise FileTooLargeForHashing(lk)
        entries.append(entry)

    user_s3 = boto_session.client("s3")

    @functools.lru_cache(maxsize=None)
def append(
        self,
        event_type: str,
        doc_type: DocTypes,
        # properties unique to a document type are non-required kwargs
        ext: str = '',
        handle: str = '',
        metadata: str = '',
        pointer_file: str = '',
        # this could be the hash OR tag; to be used in _id primary key
        package_hash: str = '',
        package_stats: Dict[str, int] = None,
        tags: List[str] = (),
        text: str = '',
        version_id=None,
        *,
        # common properties are required kwargs
        bucket: str,
        comment: str = '',
        key: str,
        etag: str,
        last_modified: str,
        size: int = 0
):
    """format event as a document and then queue the document"""
    logger_ = get_quilt_logger()
    if not bucket or not key:
        raise ValueError(f"bucket={bucket} or key={key} required but missing")
    is_delete_marker = False
    if event_type.startswith(EVENT_PREFIX["Created"]):
        _op_type = "index"
    elif event_type.startswith(EVENT_PREFIX["Removed"]):
        _op_type = "delete"
        if event_type.endswith("DeleteMarkerCreated"):
            is_delete_marker = True
            # we index (not delete) delete markers to sync state with S3
            _op_type = "index"
    else:
        logger_.error("Skipping unrecognized event type %s", event_type)
        return
    # On types and fields, see
    # https://www.elastic.co/guide/en/elasticsearch/reference/master/mapping.html
    # Set common properties on the document
    # BE CAREFUL changing these values, as type changes or missing fields
    # can cause exceptions from ES
    index_name = bucket
    if doc_type == DocTypes.PACKAGE:
        index_name += PACKAGE_INDEX_SUFFIX
    if not index_name:
        raise ValueError(f"Can't infer index name; bucket={bucket}, doc_type={doc_type}")
    # ensure the same versionId and primary keys (_id) as given by
    # list-object-versions in the enterprise bulk_scanner
    version_id = version_id or "null"
    # core properties for all document types;
    # see https://elasticsearch-py.readthedocs.io/en/6.3.1/helpers.html
    body = {
        "_index": index_name,
        "_op_type": _op_type,  # determines if action is upsert (index) or delete
        # TODO remove this; it's not meaningful since we use a different index
        # type for object vs. package documents
        "_type": "_doc",
        "_id": get_id(key, version_id),
        # TODO nest fields under "document" and maybe use _type:{package, object}
        "comment": comment,
        "etag": etag,
        "key": key,
        "last_modified": last_modified,
        "size": size,
        "delete_marker": is_delete_marker,
        "version_id": version_id,
    }
    if doc_type == DocTypes.PACKAGE:
        if not handle:
            raise ValueError("missing required argument for package doc")
        if _op_type == "index":
            if not (pointer_file and package_hash):
                raise ValueError("missing required argument for package doc")
        if not (
                package_stats is None
                or isinstance(package_stats, dict)
                and {'total_files', 'total_bytes'}.issubset(package_stats)
        ):
            raise ValueError("Malformed package_stats")
        body.update({
            "handle": handle,
            "hash": package_hash,
            "metadata": metadata,
            "pointer_file": pointer_file,
            "tags": ",".join(tags)
        })
        if package_stats:
            body.update({
                "package_stats": package_stats,
            })
    elif doc_type == DocTypes.OBJECT:
        body.update({
            # TODO: remove this field from ES in /enterprise (now deprecated and unused)
            # here we explicitly drop the comment
            "comment": "",
            "content": text,  # field for full-text search
            "event": event_type,
            "ext": ext,
            "target": "",
            "updated": datetime.utcnow().isoformat(),
        })
    else:
        logger_.error("Skipping unexpected document type: %s", doc_type)

    self._append_document(body)

    if self.size >= QUEUE_LIMIT_BYTES:
        self.send_all()
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        event_type: str,
        *,
        bucket: str,
        etag: str,
        ext: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
        size: int
) -> bool:
    """index manifest pointer files as package documents in ES
    Returns:
        - True if pointer to manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        is_tag = False
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        is_tag = True
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    package_hash = ''
    first_dict = {}
    stats = None
    # we only need to get manifest contents for proper create events (not latest pointers)
    if event_type.startswith(EVENT_PREFIX["Created"]) and not is_tag:
        package_hash = get_plain_text(
            bucket,
            key,
            size,
            None,
            etag=etag,
            s3_client=s3_client,
            version_id=version_id,
        ).strip()
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not first:
            logger_.error("S3 select failed %s %s", bucket, manifest_key)
            return False
        try:
            first_dict = json.loads(first)
        except (json.JSONDecodeError, botocore.exceptions.ClientError) as exc:
            print(
                f"{exc}\n"
                f"\tFailed to select first line of manifest s3://{bucket}/{key}."
                f"\tGot {first}."
            )
            return False

    doc_queue.append(
        event_type,
        DocTypes.PACKAGE,
        bucket=bucket,
        etag=etag,
        ext=ext,
        handle=handle,
        key=key,
        last_modified=last_modified,
        # if we don't have the hash, we're processing a tag
        package_hash=(package_hash or pointer_file),
        package_stats=stats,
        pointer_file=pointer_file,
        comment=str(first_dict.get("message", "")),
        metadata=json.dumps(first_dict.get("user_meta", {})),
        version_id=version_id,
    )
    return True
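# NOTE: illustrative only -- how a pointer key decomposes under the package layout
# assumed by the index_if_* functions in this file. The prefix literals below are
# assumptions for the example; the real values come from the POINTER_PREFIX_V1 and
# MANIFEST_PREFIX_V1 constants defined elsewhere in this module.
_EXAMPLE_POINTER_KEY = ".quilt/named_packages/user/pkg/1606860000"  # timestamp pointer
_example_prefix, _example_pointer_file = split(_EXAMPLE_POINTER_KEY)
# _example_prefix        -> ".quilt/named_packages/user/pkg"
# _example_pointer_file  -> "1606860000" (a tag such as "latest" is also possible)
_example_handle = _example_prefix[len(".quilt/named_packages/"):]  # -> "user/pkg"
# the pointer file's single line is the package top hash, so the manifest itself
# would live at f".quilt/packages/{top_hash}"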
def index_if_package(
        s3_client,
        doc_queue: DocumentQueue,
        *,
        bucket: str,
        etag: str,
        key: str,
        last_modified: str,
        version_id: Optional[str],
) -> bool:
    """index manifest pointer files as package documents in ES
    Returns:
        - True if pointer to manifest (and passes to doc_queue for indexing)
        - False if not a manifest (no attempt at indexing)
    """
    logger_ = get_quilt_logger()
    pointer_prefix, pointer_file = split(key)
    handle = pointer_prefix[len(POINTER_PREFIX_V1):]
    if (
            not pointer_file
            or not pointer_prefix.startswith(POINTER_PREFIX_V1)
            or len(handle) < 3
            or '/' not in handle
    ):
        logger_.debug("Not indexing as manifest file s3://%s/%s", bucket, key)
        return False
    try:
        manifest_timestamp = int(pointer_file)
        if not 1451631600 <= manifest_timestamp <= 1767250800:
            logger_.warning("Unexpected manifest timestamp s3://%s/%s", bucket, key)
            return False
    except ValueError as err:
        logger_.debug("Non-integer manifest pointer: s3://%s/%s, %s", bucket, key, err)

    def get_pkg_data():
        try:
            package_hash = s3_client.get_object(
                Bucket=bucket,
                Key=key,
            )['Body'].read().decode()
        except botocore.exceptions.ClientError:
            return
        manifest_key = f'{MANIFEST_PREFIX_V1}{package_hash}'
        first = select_manifest_meta(s3_client, bucket, manifest_key)
        if not first:
            return
        stats = select_package_stats(s3_client, bucket, manifest_key)
        if not stats:
            return
        return {
            "key": key,
            "etag": etag,
            "version_id": version_id,
            "last_modified": last_modified,
            "delete_marker": False,  # TODO: remove
            "handle": handle,
            "pointer_file": pointer_file,
            "hash": package_hash,
            "package_stats": stats,
            "metadata": json.dumps(first.get("user_meta", {})),
            "comment": str(first.get("message", "")),
        }

    data = get_pkg_data() or {}
    doc_queue.append_document({
        "_index": bucket + PACKAGE_INDEX_SUFFIX,
        "_id": key,
        "_op_type": "index" if data else "delete",
        **data,
    })
    return True
def handler(event, context):
    """enumerate S3 keys in event, extract relevant data, queue events, send to
    elastic via bulk() API
    """
    logger_ = get_quilt_logger()
    # message is a proper SQS message, which either contains a single event
    # (from the bucket notification system) or batch-many events as determined
    # by enterprise/**/bulk_loader.py
    # An exception that we'll want to re-raise after the batch sends
    content_exception = None
    batch_processor = DocumentQueue(context)
    s3_client = make_s3_client()
    for message in event["Records"]:
        body = json.loads(message["body"])
        body_message = json.loads(body["Message"])
        if "Records" not in body_message:
            # could be TEST_EVENT, or another unexpected event; skip it
            logger_.error("No 'Records' key in message['body']: %s", message)
            continue
        events = body_message["Records"]
        # event is a single S3 event
        for event_ in events:
            validated = shape_event(event_)
            if not validated:
                logger_.debug("Skipping invalid event %s", event_)
                continue
            event_ = validated
            logger_.debug("Processing %s", event_)
            try:
                event_name = event_["eventName"]
                # Process all Create:* and Remove:* events
                if not any(event_name.startswith(n) for n in EVENT_PREFIX.values()):
                    logger_.warning("Skipping unknown event type: %s", event_name)
                    continue
                bucket = event_["s3"]["bucket"]["name"]
                # In the grand tradition of IE6, S3 events turn spaces into '+'
                # TODO: check if eventbridge events do the same thing with +
                key = unquote_plus(event_["s3"]["object"]["key"])
                version_id = event_["s3"]["object"].get("versionId", None)
                # ObjectRemoved:Delete does not include "eTag"
                etag = event_["s3"]["object"].get("eTag", "")
                # synthetic events from bulk scanner might define lastModified
                last_modified = (
                    event_["s3"]["object"].get("lastModified") or event_["eventTime"]
                )
                # Get two levels of extensions to handle files like .csv.gz
                path = pathlib.PurePosixPath(key)
                ext1 = path.suffix
                ext2 = path.with_suffix('').suffix
                ext = (ext2 + ext1).lower()
                # Handle delete and deletemarker first and then continue so that
                # head_object and get_object (below) don't fail
                if event_name.startswith(EVENT_PREFIX["Removed"]):
                    do_index(
                        s3_client,
                        batch_processor,
                        event_name,
                        bucket=bucket,
                        etag=etag,
                        ext=ext,
                        key=key,
                        last_modified=last_modified,
                        version_id=version_id
                    )
                    continue
                try:
                    head = retry_s3(
                        "head",
                        bucket,
                        key,
                        s3_client=s3_client,
                        version_id=version_id,
                        etag=etag
                    )
                except botocore.exceptions.ClientError as first:
                    logger_.warning("head_object error: %s", first)
                    # "null" version sometimes results in 403s for buckets
                    # that have changed versioning, retry without it
                    if (first.response.get('Error', {}).get('Code') == "403"
                            and version_id == "null"):
                        try:
                            head = retry_s3(
                                "head",
                                bucket,
                                key,
                                s3_client=s3_client,
                                version_id=None,
                                etag=etag
                            )
                        except botocore.exceptions.ClientError as second:
                            # this will bypass the DLQ but that's the right thing to do
                            # as some listed objects may NEVER succeed head requests
                            # (e.g. foreign owner) and there's no reason to torpedo
                            # the whole batch (which might include good files)
                            logger_.warning("Retried head_object error: %s", second)
                            logger_.error("Fatal head_object, skipping event: %s", event_)
                            continue
                # backfill fields based on the head_object
                size = head["ContentLength"]
                last_modified = last_modified or head["LastModified"].isoformat()
                etag = head.get("etag") or etag
                version_id = head.get("VersionId") or version_id
                try:
                    text = maybe_get_contents(
                        bucket,
                        key,
                        ext,
                        etag=etag,
                        version_id=version_id,
                        s3_client=s3_client,
                        size=size
                    )
                # we still want an entry for this document in elastic so that, e.g.,
                # the file counts from elastic are correct
                # these exceptions can happen for a variety of reasons (e.g. glacier
                # storage class, index event arrives after delete has occurred, etc.)
                # given how common they are, we shouldn't fail the batch for this
                except Exception as exc:  # pylint: disable=broad-except
                    text = ""
                    logger_.warning("Content extraction failed %s %s %s", bucket, key, exc)
                do_index(
                    s3_client,
                    batch_processor,
                    event_name,
                    bucket=bucket,
                    etag=etag,
                    ext=ext,
                    key=key,
                    last_modified=last_modified,
                    size=size,
                    text=text,
                    version_id=version_id
                )
            except botocore.exceptions.ClientError as boto_exc:
                if not should_retry_exception(boto_exc):
                    logger_.warning("Skipping non-fatal exception: %s", boto_exc)
                    continue
                logger_.critical("Failed record: %s, %s", event, boto_exc)
                raise boto_exc
    # flush the queue
    batch_processor.send_all()
def maybe_get_contents(bucket, key, ext, *, etag, version_id, s3_client, size):
    """get the byte contents of a file if it's a target for deep indexing"""
    logger_ = get_quilt_logger()
    if ext.endswith('.gz'):
        compression = 'gz'
        ext = ext[:-len('.gz')]
    else:
        compression = None
    logger_.debug(
        "Entering maybe_get_contents (could run out of mem.) %s %s %s",
        bucket, key, version_id
    )
    content = ""
    inferred_ext = infer_extensions(key, ext)
    if inferred_ext in get_content_index_extensions(bucket_name=bucket):
        def _get_obj():
            return retry_s3(
                "get",
                bucket,
                key,
                size,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id,
            )

        if inferred_ext == ".fcs":
            obj = _get_obj()
            body, info = extract_fcs(get_bytes(obj["Body"], compression), as_html=False)
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            content = trim_to_bytes(f"{body}\n{info}", get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".ipynb":
            content = trim_to_bytes(
                # we have no choice but to fetch the entire notebook, because we
                # are going to parse it
                # warning: huge notebooks could spike memory here
                get_notebook_cells(
                    bucket,
                    key,
                    size,
                    compression,
                    etag=etag,
                    s3_client=s3_client,
                    version_id=version_id
                ),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".parquet":
            if size >= get_available_memory():
                print(f"{bucket}/{key} too large to deserialize; skipping contents")
                # at least index the key and other stats, but don't overrun memory
                # and fail indexing altogether
                return ""
            obj = _get_obj()
            body, info = extract_parquet(
                get_bytes(obj["Body"], compression),
                as_html=False,
                skip_rows=(inferred_ext in SKIP_ROWS_EXTS),
                max_bytes=get_content_index_bytes(bucket_name=bucket),
            )
            # be smart and just send column names to ES (instead of bloated full schema)
            # if this is not an HTML/catalog preview
            columns = ','.join(list(info['schema']['names']))
            content = trim_to_bytes(f"{columns}\n{body}", get_content_index_bytes(bucket_name=bucket))
        elif inferred_ext == ".pdf":
            obj = _get_obj()
            content = trim_to_bytes(
                extract_pdf(get_bytes(obj["Body"], compression)),
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext in (".xls", ".xlsx"):
            obj = _get_obj()
            body, _ = extract_excel(get_bytes(obj["Body"], compression), as_html=False)
            content = trim_to_bytes(
                body,
                get_content_index_bytes(bucket_name=bucket),
            )
        elif inferred_ext == ".pptx":
            obj = _get_obj()
            content = extract_pptx(get_bytes(obj["Body"], compression), get_content_index_bytes(bucket_name=bucket))
        else:
            content = get_plain_text(
                bucket,
                key,
                size,
                compression,
                etag=etag,
                s3_client=s3_client,
                version_id=version_id
            )

    return content
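# NOTE: trim_to_bytes is a helper defined elsewhere in this package. A minimal sketch
# of the intended behavior -- cap a string at a UTF-8 byte budget without splitting a
# multi-byte character -- might look like this; it is an assumption, not the shipped
# implementation:
def trim_to_bytes_sketch(string: str, limit: int) -> str:
    """truncate string so that its UTF-8 encoding fits within limit bytes"""
    encoded = string.encode("utf-8")
    if len(encoded) <= limit:
        return string
    # errors="ignore" drops any trailing partial character created by the cut
    return encoded[:limit].decode("utf-8", errors="ignore")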