def get_image_bytes(
    submission_message: S3ImageSubmission,
    default_s3_bucket_image_prefix: str,
):
    """
    Takes a submission_message, identifies how best to get its bytes.

    Future work on re-using sessions for `requests` or any possible
    optimization must go here. Once we have more hashing lambdas that need
    access to media, this could be moved into its own module.
    """
    return S3BucketContentSource(
        submission_message.bucket, default_s3_bucket_image_prefix
    ).get_bytes(submission_message.content_id)
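# A minimal usage sketch (not from the source): the content id, bucket, key,
# and prefix below are hypothetical. S3ImageSubmission is constructed
# positionally as (content_id, bucket, key), mirroring its use elsewhere in
# this module.
example_submission = S3ImageSubmission(
    "example-content-id", "hma-test-media-bucket", "images/example-content-id.jpg"
)
image_bytes = get_image_bytes(
    example_submission, default_s3_bucket_image_prefix="images/"
)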
def get_preview_url(content_id, content_object) -> str:
    """
    Given a content_id and a content_object, returns a URL you can use to preview it.
    """
    content_object = t.cast(ContentObject, content_object)

    if content_object.content_ref_type == ContentRefType.DEFAULT_S3_BUCKET:
        source = S3BucketContentSource(image_bucket, image_prefix)
        preview_url = create_presigned_url(
            image_bucket, source.get_s3_key(content_id), None, 3600, "get_object"
        )
    elif content_object.content_ref_type == ContentRefType.URL:
        preview_url = content_object.content_ref

    return preview_url
def from_sqs_message(
    cls, d: dict, image_prefix: str
) -> "S3ImageSubmissionBatchMessage":
    result = []
    for s3_record in d["Records"]:
        bucket_name = s3_record["s3"]["bucket"]["name"]
        key = unquote_plus(s3_record["s3"]["object"]["key"])

        # Ignore folders and empty files.
        if s3_record["s3"]["object"]["size"] == 0:
            logger.info("Disregarding empty file or directory: %s", key)
            continue

        content_id = S3BucketContentSource.get_content_id_from_s3_key(
            key, image_prefix
        )
        result.append(S3ImageSubmission(content_id, bucket_name, key))

    return cls(image_submissions=result)
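# For reference, a hedged sketch of the S3 event notification dict this method
# parses. It follows the standard s3:ObjectCreated:* record layout; the bucket
# name, key, and prefix below are hypothetical.
example_s3_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "hma-test-media-bucket"},
                "object": {
                    # Keys arrive URL-encoded, hence the unquote_plus above.
                    "key": "images/example-content-id.jpg",
                    "size": 12345,
                },
            }
        }
    ]
}
# batch = S3ImageSubmissionBatchMessage.from_sqs_message(example_s3_event, image_prefix="images/")
# batch.image_submissions -> [S3ImageSubmission(content_id=..., bucket=..., key=...)]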
def get_submit_api(
    dynamodb_table: Table,
    image_bucket: str,
    image_prefix: str,
    submissions_queue_url: str,
    hash_queue_url: str,
) -> bottle.Bottle:
    """
    A closure that includes all dependencies that MUST be provided by the
    root API that this API plugs into. Declare dependencies here, but
    initialize in the root API alone.
    """

    # A prefix to all routes must be provided by the api_root app.
    # The documentation below expects the prefix to be '/submit/'.
    submit_api = bottle.Bottle()
    s3_bucket_image_source = S3BucketContentSource(image_bucket, image_prefix)

    def _content_exist_error(content_id: str):
        return bottle.abort(
            400,
            f"Content with id '{content_id}' already exists. If you want to resubmit, `force_resubmit=True` must be included in the payload.",
        )

    def _record_content_submission_from_request(
        request: SubmitRequestBodyBase,
    ) -> bool:
        """
        Given a submission request object, record the content object to the
        table passed to the API using 'record_content_submission'.

        Note: this method does not store the content media itself.
        """
        content_ref, content_ref_type = request.get_content_ref_details()

        return record_content_submission(
            dynamodb_table,
            content_id=request.content_id,
            content_type=request.content_type,
            content_ref=content_ref,
            content_ref_type=content_ref_type,
            additional_fields=set(request.additional_fields)
            if request.additional_fields
            else set(),
            force_resubmit=request.force_resubmit,
        )

    @submit_api.post("/url/", apply=[jsoninator(SubmitContentViaURLRequestBody)])
    def submit_url(
        request: SubmitContentViaURLRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submission via a URL to the content. This does not store a copy of
        the content in S3.
        """
        if not _record_content_submission_from_request(request):
            return _content_exist_error(request.content_id)

        send_submission_to_url_queue(
            dynamodb_table,
            submissions_queue_url,
            request.content_id,
            request.content_type,
            request.content_url,
        )

        return SubmitResponse(content_id=request.content_id, submit_successful=True)

    @submit_api.post("/bytes/", apply=[jsoninator(SubmitContentBytesRequestBody)])
    def submit_bytes(
        request: SubmitContentBytesRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submission of media to HMA via a direct transfer of bytes to the
        system's S3 bucket.
""" content_id = request.content_id file_contents = base64.b64decode(request.content_bytes) # We want to record the submission before triggering and processing on # the content itself therefore we write to dynamodb before s3 if not _record_content_submission_from_request(request): return _content_exist_error(request.content_id) s3_bucket_image_source.put_image_bytes(content_id, file_contents) return SubmitResponse(content_id=request.content_id, submit_successful=True) @submit_api.post( "/put-url/", apply=[jsoninator(SubmitContentViaPutURLUploadRequestBody)]) def submit_put_url( request: SubmitContentViaPutURLUploadRequestBody, ) -> t.Union[SubmitViaUploadUrlResponse, SubmitError]: """ Submission of content to HMA in two steps 1st the creation to a content record and put url based on request body 2nd Upload to the system's s3 bucket by said put url returned by this method """ presigned_url = create_presigned_put_url( bucket_name=image_bucket, key=s3_bucket_image_source.get_s3_key(request.content_id), file_type=request.file_type, ) if presigned_url: if not _record_content_submission_from_request(request): return _content_exist_error(request.content_id) return SubmitViaUploadUrlResponse( content_id=request.content_id, file_type=str(request.file_type), presigned_url=presigned_url, ) bottle.response.status = 400 return SubmitError( content_id=request.content_id, message="Failed to generate upload url", ) @submit_api.post("/hash/", apply=[jsoninator(SubmitContentHashRequestBody)]) def submit_hash( request: SubmitContentHashRequestBody, ) -> t.Union[SubmitResponse, SubmitError]: """ Submission of a hash from a piece of content. Functions the same as other submission endpoint but skips the hasher and media storage. """ # Record content object (even though we don't store anything just like with url) if not _record_content_submission_from_request(request): return _content_exist_error(request.content_id) # Record hash # ToDo expand submit hash API to include `signal_specific_attributes` hash_record = PipelineHashRecord( content_id=request.content_id, signal_type=t.cast(t.Type[SignalType], request.signal_type), content_hash=request.signal_value, updated_at=datetime.datetime.now(), ) hash_record.write_to_table(dynamodb_table) # Send hash directly to matcher # todo this could maybe try and reuse the methods in UnifiedHasher in #749 _get_sqs_client().send_message( QueueUrl=hash_queue_url, MessageBody=json.dumps(hash_record.to_sqs_message()), ) return SubmitResponse(content_id=request.content_id, submit_successful=True) return submit_api
def lambda_handler(event, context):
    """
    SQS events generated by the submissions API or by files being added to S3.
    Downloads files to temp storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB and sends a message on an output queue.

    Note that this brings the contents of a file into memory. This is subject
    to the resource limitation on the lambda. Potentially extendable up to
    10GB, but that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)

    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )

    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[
            t.Union[S3ImageSubmission, URLSubmissionMessage, BankSubmissionMessage]
        ] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX
                ).image_submissions
            )
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()
                )
            )
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                if isinstance(media, BankSubmissionMessage):
                    object_id = media.bank_id
                else:
                    object_id = media.content_id

                logger.warn(
                    f"Unprocessable content type: {media.content_type}, id: {object_id}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, it must
                        # be an S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket, IMAGE_PREFIX
                        ).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_ = URLContentSource().get_bytes(media.url)
                except Exception:
                    if isinstance(media, BankSubmissionMessage):
                        object_id = media.bank_id
                    else:
                        object_id = media.content_id

                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {object_id}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # Route signals to the bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # Don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
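# A hedged local-invocation sketch: the handler consumes S3 event
# notifications wrapped in SQS records, with each record's "body" holding the
# JSON-encoded notification (same shape as the example near from_sqs_message).
# The bucket and key are hypothetical.
import json

fake_sqs_event = {
    "Records": [
        {
            "body": json.dumps(
                {
                    "Records": [
                        {
                            "s3": {
                                "bucket": {"name": "hma-test-media-bucket"},
                                "object": {
                                    "key": "images/example-content-id.jpg",
                                    "size": 12345,
                                },
                            }
                        }
                    ]
                }
            )
        }
    ]
}
# lambda_handler(fake_sqs_event, context=None)  # local smoke test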
def lambda_handler(event, context):
    """
    SQS events generated by the submissions API or by files being added to S3.
    Downloads files to temp storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB and sends a message on an output queue.

    Note that this brings the contents of a file into memory. This is subject
    to the resource limitation on the lambda. Potentially extendable up to
    10GB, but that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    sqs_client = get_sqs_client()

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[
            t.Union[S3ImageSubmission, URLSubmissionMessage]
        ] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(URLSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX
                ).image_submissions
            )
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warn(f"Unprocessable content type: {media.content_type}")
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                if hasattr(media, "key") and hasattr(media, "bucket"):
                    # Classic duck-typing. If it has key and bucket, it must be
                    # an S3 submission.
                    bytes_: bytes = S3BucketContentSource(
                        media.bucket, IMAGE_PREFIX
                    ).get_bytes(media.content_id)
                else:
                    bytes_ = URLContentSource().get_bytes(media.url)

            for signal in hasher.get_hashes(
                media.content_id, media.content_type, bytes_
            ):
                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()