Example #1
    def submit_hash(
        request: SubmitContentHashRequestBody,
    ) -> t.Union[SubmitResponse, SubmitError]:
        """
        Submit a hash computed from a piece of content.
        Functions the same as the other submission endpoints, but skips
        the hasher and media storage.
        """

        # Record the content object (even though, as with URL submissions, we
        # don't store the media itself)
        if not _record_content_submission_from_request(request):
            return _content_exist_error(request.content_id)

        # Record hash
        #   TODO: Expand the submit-hash API to include `signal_specific_attributes`
        hash_record = PipelineHashRecord(
            content_id=request.content_id,
            signal_type=t.cast(t.Type[SignalType], request.signal_type),
            content_hash=request.signal_value,
            updated_at=datetime.datetime.now(),
        )
        hash_record.write_to_table(dynamodb_table)

        # Send hash directly to matcher
        # TODO: This could perhaps reuse the methods in UnifiedHasher (#749).
        _get_sqs_client().send_message(
            QueueUrl=hash_queue_url,
            MessageBody=json.dumps(hash_record.to_sqs_message()),
        )

        return SubmitResponse(content_id=request.content_id,
                              submit_successful=True)
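
For illustration, a call to this handler could look like the sketch below, assuming SubmitContentHashRequestBody accepts keyword arguments that mirror the attributes submit_hash reads from it (the hash value is a placeholder):

    request = SubmitContentHashRequestBody(
        content_id="images/2021/cat.jpg",  # hypothetical content id
        signal_type=PdqSignal,             # a SignalType subclass, as in Example #8
        signal_value="f" * 64,             # placeholder 256-bit PDQ hash in hex
    )
    response = submit_hash(request)
    assert isinstance(response, SubmitResponse) and response.submit_successful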
Example #2
    def write_hash_record(self, table: Table, hash_record: PipelineHashRecord):
        """
        Once a content signal has been created, write its corresponding hash
        record. These records can be used to do retroaction in case a new signal
        is obtained from sources.
        """
        with metrics.timer(metrics.names.hasher.write_record):
            hash_record.write_to_table(table)
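
The metrics.timer context manager used here and in the next example isn't part of these excerpts. A minimal sketch of such a timer, assuming a print-based sink (hmalib's real one presumably reports to a metrics backend such as CloudWatch), could look like:

    import time
    from contextlib import contextmanager

    @contextmanager
    def timer(metric_name):
        # Time the enclosed block and report the elapsed milliseconds on exit,
        # even if the block raises.
        start = time.perf_counter()
        try:
            yield
        finally:
            elapsed_ms = (time.perf_counter() - start) * 1000.0
            print(f"{metric_name}: {elapsed_ms:.1f}ms")  # stand-in for a real metrics sink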
Example #3
    def publish_hash_message(self, sqs_client: SQSClient,
                             hash_record: PipelineHashRecord):
        """
        Once you've written the hash record, publish a message to the matcher's
        input queue.
        """
        with metrics.timer(metrics.names.hasher.publish_message):
            sqs_client.send_message(
                QueueUrl=self.output_queue_url,
                MessageBody=json.dumps(hash_record.to_sqs_message()),
            )
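
Together with PipelineHashRecord.could_be and from_sqs_message (used by the matcher in Example #7), this publish forms a serialize/deserialize round trip. A sketch of that contract, reusing the names from these examples:

    wire_body = json.dumps(hash_record.to_sqs_message())  # what lands on the queue
    message = json.loads(wire_body)                       # what the consumer decodes
    assert PipelineHashRecord.could_be(message)
    restored = PipelineHashRecord.from_sqs_message(message)
    assert restored.content_id == hash_record.content_id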
Example #4
    def hashes() -> t.Optional[HashResultResponse]:
        """
        Return the hash details for a given ID.
        """
        content_id = bottle.request.query.content_id or None
        if not content_id:
            return None

        # FIXME: Presently, the hash API can only support one hash per content_id
        records = PipelineHashRecord.get_from_content_id(
            dynamodb_table, content_id)
        if not records:
            return None
        record = records[0]

        return HashResultResponse(
            content_id=record.content_id,
            content_hash=record.content_hash,
            updated_at=record.updated_at.isoformat(),
        )
Example #5
    def pipeline_progress() -> ContentPipelineProgress:
        """
        WARNING: UNOPTIMIZED. DO NOT CALL FROM AUTOMATED SYSTEMS.

        Build a history of the stages that this piece of content has gone
        through and what their results were. Do not call this from anything but
        a UI. This is not optimized for performance.
        """
        content_id = bottle.request.query.content_id or None

        if not content_id:
            return bottle.abort(400, "content_id must be provided.")
        content_id = t.cast(str, content_id)

        content_object = ContentObject.get_from_content_id(
            dynamodb_table, content_id)
        if not content_object:
            return bottle.abort(400,
                                f"Content with id '{content_id}' not found.")
        content_object = t.cast(ContentObject, content_object)

        preview_url = get_preview_url(content_id, content_object)

        # The result object will be gradually built up as records are retrieved.
        result = ContentPipelineProgress(
            content_id=content_id,
            content_type=content_object.content_type,
            content_preview_url=preview_url,
            submitted_at=content_object.updated_at,
            submission_additional_fields=list(
                content_object.additional_fields),
        )

        hash_records = PipelineHashRecord.get_from_content_id(
            dynamodb_table, content_id)
        if len(hash_records) != 0:
            result.hashed_at = max(hash_records,
                                   key=lambda r: r.updated_at).updated_at
            for hash_record in hash_records:
                # Assume that each signal type has a single hash
                if hash_record.signal_type.get_name() in result.hash_results:
                    return bottle.abort(
                        500,
                        f"Content with id '{content_id}' has multiple hash records for signal-type: '{hash_record.signal_type.get_name()}'.",
                    )

                result.hash_results[
                    hash_record.signal_type.get_name()
                ] = hash_record.content_hash

        match_records = MatchRecord.get_from_content_id(
            dynamodb_table, content_id)
        if len(match_records) != 0:
            result.matched_at = max(match_records,
                                    key=lambda r: r.updated_at).updated_at

            # TODO #751: Until we resolve type-agnostic storage of signal data,
            # we can't actually populate result.match_results with match
            # details.

        # TODO: ActionEvaluation does not yet leave a trail. Either record
        # action evaluation or remove the evaluation stage from the
        # pipeline-progress indicator.

        action_records = ActionEvent.get_from_content_id(
            dynamodb_table, content_id)
        if len(action_records) != 0:
            result.action_performed_at = max(
                action_records, key=lambda r: r.performed_at).performed_at
            result.action_perform_results = [
                r.action_label for r in action_records
            ]

        return result
Example #6
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory, so it is subject
    to the lambda's resource limits. Memory is potentially extendable up to
    10GB, but that would be very expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage,
                                         BankSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                if isinstance(media, BankSubmissionMessage):
                    object_id = media.bank_id
                else:
                    object_id = media.content_id
                logger.warning(
                    f"Unprocessable content type: {media.content_type}, id: {object_id}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, must be an
                        # S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket,
                            IMAGE_PREFIX).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_: bytes = URLContentSource().get_bytes(media.url)
                except Exception:
                    if isinstance(media, BankSubmissionMessage):
                        object_id = media.bank_id
                    else:
                        object_id = media.content_id
                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {object_id}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
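
The message routing above relies on each submission class exposing a could_be classmethod that cheaply inspects the raw payload before attempting deserialization. Those implementations aren't shown in these excerpts; a minimal sketch of the contract, with assumed field names, might be:

    @classmethod
    def could_be(cls, message: dict) -> bool:
        # Sketch only: report whether the payload carries the keys this class
        # knows how to deserialize. "ContentId" and "URL" are assumed field names.
        return {"ContentId", "URL"} <= message.keys()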
Example #7
def lambda_handler(event, context):
    """
    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    When matched, publishes a notification to an SNS endpoint. Note that this
    is in contrast with the hasher and indexer, which publish to SQS directly.
    Publishing to SQS implies there can be only one consumer.

    Because the matcher publishes to SNS instead, we can plug multiple queues
    behind it and profit!
    """
    table = get_dynamodb().Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "TestEvent":
            logger.debug("Disregarding Test Event")
            continue

        if not PipelineHashRecord.could_be(message):
            logger.warning(
                "Could not de-serialize message in matcher lambda. Message was %s",
                message,
            )
            continue

        hash_record = PipelineHashRecord.from_sqs_message(message)
        logger.info(
            "HashRecord for contentId: %s with contentHash: %s",
            hash_record.content_id,
            hash_record.content_hash,
        )

        matches = matcher.match(hash_record.signal_type, hash_record.content_hash)
        logger.info("Found %d matches.", len(matches))

        for match in matches:
            matcher.write_match_record_for_result(
                table=table,
                signal_type=hash_record.signal_type,
                content_hash=hash_record.content_hash,
                content_id=hash_record.content_id,
                match=match,
            )

        for match in matches:
            matcher.write_signal_if_not_found(
                table=table, signal_type=hash_record.signal_type, match=match
            )

        if len(matches) != 0:
            # Publish all messages together
            matcher.publish_match_message(
                content_id=hash_record.content_id,
                content_hash=hash_record.content_hash,
                matches=matches,
                sns_client=get_sns_client(),
                topic_arn=MATCHES_TOPIC_ARN,
            )

    metrics.flush()
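
matcher.publish_match_message itself isn't shown in these excerpts. The SNS fan-out described in the docstring comes down to a single sns_client.publish call; a sketch under assumed message fields:

    def publish_match_message(content_id, content_hash, matches, sns_client, topic_arn):
        # Sketch only: one publish to the topic; every SQS queue subscribed to
        # it receives its own copy of the message. Field names are assumptions.
        sns_client.publish(
            TopicArn=topic_arn,
            Message=json.dumps({
                "ContentId": content_id,
                "ContentHash": content_hash,
                "MatchCount": len(matches),
            }),
        )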
Example #8
def lambda_handler(event, context):
    """
    Listens to SQS events generated when new files are added to S3. Downloads
    files to temp-storage and generates a PDQ hash and quality score from the
    file.

    The SQS events come from S3; URL-only submissions are routed to
    hmalib.lambdas.hashing instead.

    Saves hash output to DynamoDB and sends a message on an output queue.

    Note: The image is brought into memory and then handed over to the hasher.
    If you are hashing large images, you may need to increase the memory
    allocated to the lambda. Also remember that images that look small on disk
    (e.g. low-quality JPEGs) still occupy a lot of space in memory. The
    pixel-size of the image is a better indicator of the space it will take in
    memory.
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    for sqs_record in event["Records"]:
        message_body = json.loads(sqs_record["body"])
        message = json.loads(message_body["Message"])

        if message.get("Event") == "s3:TestEvent":
            logger.info("Disregarding S3 Test Event")
            continue

        images_to_process: t.List[S3ImageSubmission] = []

        if S3ImageSubmissionBatchMessage.could_be(message):
            images_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_FOLDER_KEY).image_submissions)
        else:
            logger.warn("PDQ Hahser could not process incoming message %s",
                        repr(message))

        for image in images_to_process:
            logger.info("Getting bytes for submission:  %s", repr(image))
            with metrics.timer(metrics.names.pdq_hasher_lambda.download_file):
                bytes_: bytes = get_image_bytes(image, IMAGE_FOLDER_KEY)

            logger.info("Generating PDQ hash for submission: %s", repr(image))

            with metrics.timer(metrics.names.pdq_hasher_lambda.hash):
                pdq_hash, quality = pdq_hasher.pdq_from_bytes(bytes_)

            hash_record = PipelineHashRecord(
                image.content_id,
                PdqSignal,
                pdq_hash,
                datetime.datetime.now(),
                {"Quality": quality},
            )

            hash_record.write_to_table(records_table)

            # Publish to SQS queue
            sqs_client.send_message(
                QueueUrl=OUTPUT_QUEUE_URL,
                MessageBody=json.dumps(hash_record.to_legacy_sqs_message()),
            )

            logger.info("Published new PDQ hash")

    metrics.flush()
Example #9
def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.

    Note that this brings the contents of a file into memory, so it is subject
    to the lambda's resource limits. Memory is potentially extendable up to
    10GB, but that would be very expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    sqs_client = get_sqs_client()

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(message))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        else:
            logger.warn(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warning(
                    f"Unprocessable content type: {media.content_type}")
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                if hasattr(media, "key") and hasattr(media, "bucket"):
                    # Classic duck-typing. If it has key and bucket, must be an
                    # S3 submission.
                    bytes_: bytes = S3BucketContentSource(
                        media.bucket, IMAGE_PREFIX).get_bytes(media.content_id)
                else:
                    bytes_: bytes = URLContentSource().get_bytes(media.url)

            for signal in hasher.get_hashes(media.content_id,
                                            media.content_type, bytes_):
                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()