Exemplo n.º 1
0
    def publish_match_message(
        self,
        content_id: str,
        content_hash: str,
        matches: t.List[IndexMatch],
        sns_client: SNSClient,
        topic_arn: str,
    ):
        """
        Creates banked signal objects and publishes one message for a list of
        matches to SNS.
        """
        banked_signals = []

        for match in matches:
            for privacy_group_id in match.metadata.get("privacy_groups", []):
                banked_signal = BankedSignal(
                    str(match.metadata["id"]),
                    str(privacy_group_id),
                    str(match.metadata["source"]),
                )

                for tag in match.metadata["tags"].get(privacy_group_id, []):
                    banked_signal.add_classification(tag)

                banked_signals.append(banked_signal)

        match_message = MatchMessage(
            content_key=content_id,
            content_hash=content_hash,
            matching_banked_signals=banked_signals,
        )

        sns_client.publish(TopicArn=topic_arn,
                           Message=match_message.to_aws_json())
Exemplo n.º 2
0
    def publish_match_message(
        self,
        content_id: str,
        content_hash: str,
        matches: t.List[IndexMatch[t.List[BaseIndexMetadata]]],
        sns_client: SNSClient,
        topic_arn: str,
    ):
        """
        Creates banked signal objects and publishes one message for a list of
        matches to SNS.
        """
        banked_signals = []

        for match in matches:
            for metadata_obj in match.metadata:
                if metadata_obj.get_source(
                ) == THREAT_EXCHANGE_SOURCE_SHORT_CODE:
                    metadata_obj = t.cast(ThreatExchangeIndicatorIndexMetadata,
                                          metadata_obj)
                    banked_signal = BankedSignal(
                        str(metadata_obj.indicator_id),
                        str(metadata_obj.privacy_group),
                        str(metadata_obj.get_source()),
                    )
                    for tag in metadata_obj.tags:
                        banked_signal.add_classification(tag)

                    banked_signals.append(banked_signal)
                elif metadata_obj.get_source() == BANKS_SOURCE_SHORT_CODE:
                    metadata_obj = t.cast(BankedSignalIndexMetadata,
                                          metadata_obj)
                    bank_member = self.banks_table.get_bank_member(
                        bank_member_id=metadata_obj.bank_member_id)

                    banked_signal = BankedSignal(
                        metadata_obj.bank_member_id,
                        bank_member.bank_id,
                        metadata_obj.get_source(),
                    )

                    # TODO: This would do good with caching.
                    bank = self.banks_table.get_bank(
                        bank_id=bank_member.bank_id)
                    for tag in set.union(bank_member.bank_member_tags,
                                         bank.bank_tags):
                        banked_signal.add_classification(tag)

                    banked_signals.append(banked_signal)

        match_message = MatchMessage(
            content_key=content_id,
            content_hash=content_hash,
            matching_banked_signals=banked_signals,
        )

        sns_client.publish(TopicArn=topic_arn,
                           Message=match_message.to_aws_json())
Exemplo n.º 3
0
    def get_example_action_event():
        enqueue_mini_castle_for_review_action_label = ActionLabel(
            TestContentModels.TEST_ACTION_LABEL)
        action_rules = [
            ActionRule(
                name="Enqueue Mini-Castle for Review",
                action_label=enqueue_mini_castle_for_review_action_label,
                must_have_labels=set([
                    ClassificationLabel("true_positive"),
                ]),
                must_not_have_labels=set(),
            ),
        ]

        banked_signal = BankedSignal(
            banked_content_id="4169895076385542",
            bank_id="303636684709969",
            bank_source="te",
        )
        banked_signal.add_classification("true_positive")

        action_performer = WebhookPostActionPerformer(
            name="EnqueueForReview",
            url="https://webhook.site/ff7ebc37-514a-439e-9a03-46f86989e195",
            headers='{"Connection":"keep-alive"}',
            # monitoring page:
            # https://webhook.site/#!/ff7ebc37-514a-439e-9a03-46f86989e195
        )

        action_message = ActionMessage(
            content_key=TestContentModels.TEST_CONTENT_ID,
            content_hash=
            "361da9e6cf1b72f5cea0344e5bb6e70939f4c70328ace762529cac704297354a",
            matching_banked_signals=[banked_signal],
            action_label=enqueue_mini_castle_for_review_action_label,
            action_rules=action_rules,
        )

        return ActionEvent(
            content_id=action_message.content_key,
            performed_at=TestContentModels.TEST_TIME,
            action_label=action_message.action_label.value,
            action_performer=action_performer.to_aws_json(),
            action_rules=[
                rule.to_aws_json() for rule in action_message.action_rules
            ],
        )  # .write_to_table(table)
Exemplo n.º 4
0
    def publish_match_message(
        self,
        content_id: str,
        content_hash: str,
        matches: t.List[IndexMatch],
        sns_client: SNSClient,
        topic_arn: str,
    ):
        """
        Creates banked signal objects and publishes one message for a list of
        matches to SNS.
        """
        banked_signals = []

        for match in matches:
            for metadata_obj in match.metadata:
                if metadata_obj.get_source(
                ) == THREAT_EXCHANGE_SOURCE_SHORT_CODE:
                    banked_signal = BankedSignal(
                        str(metadata_obj.indicator_id),
                        str(metadata_obj.privacy_group),
                        str(metadata_obj.get_source()),
                    )
                    for tag in metadata_obj.tags:
                        banked_signal.add_classification(tag)

                    banked_signals.append(banked_signal)
                elif metadata_obj.get_source() == BANKS_SOURCE_SHORT_CODE:
                    banked_signal = BankedSignal(
                        metadata_obj.signal_id,
                        metadata_obj.bank_member_id,
                        metadata_obj.get_source(),
                    )

                    banked_signals.append(banked_signal)

        match_message = MatchMessage(
            content_key=content_id,
            content_hash=content_hash,
            matching_banked_signals=banked_signals,
        )

        sns_client.publish(TopicArn=topic_arn,
                           Message=match_message.to_aws_json())
    def test_action_message_serialization_and_deserialization(self):
        enqueue_mini_castle_for_review_action_label = ActionLabel(
            "EnqueueMiniCastleForReview")

        action_rules = [
            ActionRule(
                name="Enqueue Mini-Castle for Review",
                action_label=enqueue_mini_castle_for_review_action_label,
                must_have_labels=set([
                    BankIDClassificationLabel("303636684709969"),
                    ClassificationLabel("true_positive"),
                ]),
                must_not_have_labels=set(
                    [BankedContentIDClassificationLabel("3364504410306721")]),
            ),
        ]

        banked_signal = BankedSignal(
            banked_content_id="4169895076385542",
            bank_id="303636684709969",
            bank_source="te",
        )
        banked_signal.add_classification("true_positive")

        action_message = ActionMessage(
            content_key="images/mini-castle.jpg",
            content_hash=
            "361da9e6cf1b72f5cea0344e5bb6e70939f4c70328ace762529cac704297354a",
            matching_banked_signals=[banked_signal],
            action_label=enqueue_mini_castle_for_review_action_label,
            action_rules=action_rules,
        )

        action_message_aws_json = action_message.to_aws_json()

        action_message_2 = ActionMessage.from_aws_json(action_message_aws_json)

        self.assertEqual(action_message_2.action_label,
                         enqueue_mini_castle_for_review_action_label)
Exemplo n.º 6
0
    def test_get_action_labels(self):

        enqueue_for_review_action_label = ActionLabel("EnqueueForReview")
        bank_id = "12345"

        banked_signal_without_foo = BankedSignal("67890", bank_id, "Test")
        banked_signal_without_foo.add_classification("Bar")
        banked_signal_without_foo.add_classification("Xyz")

        banked_signal_with_foo = BankedSignal("67890", bank_id, "Test")
        banked_signal_with_foo.add_classification("Foo")
        banked_signal_with_foo.add_classification("Bar")
        banked_signal_with_foo.add_classification("Xyz")

        match_message_without_foo = MatchMessage("key", "hash",
                                                 [banked_signal_without_foo])
        match_message_with_foo = MatchMessage("key", "hash",
                                              [banked_signal_with_foo])

        action_rules = [
            ActionRule(
                enqueue_for_review_action_label.value,
                enqueue_for_review_action_label,
                set([BankIDClassificationLabel(bank_id)]),
                set([ClassificationLabel("Foo")]),
            )
        ]

        action_label_to_action_rules: t.Dict[
            ActionLabel, t.List[ActionRule]] = get_actions_to_take(
                match_message_without_foo, action_rules, set())

        assert len(action_label_to_action_rules) == 1
        self.assertIn(
            enqueue_for_review_action_label,
            action_label_to_action_rules,
            "enqueue_for_review_action_label should be in action_label_to_action_rules",
        )

        action_label_to_action_rules = get_actions_to_take(
            match_message_with_foo, action_rules, set())

        assert len(action_label_to_action_rules) == 0

        enqueue_mini_castle_for_review_action_label = ActionLabel(
            "EnqueueMiniCastleForReview")
        enqueue_sailboat_for_review_action_label = ActionLabel(
            "EnqueueSailboatForReview")

        action_rules = [
            ActionRule(
                name="Enqueue Mini-Castle for Review",
                action_label=enqueue_mini_castle_for_review_action_label,
                must_have_labels=set([
                    BankIDClassificationLabel("303636684709969"),
                    ClassificationLabel("true_positive"),
                ]),
                must_not_have_labels=set(
                    [BankedContentIDClassificationLabel("3364504410306721")]),
            ),
            ActionRule(
                name="Enqueue Sailboat for Review",
                action_label=enqueue_sailboat_for_review_action_label,
                must_have_labels=set([
                    BankIDClassificationLabel("303636684709969"),
                    ClassificationLabel("true_positive"),
                    BankedContentIDClassificationLabel("3364504410306721"),
                ]),
                must_not_have_labels=set(),
            ),
        ]

        mini_castle_banked_signal = BankedSignal(
            banked_content_id="4169895076385542",
            bank_id="303636684709969",
            bank_source="te",
        )
        mini_castle_banked_signal.add_classification("true_positive")

        mini_castle_match_message = MatchMessage(
            content_key="images/mini-castle.jpg",
            content_hash=
            "361da9e6cf1b72f5cea0344e5bb6e70939f4c70328ace762529cac704297354a",
            matching_banked_signals=[mini_castle_banked_signal],
        )

        sailboat_banked_signal = BankedSignal(
            banked_content_id="3364504410306721",
            bank_id="303636684709969",
            bank_source="te",
        )
        sailboat_banked_signal.add_classification("true_positive")

        sailboat_match_message = MatchMessage(
            content_key="images/sailboat-mast-and-sun.jpg",
            content_hash=
            "388ff5e1084efef10096df9cb969296dff2b04d67a94065ecd292129ef6b1090",
            matching_banked_signals=[sailboat_banked_signal],
        )

        action_label_to_action_rules = get_actions_to_take(
            mini_castle_match_message, action_rules, set())

        assert len(action_label_to_action_rules) == 1
        self.assertIn(
            enqueue_mini_castle_for_review_action_label,
            action_label_to_action_rules,
            "enqueue_mini_castle_for_review_action_label should be in action_label_to_action_rules",
        )

        action_label_to_action_rules = get_actions_to_take(
            sailboat_match_message, action_rules, set())

        assert len(action_label_to_action_rules) == 1
        self.assertIn(
            enqueue_sailboat_for_review_action_label,
            action_label_to_action_rules,
            "enqueue_sailboat_for_review_action_label should be in action_label_to_action_rules",
        )
Exemplo n.º 7
0
def lambda_handler(event, context):
    """
    TODO/FIXME migrate this lambda to be a part of matcher.py

    Listens to SQS events fired when new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be S3BackedPDQIndex._get_index_s3_key()

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with hasher and indexer. They publish to SQS directly. Publishing
    to SQS implies there can be only one consumer.

    Because, here, in the matcher, we publish to SNS, we can plug multiple
    queues behind it and profit!
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            matching_banked_signals: t.List[BankedSignal] = []
            for match in results:
                metadata = match.metadata
                logger.info(
                    "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
                )
                privacy_group_list = metadata.get("privacy_groups", [])
                metadata["privacy_groups"] = list(
                    filter(
                        lambda x: get_privacy_group_matcher_active(
                            str(x),
                            time.time() // CACHED_TIME,
                            # CACHED_TIME default to 300 seconds, this will convert time.time() to an int parameter which changes every 300 seconds
                        ),
                        privacy_group_list,
                    )
                )
                if metadata["privacy_groups"]:
                    signal_id = str(metadata["id"])

                    with metrics.timer(
                        metrics.names.pdq_matcher_lambda.write_match_record
                    ):
                        # TODO: Add source (threatexchange) tags to match record
                        MatchRecord(
                            key,
                            PdqSignal,
                            hash_str,
                            current_datetime,
                            signal_id,
                            metadata["source"],
                            metadata["hash"],
                        ).write_to_table(records_table)

                    for pg in metadata.get("privacy_groups", []):
                        # Only update the metadata if it is not found in the table
                        # once intally created it is the fetcher's job to keep the item up to date
                        PDQSignalMetadata(
                            signal_id,
                            pg,
                            current_datetime,
                            metadata["source"],
                            metadata["hash"],
                            metadata["tags"].get(pg, []),
                        ).write_to_table_if_not_found(records_table)

                    match_ids.append(signal_id)

                    # TODO: change naming upstream and here from privacy_group[s]
                    # to dataset[s]
                    for privacy_group in metadata.get("privacy_groups", []):
                        banked_signal = BankedSignal(
                            str(signal_id), str(privacy_group), str(metadata["source"])
                        )
                        for tag in metadata["tags"].get(privacy_group, []):
                            banked_signal.add_classification(tag)
                        matching_banked_signals.append(banked_signal)

            # TODO: Add source (threatexchange) tags to match message
            if matching_banked_signals:
                match_message = MatchMessage(
                    content_key=key,
                    content_hash=hash_str,
                    matching_banked_signals=matching_banked_signals,
                )

                logger.info(f"Publishing match_message: {match_message}")

                # Publish one message for the set of matches.
                sns_client.publish(
                    TopicArn=OUTPUT_TOPIC_ARN, Message=match_message.to_aws_json()
                )

        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()