Example #1
    def matches() -> MatchSummariesResponse:
        """
        Return all matches, or a filtered list, based on query params.
        """
        signal_q = bottle.request.query.signal_q or None  # type: ignore # ToDo refactor to use `jsoninator(<requestObj>, from_query=True)`
        signal_source = bottle.request.query.signal_source or None  # type: ignore # ToDo refactor to use `jsoninator(<requestObj>, from_query=True)`
        content_q = bottle.request.query.content_q or None  # type: ignore # ToDo refactor to use `jsoninator(<requestObj>, from_query=True)`

        if content_q:
            records = MatchRecord.get_from_content_id(datastore_table,
                                                      content_q,
                                                      signal_type_mapping)
        elif signal_q:
            records = MatchRecord.get_from_signal(datastore_table, signal_q,
                                                  signal_source or "",
                                                  signal_type_mapping)
        else:
            # TODO: Support pagination after implementing in UI.
            records = MatchRecord.get_recent_items_page(
                datastore_table, signal_type_mapping).items

        return MatchSummariesResponse(match_summaries=[
            MatchSummary(
                content_id=record.content_id,
                signal_id=record.signal_id,
                signal_source=record.signal_source,
                updated_at=record.updated_at.isoformat(),
            ) for record in records
        ])
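
The ToDo comments above reference `jsoninator(<requestObj>, from_query=True)`. A minimal sketch of what that refactor might look like, assuming jsoninator can hydrate a request dataclass from query parameters; the MatchesRequest dataclass and its field names are hypothetical:

    from dataclasses import dataclass
    import typing as t

    @dataclass
    class MatchesRequest:
        # Hypothetical request object; fields mirror the query params above.
        signal_q: t.Optional[str] = None
        signal_source: t.Optional[str] = None
        content_q: t.Optional[str] = None

    def matches() -> MatchSummariesResponse:
        # Assumed behavior: populate the dataclass from bottle.request.query
        # instead of the request body.
        request = jsoninator(MatchesRequest, from_query=True)
        ...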
Example #2
    def matches() -> MatchSummariesResponse:
        """
        Return all matches, or a filtered list, based on query params.
        """
        signal_q = bottle.request.query.signal_q or None
        signal_source = bottle.request.query.signal_source or None
        content_q = bottle.request.query.content_q or None

        if content_q:
            records = MatchRecord.get_from_content_id(dynamodb_table,
                                                      content_q)
        elif signal_q:
            records = MatchRecord.get_from_signal(dynamodb_table, signal_q,
                                                  signal_source or "")
        else:
            # TODO: Support pagination after implementing in UI.
            records = MatchRecord.get_recent_items_page(dynamodb_table).items

        return MatchSummariesResponse(match_summaries=[
            MatchSummary(
                content_id=record.content_id,
                signal_id=record.signal_id,
                signal_source=record.signal_source,
                updated_at=record.updated_at.isoformat(),
            ) for record in records
        ])
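
For context, a minimal sketch of how a handler like this is typically registered and queried in bottle; the app object and route path are assumptions, not taken from the source:

    import bottle

    app = bottle.Bottle()
    app.get("/matches")(matches)  # hypothetical path

    # Clients then filter via query params, e.g.:
    #   GET /matches?content_q=images/12345.jpg
    #   GET /matches?signal_q=5555&signal_source=some-source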
Example #3
    def write_match_record_for_result(
        self,
        table: Table,
        signal_type: t.Type[SignalType],
        content_hash: str,
        content_id: str,
        match: IndexMatch[BaseIndexMetadata],
    ):
        """
        Write a match record to DynamoDB. The matcher itself does not use
        content_id, so the calling lambda is expected to pass it through for
        match record writes.
        """
        # Write one record for TE and one for banks. This logic could be
        # simplified by filtering metadata objects by source instead of
        # iterating with flags; see the sketch after this example.
        bank_record_written = False
        te_record_written = False

        for metadata_obj in match.metadata:
            match_record_attributes = {
                "content_id": content_id,
                "signal_type": signal_type,
                "content_hash": content_hash,
                "updated_at": datetime.datetime.now(),
                "signal_source": metadata_obj.get_source(),
                "match_distance": int(match.distance),
            }

            if (metadata_obj.get_source() == THREAT_EXCHANGE_SOURCE_SHORT_CODE
                    and not te_record_written):
                match_record_attributes.update(
                    signal_id=metadata_obj.indicator_id,
                    signal_hash=metadata_obj.signal_value,
                )
                te_record_written = True

            elif (metadata_obj.get_source() == BANKS_SOURCE_SHORT_CODE
                  and not bank_record_written):
                match_record_attributes.update(
                    signal_id=metadata_obj.signal_id,
                    signal_hash=metadata_obj.signal_value,
                )
                bank_record_written = True

            MatchRecord(**match_record_attributes).write_to_table(table)
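
The comment above suggests replacing the flag bookkeeping with a filter. A minimal sketch of that simplification, keeping only the first metadata object seen per source; the helper is illustrative, not from the source:

    def _first_per_source(
        metadata_objs: t.Iterable[BaseIndexMetadata],
    ) -> t.Iterator[BaseIndexMetadata]:
        # Yield only the first metadata object seen for each source.
        seen: t.Set[str] = set()
        for obj in metadata_objs:
            if obj.get_source() not in seen:
                seen.add(obj.get_source())
                yield obj

    # The write loop can then drop the *_record_written flags:
    #   for metadata_obj in _first_per_source(match.metadata):
    #       ...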
Example #4
def get_match_details(table: Table, content_id: str) -> t.List[MatchDetail]:
    if not content_id:
        return []

    records = MatchRecord.get_from_content_id(table, content_id)

    return [
        MatchDetail(
            content_id=record.content_id,
            content_hash=record.content_hash,
            signal_id=record.signal_id,
            signal_hash=record.signal_hash,
            signal_source=record.signal_source,
            signal_type=record.signal_type.get_name(),
            updated_at=record.updated_at.isoformat(),
            metadata=get_signal_details(table, record.signal_id,
                                        record.signal_source),
        ) for record in records
    ]
Example #5
    def write_match_record_for_result(
        self,
        table: Table,
        signal_type: t.Type[SignalType],
        signal_value: str,
        content_id: str,
        match: IndexMatch,
    ):
        """
        Write a match record to DynamoDB. The matcher itself does not use
        content_id, so the calling lambda is expected to pass it through for
        match record writes.
        """
        MatchRecord(
            content_id,
            signal_type,
            signal_value,
            datetime.datetime.now(),
            str(match.metadata["id"]),
            match.metadata["source"],
            match.metadata["hash"],
        ).write_to_table(table)
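
For reference, a sketch of the metadata mapping this older variant expects; the keys are inferred from the lookups above, and the values are illustrative:

    metadata = {
        "id": 123456,     # indicator id in the upstream dataset
        "source": "...",  # short code for the signal source
        "hash": "...",    # the banked signal value that matched
    }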
Example #6
    def write_match_record_for_result(
        self,
        table: Table,
        signal_type: t.Type[SignalType],
        content_hash: str,
        content_id: str,
        match: IndexMatch[t.List[BaseIndexMetadata]],
    ):
        """
        Write a match record to DynamoDB. The matcher itself does not use
        content_id, so the calling lambda is expected to pass it through for
        match record writes.
        """
        for metadata_obj in match.metadata:
            match_record_attributes = {
                "content_id": content_id,
                "signal_type": signal_type,
                "content_hash": content_hash,
                "updated_at": datetime.datetime.now(),
                "signal_source": metadata_obj.get_source(),
                "match_distance": int(match.distance),
            }

            if metadata_obj.get_source() == THREAT_EXCHANGE_SOURCE_SHORT_CODE:
                metadata_obj = t.cast(ThreatExchangeIndicatorIndexMetadata,
                                      metadata_obj)
                match_record_attributes.update(
                    signal_id=metadata_obj.indicator_id,
                    signal_hash=metadata_obj.signal_value,
                )

            elif metadata_obj.get_source() == BANKS_SOURCE_SHORT_CODE:
                metadata_obj = t.cast(BankedSignalIndexMetadata, metadata_obj)
                match_record_attributes.update(
                    signal_id=metadata_obj.signal_id,
                    signal_hash=metadata_obj.signal_value,
                )

            MatchRecord(**match_record_attributes).write_to_table(table)
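
Note that `t.cast` is a typing-only hint with no runtime check. A minimal sketch of an isinstance-based variant that would fail loudly on an unexpected metadata class; this is an alternative, not how the source handles it:

    if isinstance(metadata_obj, ThreatExchangeIndicatorIndexMetadata):
        match_record_attributes.update(
            signal_id=metadata_obj.indicator_id,
            signal_hash=metadata_obj.signal_value,
        )
    elif isinstance(metadata_obj, BankedSignalIndexMetadata):
        match_record_attributes.update(
            signal_id=metadata_obj.signal_id,
            signal_hash=metadata_obj.signal_value,
        )
    else:
        raise ValueError(f"Unexpected metadata type: {type(metadata_obj)!r}")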
Example #7
def get_match_details(
    datastore_table: Table,
    banks_table: BanksTable,
    content_id: str,
    signal_type_mapping: HMASignalTypeMapping,
) -> t.List[MatchDetail]:
    if not content_id:
        return []

    records = MatchRecord.get_from_content_id(datastore_table, content_id,
                                              signal_type_mapping)

    return [
        MatchDetail(
            content_id=record.content_id,
            content_hash=record.content_hash,
            signal_id=record.signal_id,
            signal_hash=record.signal_hash,
            signal_source=record.signal_source,
            signal_type=record.signal_type.get_name(),
            updated_at=record.updated_at.isoformat(),
            match_distance=int(record.match_distance)
            if record.match_distance is not None else None,
            te_signal_details=get_te_signal_details(
                datastore_table=datastore_table,
                signal_id=record.signal_id,
                signal_source=record.signal_source,
                signal_type_mapping=signal_type_mapping,
            ),
            banked_signal_details=get_banked_signal_details(
                banks_table=banks_table,
                signal_id=record.signal_id,
                signal_source=record.signal_source,
            ),
        ) for record in records
    ]
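
A minimal usage sketch; the table handles, content id, and mapping below are placeholders:

    details = get_match_details(
        datastore_table=datastore_table,
        banks_table=banks_table,
        content_id="images/12345.jpg",
        signal_type_mapping=signal_type_mapping,
    )
    for detail in details:
        print(detail.signal_source, detail.signal_id, detail.match_distance)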
Example #8
    def pipeline_progress() -> ContentPipelineProgress:
        """
        WARNING: UNOPTIMIZED. DO NOT CALL FROM AUTOMATED SYSTEMS.

        Build a history of the stages that this piece of content has gone
        through and what their results were. Do not call this from anything but
        a UI. This is not optimized for performance.
        """
        content_id = bottle.request.query.content_id or None

        if not content_id:
            return bottle.abort(400, "content_id must be provided.")
        content_id = t.cast(str, content_id)

        content_object = ContentObject.get_from_content_id(
            dynamodb_table, content_id)
        if not content_object:
            return bottle.abort(400,
                                f"Content with id '{content_id}' not found.")
        content_object = t.cast(ContentObject, content_object)

        preview_url = get_preview_url(content_id, content_object)

        # The result object will be gradually built up as records are retrieved.
        result = ContentPipelineProgress(
            content_id=content_id,
            content_type=content_object.content_type,
            content_preview_url=preview_url,
            submitted_at=content_object.updated_at,
            submission_additional_fields=list(
                content_object.additional_fields),
        )

        hash_records = PipelineHashRecord.get_from_content_id(
            dynamodb_table, content_id)
        if len(hash_records) != 0:
            result.hashed_at = max(hash_records,
                                   key=lambda r: r.updated_at).updated_at
            for hash_record in hash_records:
                # Assume that each signal type has a single hash
                if hash_record.signal_type.get_name() in result.hash_results:
                    return bottle.abort(
                        500,
                        f"Content with id '{content_id}' has multiple hash records for signal-type: '{hash_record.signal_type.get_name()}'.",
                    )

                result.hash_results[
                    hash_record.signal_type.get_name()
                ] = hash_record.content_hash

        match_records = MatchRecord.get_from_content_id(
            dynamodb_table, content_id)
        if len(match_records) != 0:
            result.matched_at = max(match_records,
                                    key=lambda r: r.updated_at).updated_at

            # TODO #751: Until we resolve type-agnostic storage of signal data,
            # we can't actually populate result.match_results.

        # TODO: ActionEvaluation does not yet leave a trail. Either record
        # action evaluation or remove the evaluation stage from the
        # pipeline-progress indicator.

        action_records = ActionEvent.get_from_content_id(
            dynamodb_table, content_id)
        if len(action_records) != 0:
            result.action_performed_at = max(
                action_records, key=lambda r: r.performed_at).performed_at
            result.action_perform_results = [
                r.action_label for r in action_records
            ]

        return result
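
For illustration, a hypothetical shape of a fully populated progress object, assuming field-for-field JSON serialization of the dataclass; every value below is made up:

    {
        "content_id": "images/12345.jpg",
        "content_type": "photo",
        "content_preview_url": "https://example.com/preview/12345",
        "submitted_at": "2022-03-01T10:00:00",
        "submission_additional_fields": [],
        "hashed_at": "2022-03-01T10:00:05",
        "hash_results": {"pdq": "acf2b9..."},
        "matched_at": "2022-03-01T10:00:09",
        "action_performed_at": "2022-03-01T10:00:12",
        "action_perform_results": ["EnqueueForReview"]
    }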
Example #9
def lambda_handler(event, context):
    """
    TODO/FIXME migrate this lambda to be a part of matcher.py

    Listens to SQS events fired when a new hash is generated. Loads the index
    stored in an S3 bucket and looks for a match.

    As per the default configuration
    - the index data bucket is INDEXES_BUCKET_NAME
    - the key name must be S3BackedPDQIndex._get_index_s3_key()

    When matched, publishes a notification to an SNS endpoint. Note this is in
    contrast with the hasher and indexer, which publish to SQS directly.
    Publishing to SQS implies there can be only one consumer; because the
    matcher publishes to SNS instead, multiple queues can be plugged in behind
    it (see the sketch after this example).
    """
    records_table = dynamodb.Table(DYNAMODB_TABLE)

    hash_index: PDQIndex = get_index(INDEXES_BUCKET_NAME)
    logger.info("loaded_hash_index")

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])
        if message.get("Event") == "TestEvent":
            logger.info("Disregarding Test Event")
            continue

        hash_str = message["hash"]
        key = message["key"]
        current_datetime = datetime.datetime.now()

        with metrics.timer(metrics.names.pdq_matcher_lambda.search_index):
            results = hash_index.query(hash_str)

        if results:
            match_ids = []
            matching_banked_signals: t.List[BankedSignal] = []
            for match in results:
                metadata = match.metadata
                logger.info(
                    "Match found for key: %s, hash %s -> %s", key, hash_str, metadata
                )
                privacy_group_list = metadata.get("privacy_groups", [])
                metadata["privacy_groups"] = list(
                    filter(
                        lambda x: get_privacy_group_matcher_active(
                            str(x),
                            time.time() // CACHED_TIME,
                            # CACHED_TIME defaults to 300 seconds; flooring
                            # time.time() by it yields a value that changes only
                            # every 300 seconds, so lookups can be cached per window.
                        ),
                        privacy_group_list,
                    )
                )
                if metadata["privacy_groups"]:
                    signal_id = str(metadata["id"])

                    with metrics.timer(
                        metrics.names.pdq_matcher_lambda.write_match_record
                    ):
                        # TODO: Add source (threatexchange) tags to match record
                        MatchRecord(
                            key,
                            PdqSignal,
                            hash_str,
                            current_datetime,
                            signal_id,
                            metadata["source"],
                            metadata["hash"],
                        ).write_to_table(records_table)

                    for pg in metadata.get("privacy_groups", []):
                        # Only write the metadata if it is not already in the
                        # table; once initially created, it is the fetcher's job
                        # to keep the item up to date.
                        PDQSignalMetadata(
                            signal_id,
                            pg,
                            current_datetime,
                            metadata["source"],
                            metadata["hash"],
                            metadata["tags"].get(pg, []),
                        ).write_to_table_if_not_found(records_table)

                    match_ids.append(signal_id)

                    # TODO: change naming upstream and here from privacy_group[s]
                    # to dataset[s]
                    for privacy_group in metadata.get("privacy_groups", []):
                        banked_signal = BankedSignal(
                            str(signal_id), str(privacy_group), str(metadata["source"])
                        )
                        for tag in metadata["tags"].get(privacy_group, []):
                            banked_signal.add_classification(tag)
                        matching_banked_signals.append(banked_signal)

            # TODO: Add source (threatexchange) tags to match message
            if matching_banked_signals:
                match_message = MatchMessage(
                    content_key=key,
                    content_hash=hash_str,
                    matching_banked_signals=matching_banked_signals,
                )

                logger.info(f"Publishing match_message: {match_message}")

                # Publish one message for the set of matches.
                sns_client.publish(
                    TopicArn=OUTPUT_TOPIC_ARN, Message=match_message.to_aws_json()
                )

        else:
            logger.info(f"No matches found for key: {key} hash: {hash_str}")

    metrics.flush()
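
On the docstring's fan-out point: any number of SQS queues can subscribe to the output topic. A minimal sketch with boto3; the queue ARN is a placeholder, and the queue's access policy must separately allow the topic to send to it:

    import boto3

    sns = boto3.client("sns")

    # Each subscription adds an independent consumer of match messages.
    sns.subscribe(
        TopicArn=OUTPUT_TOPIC_ARN,
        Protocol="sqs",
        Endpoint="arn:aws:sqs:us-east-1:123456789012:example-actions-queue",
    )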