Example #1
def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes to an output S3 bucket.

    As per the default configuration:
    - the bucket must be the hashing data bucket, e.g.
      dipanjanm-hashing-data20210224213427723700000003
    - the key must be in the ThreatExchange folder (e.g. threat_exchange_data/)
    - the key must be a PDQ file ending in ".pdq.te"

    This means that adding new versions of the datasets will have no effect;
    you must add the exact .pdq.te file.
    """

    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")
    metrics_logger = metrics.names.pdq_indexer_lambda

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )

    pdq_data_files = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics_logger).load_data()

    with metrics.timer(metrics_logger.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row for pdq_file in pdq_data_files.values()
            for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).values()

    with metrics.timer(metrics_logger.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics_logger.upload_index):
        s3_client.put_object(Bucket=INDEXES_BUCKET_NAME,
                             Key=PDQ_INDEX_KEY,
                             Body=index_bytes)

    logger.info("Index update complete")
    metrics.flush()
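
The handler above only proceeds when was_pdq_data_updated(event) returns True, applying the criteria listed in the docstring. Below is a minimal sketch of what such a check could look like, assuming the usual S3-notification-wrapped-in-SQS event shape and the module-level constants used by the handler; the real was_pdq_data_updated in the codebase may differ.

import json


def was_pdq_data_updated(event) -> bool:
    """Sketch: True if any record in the SQS event points at a PDQ data file
    in the configured bucket and folder. Assumes each SQS record body is an
    S3 event notification."""
    for sqs_record in event.get("Records", []):
        body = json.loads(sqs_record.get("body", "{}"))
        for s3_record in body.get("Records", []):
            bucket = s3_record["s3"]["bucket"]["name"]
            key = s3_record["s3"]["object"]["key"]
            if (
                bucket == THREAT_EXCHANGE_DATA_BUCKET_NAME
                and key.startswith(THREAT_EXCHANGE_DATA_FOLDER)
                and key.endswith(THREAT_EXCHANGE_PDQ_FILE_EXTENSION)
            ):
                return True
    return False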
Example #2
def get_signal_hash_count() -> t.Dict[str, int]:
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count())
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}
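
The return value maps each ThreatExchange data file to its row count. A hypothetical call site:

# Hypothetical usage; file names are the S3 keys of the .pdq.te files.
for file_name, count in get_signal_hash_count().items():
    print(f"{file_name}: {count} hashes")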
Example #3
def _get_signal_hash_count_and_last_modified(
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> t.Dict[str, t.Tuple[int, str]]:
    # TODO: this method is expensive; some caching or memoization might be a good idea.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=threat_exchange_data_bucket_name,
        threat_exchange_data_folder=threat_exchange_data_folder,
        threat_exchange_pdq_file_extension=threat_exchange_pdq_file_extension,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count())
    pdq_data_files = pdq_storage.load_data()
    return {
        file_name: (len(rows), pdq_storage.last_modified[file_name])
        for file_name, rows in pdq_data_files.items()
    }
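
The TODO above notes that this lookup is expensive. One possible shape for the suggested memoization is a small per-argument, time-based cache; this is a sketch under that assumption, not what the codebase actually uses.

import functools
import time


def ttl_cache(ttl_seconds: int = 300):
    """Sketch of a per-argument, time-based memoizer for expensive lookups."""

    def decorator(fn):
        cache = {}

        @functools.wraps(fn)
        def wrapper(*args):
            now = time.monotonic()
            hit = cache.get(args)
            if hit is not None and now - hit[0] < ttl_seconds:
                return hit[1]
            value = fn(*args)
            cache[args] = (now, value)
            return value

        return wrapper

    return decorator


# Hypothetical usage:
# @ttl_cache(ttl_seconds=300)
# def _get_signal_hash_count_and_last_modified(...): ...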
Example #4
def get_hash_count() -> t.Dict[str, int]:
    pdq_storage = ThreatExchangeS3PDQAdapter(
        metrics_logger=metrics.names.api_hash_count())
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}
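
Example #4 is the same per-file row count as Example #2, except that no explicit S3ThreatDataConfig is passed, so the adapter falls back to whatever default configuration it defines. A hypothetical caller that only needs the aggregate could sum the values:

# Hypothetical aggregation over the per-file counts.
total_hashes = sum(get_hash_count().values())
print(f"{total_hashes} PDQ hashes across all ThreatExchange data files")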