def lambda_handler(event, context):
    """
    Listens to SQS events fired when new data files are added to the data
    bucket's data directory. If the updated key matches a set of criteria,
    converts the raw data file into an index and writes it to an output S3
    bucket.

    With the default configuration:
    - the bucket must be the hashing data bucket,
      e.g. dipanjanm-hashing-data20210224213427723700000003
    - the key name must be in the ThreatExchange folder (e.g. threat_exchange_data/)
    - the key name must be a PDQ file ending in ".pdq.te"

    This means adding new versions of the datasets has no effect; you must
    add the exact .pdq.te file.
    """
    if not was_pdq_data_updated(event):
        logger.info("PDQ Data Not Updated, skipping")
        return

    logger.info("PDQ Data Updated, updating pdq hash index")

    metrics_logger = metrics.names.pdq_indexer_lambda

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )

    pdq_data_files = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics_logger
    ).load_data()

    with metrics.timer(metrics_logger.merge_datafiles):
        logger.info("Merging PDQ Hash files")
        flat_pdq_data = [
            hash_row
            for pdq_file in pdq_data_files.values()
            for hash_row in pdq_file
        ]

        merged_pdq_data = reduce(merge_pdq_files, flat_pdq_data, {}).values()

    with metrics.timer(metrics_logger.build_index):
        logger.info("Creating PDQ Hash Index")
        index = PDQIndex.build(merged_pdq_data)

        logger.info("Putting index in S3")
        index_bytes = pickle.dumps(index)

    with metrics.timer(metrics_logger.upload_index):
        s3_client.put_object(
            Bucket=INDEXES_BUCKET_NAME, Key=PDQ_INDEX_KEY, Body=index_bytes
        )

    logger.info("Index update complete")
    metrics.flush()
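# Illustrative sketch only (not the repo's implementation): one way the
# was_pdq_data_updated check used above could filter events, assuming each
# SQS record's body wraps an S3 notification with "Records[].s3.bucket.name"
# and "Records[].s3.object.key". The helper name, the event shape, and the
# json handling here are assumptions for illustration.
def _example_was_pdq_data_updated(event) -> bool:
    import json  # would normally live at the top of the module

    for sqs_record in event.get("Records", []):
        body = json.loads(sqs_record.get("body", "{}"))
        for s3_record in body.get("Records", []):
            bucket = s3_record["s3"]["bucket"]["name"]
            key = s3_record["s3"]["object"]["key"]
            # Apply the criteria described in the lambda_handler docstring.
            if (
                bucket == THREAT_EXCHANGE_DATA_BUCKET_NAME
                and key.startswith(THREAT_EXCHANGE_DATA_FOLDER)
                and key.endswith(THREAT_EXCHANGE_PDQ_FILE_EXTENSION)
            ):
                return True
    return False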
def get_signal_hash_count() -> t.Dict[str, int]:
    """Return a mapping from ThreatExchange data-file name to the number of PDQ hash rows it contains."""
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count()
    )
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}
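# Usage sketch (illustrative, not part of the existing codebase): the per-file
# counts returned by get_signal_hash_count can be aggregated into a single
# total, e.g. for a summary endpoint or a log line.
def _example_total_hash_count() -> int:
    return sum(get_signal_hash_count().values())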
def _get_signal_hash_count_and_last_modified(
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> t.Dict[str, t.Tuple[int, str]]:
    """Return a mapping from data-file name to (hash row count, last-modified timestamp)."""
    # TODO: this method is expensive; some caching or memoization might be a good idea.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=threat_exchange_data_bucket_name,
        threat_exchange_data_folder=threat_exchange_data_folder,
        threat_exchange_pdq_file_extension=threat_exchange_pdq_file_extension,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count()
    )
    pdq_data_files = pdq_storage.load_data()

    return {
        file_name: (len(rows), pdq_storage.last_modified[file_name])
        for file_name, rows in pdq_data_files.items()
    }
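# Hedged sketch addressing the TODO above: a minimal time-based cache that
# could wrap _get_signal_hash_count_and_last_modified so repeated API calls
# within a short window skip re-reading every data file from S3. The
# decorator name and the 300-second TTL are illustrative assumptions, not
# part of the existing codebase; it only handles hashable positional args.
import functools  # would normally live at the top of the module
import time


def ttl_cache(ttl_seconds: int = 300):
    def decorator(fn):
        cached: t.Dict[tuple, t.Tuple[float, t.Any]] = {}

        @functools.wraps(fn)
        def wrapper(*args):
            now = time.monotonic()
            hit = cached.get(args)
            if hit is not None and now - hit[0] < ttl_seconds:
                # Serve the cached result while it is still fresh.
                return hit[1]
            result = fn(*args)
            cached[args] = (now, result)
            return result

        return wrapper

    return decorator


# Usage would be: decorate _get_signal_hash_count_and_last_modified with
# @ttl_cache() so calls with the same bucket/folder/extension arguments are
# served from memory for the TTL window.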
def get_hash_count() -> t.Dict[str, int]:
    """Return a mapping from ThreatExchange data-file name to the number of PDQ hash rows it contains."""
    # ThreatExchangeS3PDQAdapter is constructed with an S3 config everywhere
    # else in this module; build one here from the same module-level constants.
    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        threat_exchange_pdq_file_extension=THREAT_EXCHANGE_PDQ_FILE_EXTENSION,
    )
    pdq_storage = ThreatExchangeS3PDQAdapter(
        config=s3_config, metrics_logger=metrics.names.api_hash_count()
    )
    pdq_data_files = pdq_storage.load_data()

    return {file_name: len(rows) for file_name, rows in pdq_data_files.items()}