示例#1
0
def lambda_handler(event, context):
    """
    Entry point for writing back to ThreatExchange.

    The action evaluator sends writeback messages by way of the writebacks
    queue; this handler receives them as SQS records and, for each message,
    runs the writebacker registered for every bank source referenced by the
    message's banked signals.

    Args:
        event: SQS event. ``event["Records"]`` is iterated and each record's
            ``"body"`` is parsed with ``WritebackMessage.from_aws_json``.
        context: Lambda context object (unused).

    Returns:
        ``{"writebacks_performed": {source: result}}``. Results are keyed by
        ``writebacker.source``, so if several records target the same source
        only the most recent result is kept.
    """
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])

    writebacks_performed = {}
    for sqs_record in event["Records"]:
        # TODO research max # sqs records / lambda_handler invocation
        writeback_message = WritebackMessage.from_aws_json(sqs_record["body"])
        logger.info("Writing Back: %s", writeback_message)

        # Distinct sources that are related to this writeback.
        sources = {
            banked_signal.bank_source
            for banked_signal in writeback_message.banked_signals
        }

        # Resolve each source exactly once (the lookup used to run twice per
        # source). Sources whose lookup is falsy have no writebacker and are
        # skipped.
        source_writebackers = []
        for source in sources:
            writebacker = Writebacker.get_writebacker_for_source(source)
            if writebacker:
                source_writebackers.append(writebacker)

        for writebacker in source_writebackers:
            result = writebacker.perform_writeback(writeback_message)
            logger.info("Writeback result: %s", result)
            writebacks_performed[writebacker.source] = result

    return {"writebacks_performed": writebacks_performed}
def get_actions_api(hma_config_table: str) -> bottle.Bottle:
    """
    Build the bottle sub-app that serves the action-config endpoints.

    HMAConfig is initialized against `hma_config_table` before any route is
    registered. The route documentation below expects the sub-app to be
    mounted under the '/actions/' prefix.
    """
    actions_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @actions_api.get("/", apply=[jsoninator])
    def fetch_all_actions() -> FetchAllActionsResponse:
        """
        Return every stored action config as a plain attribute dict.
        """
        serialized_actions = []
        for action_config in ActionPerformer.get_all():
            serialized_actions.append(action_config.__dict__)
        return FetchAllActionsResponse(actions_response=serialized_actions)

    @actions_api.put(
        "/<old_name>/<old_config_sub_stype>",
        apply=[jsoninator(CreateUpdateActionRequest)],
    )
    def update_action(
        request: CreateUpdateActionRequest,
        old_name: str,
        old_config_sub_stype: str,
    ) -> UpdateActionResponse:
        """
        Update an action's fields (e.g. url and headers).

        When both the name and the config subtype are unchanged the existing
        config is modified in place. Otherwise the old config is deleted and
        a new one is created, because the name is the primary key and the
        subtype is a config-class-level parameter; neither can be edited.
        """
        same_identity = (old_name == request.name
                         and old_config_sub_stype == request.config_subtype)
        if same_identity:
            subtype_cls = ActionPerformer._get_subtypes_by_name()[
                request.config_subtype]
            existing = subtype_cls.getx(request.name)
            for field_name, field_value in request.fields.items():
                setattr(existing, field_name, field_value)
            hmaconfig.update_config(existing)
        else:
            delete_action(old_name)
            create_action(request)
        return UpdateActionResponse(response="The action config is updated.")

    @actions_api.post("/", apply=[jsoninator(CreateUpdateActionRequest)])
    def create_action(
            request: CreateUpdateActionRequest) -> CreateActionResponse:
        """
        Create an action config of the requested subtype.
        """
        subtype_cls = ActionPerformer._get_subtypes_by_name()[
            request.config_subtype]
        # Merge through a dict so that a "name" key inside request.fields
        # overrides request.name instead of raising a duplicate-kwarg error.
        constructor_kwargs = {"name": request.name, **request.fields}
        hmaconfig.create_config(subtype_cls(**constructor_kwargs))
        return CreateActionResponse(response="The action config is created.")

    @actions_api.delete("/<name>", apply=[jsoninator])
    def delete_action(name: str) -> DeleteActionResponse:
        """
        Delete the action config called `name`.
        """
        hmaconfig.delete_config_by_type_and_name("ActionPerformer", name)
        return DeleteActionResponse(response="The action config is deleted.")

    return actions_api
 def get(cls):
     """
     Build an instance of `cls` from the process environment.

     Reads CONFIG_TABLE_NAME to initialize HMAConfig and ACTIONS_QUEUE_URL
     for the actions queue, and creates a boto3 SQS client.
     """
     config_table_name = os.environ["CONFIG_TABLE_NAME"]
     logger.info("Initializing configs using table name %s",
                 config_table_name)
     HMAConfig.initialize(config_table_name)

     queue_url = os.environ["ACTIONS_QUEUE_URL"]
     client = boto3.client("sqs")
     return cls(actions_queue_url=queue_url, sqs_client=client)
示例#4
0
def get_matches_api(
    dynamodb_table: Table,
    hma_config_table: str,
    indexes_bucket_name: str,
    writeback_queue_url: str,
) -> bottle.Bottle:
    """
    Closure over every dependency the matches API needs.

    Dependencies MUST be supplied by the root API this sub-app plugs into:
    declare them here, initialize them in the root API only.

    NOTE(review): no `return matches_api` is visible in this excerpt; confirm
    against the complete file.
    """

    # The api_root app must mount this sub-app under a prefix; the route
    # documentation below expects that prefix to be '/matches/'.
    matches_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @matches_api.get("/", apply=[jsoninator])
    def matches() -> MatchSummariesResponse:
        """
        List matches, optionally filtered by the `content_q`, `signal_q` and
        `signal_source` query parameters. `content_q` takes precedence over
        `signal_q`; with neither, a page of recent matches is returned.
        """
        query = bottle.request.query
        signal_q = query.signal_q or None
        signal_source = query.signal_source or None
        content_q = query.content_q or None

        if content_q:
            records = MatchRecord.get_from_content_id(dynamodb_table,
                                                      content_q)
        elif signal_q:
            records = MatchRecord.get_from_signal(dynamodb_table, signal_q,
                                                  signal_source or "")
        else:
            # TODO: Support pagination after implementing in UI.
            records = MatchRecord.get_recent_items_page(dynamodb_table).items

        summaries = []
        for record in records:
            summaries.append(
                MatchSummary(
                    content_id=record.content_id,
                    signal_id=record.signal_id,
                    signal_source=record.signal_source,
                    updated_at=record.updated_at.isoformat(),
                ))
        return MatchSummariesResponse(match_summaries=summaries)

    @matches_api.get("/match/", apply=[jsoninator])
    def match_details() -> MatchDetailsResponse:
        """
        Return the match details for the `content_id` query parameter, or an
        empty list when the parameter is missing or empty.
        """
        content_id = bottle.request.query.content_id or None
        if content_id:
            details = get_match_details(dynamodb_table, content_id)
        else:
            details = []
        return MatchDetailsResponse(match_details=details)
示例#5
0
def get_matches_api(dynamodb_table: Table,
                    hma_config_table: str) -> bottle.Bottle:
    """
    Closure over every dependency the matches API needs.

    Dependencies MUST be supplied by the root API this sub-app plugs into:
    declare them here, initialize them in the root API only.

    NOTE(review): no `return matches_api` is visible in this excerpt; confirm
    against the complete file.
    """

    # The api_root app must mount this sub-app under a prefix; the route
    # documentation below expects that prefix to be '/matches/'.
    matches_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @matches_api.get("/", apply=[jsoninator])
    def matches() -> MatchSummariesResponse:
        """
        List matches, optionally filtered by the `content_q`, `signal_q` and
        `signal_source` query parameters. `content_q` takes precedence over
        `signal_q`; with neither, the time-range lookup is used.
        """
        query = bottle.request.query
        signal_q = query.signal_q or None
        signal_source = query.signal_source or None
        content_q = query.content_q or None

        if content_q:
            records = PDQMatchRecord.get_from_content_id(
                dynamodb_table, content_q)
        elif signal_q:
            records = PDQMatchRecord.get_from_signal(dynamodb_table, signal_q,
                                                     signal_source or "")
        else:
            records = PDQMatchRecord.get_from_time_range(dynamodb_table)

        summaries = []
        for record in records:
            summaries.append(
                MatchSummary(
                    content_id=record.content_id,
                    signal_id=record.signal_id,
                    signal_source=record.signal_source,
                    updated_at=record.updated_at.isoformat(),
                ))
        return MatchSummariesResponse(match_summaries=summaries)

    @matches_api.get("/match/", apply=[jsoninator])
    def match_details() -> MatchDetailsResponse:
        """
        Match details endpoint.

        Response format: match_details : [MatchDetailsResult]. The list is
        empty when the `content_id` query parameter is missing or empty.
        """
        content_id = bottle.request.query.content_id or None
        if content_id:
            details = get_match_details(dynamodb_table, content_id)
        else:
            details = []
        return MatchDetailsResponse(match_details=details)
示例#6
0
def lambda_init_once():
    """
    Perform the late initialization the lambda components require.

    Loads the fetcher configuration and initializes HMAConfig with the
    config table name it carries.

    Lambda initialization is awkward: even though constructions such as
    __name__ == __main__ exist, there seems to be no easy way to keep
    lambda-specific logic apart from module logic other than splitting the
    files and keeping the lambda entry point as small as possible.

    TODO: Just refactor this file to separate the lambda and functional
          components
    """
    HMAConfig.initialize(FetcherConfig.get().config_table_name)
    def get(cls):
        """
        Build an instance of `cls` from the process environment.

        Reads CONFIG_TABLE_NAME, DYNAMODB_TABLE, WRITEBACKS_QUEUE_URL and
        ACTIONS_QUEUE_URL, initializes HMAConfig, and creates the boto3
        DynamoDB resource and SQS client handed to the constructor.
        """
        config_table_name = os.environ["CONFIG_TABLE_NAME"]
        logger.info("Initializing configs using table name %s",
                    config_table_name)

        table_name = os.environ["DYNAMODB_TABLE"]
        logger.info(
            "Initializing dynamo table using table name %s",
            table_name,
        )

        HMAConfig.initialize(config_table_name)

        dynamodb_resource = boto3.resource("dynamodb")
        writebacks_url = os.environ["WRITEBACKS_QUEUE_URL"]
        actions_url = os.environ["ACTIONS_QUEUE_URL"]
        sqs = boto3.client("sqs")

        return cls(
            actions_queue_url=actions_url,
            sqs_client=sqs,
            dynamo_db_table=dynamodb_resource.Table(table_name),
            writeback_queue_url=writebacks_url,
        )
    def _create_privacy_groups(self):
        """
        Test fixture: create one "active" and one "inactive" privacy group
        config in a mocked config table.

        Stores the two ThreatExchangeConfig objects on `self.active_pg` and
        `self.inactive_pg`, and additionally creates an
        AdditionalMatchSettingsConfig for the active group.
        """
        # Since we already have a mock_dynamodb2 courtesy BanksTableTestBase,
        # re-use it for initing configs. Requires some clever hot-wiring.
        config_test_mock = config_test.ConfigTest()
        config_test_mock.mock_dynamodb2 = self.__class__.mock_dynamodb2
        config_test_mock.create_mocked_table()
        HMAConfig.initialize(config_test_mock.TABLE_NAME)
        # Hot wiring ends...

        # NOTE(review): arguments are positional. The two configs differ only
        # in the name arguments and the sixth argument (True here, False
        # below) -- presumably the flag that makes the group active; confirm
        # against ThreatExchangeConfig's field order.
        self.active_pg = ThreatExchangeConfig(
            "ACTIVE_PG", True, "", True, True, True, "ACTIVE_PG"
        )
        create_config(self.active_pg)

        # Active PG has a distance threshold of 31.
        create_config(AdditionalMatchSettingsConfig("ACTIVE_PG", 31))

        self.inactive_pg = ThreatExchangeConfig(
            "INACTIVE_PG", True, "", True, True, False, "INACTIVE_PG"
        )
        create_config(self.inactive_pg)
示例#9
0
def get_datasets_api(hma_config_table: str) -> bottle.Bottle:
    """
    Build the bottle sub-app that serves the dataset endpoints.

    HMAConfig is initialized against `hma_config_table`. The route
    documentation below expects the sub-app to be mounted under '/datasets/'.
    """
    datasets_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def datasets() -> DatasetsResponse:
        """
        Return all datasets, one per stored ThreatExchangeConfig.
        """
        all_datasets = []
        for collab in ThreatExchangeConfig.get_all():
            all_datasets.append(Dataset.from_collab(collab))
        return DatasetsResponse(datasets_response=all_datasets)

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Update a dataset's fetcher_active and write_back flags.
        """
        privacy_group_config = ThreatExchangeConfig.getx(
            str(request.privacy_group_id))
        privacy_group_config.fetcher_active = request.fetcher_active
        privacy_group_config.write_back = request.write_back

        saved = hmaconfig.update_config(privacy_group_config)
        # This is the saved object's own attribute dict, not a copy: the
        # privacy_group_id alias is written onto it directly.
        saved_fields = saved.__dict__
        saved_fields["privacy_group_id"] = saved_fields["name"]
        return Dataset.from_dict(saved_fields)

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Fetch new collaborations from ThreatExchange and potentially update
        the configs stored in AWS.
        """
        sync_privacy_groups()
        return SyncDatasetResponse(response="Dataset is update-to-date")

    return datasets_api
示例#10
0
def get_action_rules_api(hma_config_table: str) -> bottle.Bottle:
    """
    Build the bottle sub-app that serves CRUD endpoints for action rules.

    HMAConfig is initialized against `hma_config_table` before the routes are
    registered. Every endpoint returns an ActionRulesResponse whose error
    message is the empty string on success.
    """
    # The endpoints below imply a prefix of '/action-rules'
    action_rules_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @action_rules_api.get("/", apply=[jsoninator])
    def get_action_rules() -> ActionRulesResponse:
        """
        Return all action rules.

        On any exception the rules list stays empty, the error message is set
        to a generic text and the exception is passed to
        handle_unexpected_error.
        """
        error_message = ""
        action_rules = []

        try:
            action_rules = ActionRule.get_all()
            logger.info("action_rules: %s", action_rules)
        except Exception as e:
            error_message = "Unexpected error."
            handle_unexpected_error(e)

        return ActionRulesResponse(error_message, action_rules)

    @action_rules_api.post("/", apply=[jsoninator(ActionRulesRequest)])
    def create_action_rule(
        request: ActionRulesRequest,
    ) -> ActionRulesResponse:
        """
        Create an action rule.

        A ClientError with code ConditionalCheckFailedException is reported as
        "already exists"; any other ClientError as an unexpected error. Both
        set the response status to 500. Other exceptions are passed to
        handle_unexpected_error.
        """
        logger.info("request: %s", request)
        error_message = ""

        try:
            hmaconfig.create_config(request.action_rule)
        except ClientError as e:
            # TODO this test for "already exists" should be moved to a common place
            if e.response["Error"]["Code"] == "ConditionalCheckFailedException":
                error_message = f"An action rule with the name '{request.action_rule.name}' already exists."
                logger.warning(
                    "Duplicate action rule creation attempted: %s",
                    e.response["Error"]["Message"],
                )
            else:
                error_message = "Unexpected error."
                logger.error(
                    "Unexpected client error: %s", e.response["Error"]["Message"]
                )
                logger.exception(e)
            # Applies to both ClientError branches above.
            response.status = 500
        except Exception as e:
            error_message = "Unexpected error."
            handle_unexpected_error(e)

        return ActionRulesResponse(error_message)

    @action_rules_api.put("/<old_name>", apply=[jsoninator(ActionRulesRequest)])
    def update_action_rule(
        request: ActionRulesRequest,
        old_name: str,
    ) -> ActionRulesResponse:
        """
        Update the action rule with name=<oldname>.

        If a rule with the requested (new) name exists it is updated in place.
        Otherwise, if a rule named `old_name` exists, this is treated as a
        rename: the new rule is created first and the old one deleted after.
        If neither exists the response status is set to 500.
        """
        logger.info("old_name: %s", old_name)
        logger.info("request: %s", request)
        error_message = ""

        if ActionRule.exists(request.action_rule.name):
            try:
                hmaconfig.update_config(request.action_rule)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        elif ActionRule.exists(old_name):
            try:
                # Create before delete: if creation raises, the old rule is
                # left untouched.
                hmaconfig.create_config(request.action_rule)
                hmaconfig.delete_config_by_type_and_name("ActionRule", old_name)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        else:
            error_message = f"An action rule named '{request.action_rule.name}' or '{old_name}' does not exist."
            logger.warning(
                "An attempt was made to update an action rule named either '%s' or '%s' but neither exist.",
                request.action_rule.name,
                old_name,
            )
            response.status = 500

        return ActionRulesResponse(error_message)

    @action_rules_api.delete("/<name>", apply=[jsoninator])
    def delete_action_rule(name: str) -> ActionRulesResponse:
        """
        Delete the action rule with name=<name>.

        If no such rule exists the response status is set to 500 and the
        error message says so.
        """
        logger.info("name: %s", name)
        error_message = ""

        if ActionRule.exists(name):
            try:
                hmaconfig.delete_config_by_type_and_name("ActionRule", name)
            except Exception as e:
                error_message = "Unexpected error."
                handle_unexpected_error(e)
        else:
            error_message = f"An action rule named '{name}' does not exist."
            logger.warning(
                "An attempt was made to delete an action rule named '%s' that does not exist.",
                name,
            )
            response.status = 500

        return ActionRulesResponse(error_message)

    return action_rules_api
示例#11
0
def get_datasets_api(
    hma_config_table: str,
    datastore_table: Table,
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
    threat_exchange_pdq_file_extension: str,
) -> bottle.Bottle:
    """
    Build the bottle sub-app that serves the dataset endpoints.

    HMAConfig is initialized against `hma_config_table`; the remaining
    arguments are forwarded to the dataset summary lookup. The route
    documentation below expects the sub-app to be mounted under '/datasets/'.
    """
    datasets_api = bottle.Bottle()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def get_all_dataset_summaries() -> DatasetSummariesResponse:
        """
        Return summaries for all datasets. A summary holds the facts that are
        not configurable, e.g. the dataset's name, the number of hashes it
        has, the number of matches it has caused, etc.
        """
        threat_exchange_datasets = _get_threat_exchange_datasets(
            datastore_table,
            threat_exchange_data_bucket_name,
            threat_exchange_data_folder,
            threat_exchange_pdq_file_extension,
        )
        return DatasetSummariesResponse(
            threat_exchange_datasets=threat_exchange_datasets,
            test_datasets=[],
        )

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Update a dataset's fetcher_active, write_back and matcher_active
        flags.
        """
        privacy_group_config = ThreatExchangeConfig.getx(
            str(request.privacy_group_id))
        privacy_group_config.fetcher_active = request.fetcher_active
        privacy_group_config.write_back = request.write_back
        privacy_group_config.matcher_active = request.matcher_active

        saved = hmaconfig.update_config(privacy_group_config)
        # This is the saved object's own attribute dict, not a copy: the
        # privacy_group_id alias is written onto it directly.
        saved_fields = saved.__dict__
        saved_fields["privacy_group_id"] = saved_fields["name"]
        return Dataset.from_dict(saved_fields)

    @datasets_api.post("/create", apply=[jsoninator(CreateDatasetRequest)])
    def create_dataset(request: CreateDatasetRequest) -> CreateDatasetResponse:
        """
        Create a local dataset (defaults are defined in CreateDatasetRequest).
        """
        assert isinstance(request, CreateDatasetRequest)

        new_group = dict(
            privacy_group_id=str(request.privacy_group_id),
            privacy_group_name=request.privacy_group_name,
            description=request.description,
            in_use=True,
            fetcher_active=request.fetcher_active,
            matcher_active=request.matcher_active,
            write_back=request.write_back,
        )
        create_privacy_group_if_not_exists(**new_group)

        message = f"Created dataset {request.privacy_group_id}"
        return CreateDatasetResponse(response=message)

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Fetch new collaborations from ThreatExchange and potentially update
        the configs stored in AWS.
        """
        sync_privacy_groups()
        return SyncDatasetResponse(response="Privacy groups are up to date")

    @datasets_api.post("/delete/<key>", apply=[jsoninator])
    def delete_dataset(key=None) -> DeleteDatasetResponse:
        """
        Delete the dataset (privacy group config) identified by `key`.
        """
        hmaconfig.delete_config(ThreatExchangeConfig.getx(str(key)))
        return DeleteDatasetResponse(response="The privacy group is deleted")

    return datasets_api
示例#12
0
def remove_superseded_actions(
    action_label_to_action_rules: t.Dict[ActionLabel, t.List[ActionRule]],
) -> t.Dict[ActionLabel, t.List[ActionRule]]:
    """
    TODO implement

    Intended contract: given the action labels (and the action rules that
    produced them) generated for a match message, drop every action label
    that is superseded by another one.

    Current behavior: placeholder that hands back the very same mapping
    object it received, unmodified.
    """
    return action_label_to_action_rules


if __name__ == "__main__":
    # For basic debugging
    HMAConfig.initialize(os.environ["CONFIG_TABLE_NAME"])
    action_rules = get_action_rules()
    match_message = MatchMessage(
        content_key="m2",
        content_hash=
        "361da9e6cf1b72f5cea0344e5bb6e70939f4c70328ace762529cac704297354a",
        matching_banked_signals=[
            BankedSignal(
                banked_content_id="3070359009741438",
                bank_id="258601789084078",
                bank_source="te",
                classifications={
                    BankedContentIDClassificationLabel(
                        value="258601789084078"),
                    ClassificationLabel(value="true_positive"),
                    BankSourceClassificationLabel(value="te"),
示例#13
0
# Module-level logger and AWS handles, created once at import time.
logger = get_logger(__name__)
s3_client = boto3.client("s3")
sns_client: SNSClient = boto3.client("sns")
dynamodb = boto3.resource("dynamodb")

# NOTE(review): units/meaning not visible here -- presumably a cache lifetime
# in seconds; confirm against its users.
CACHED_TIME = 300
# NOTE(review): presumably the match distance threshold; confirm against its
# users.
THRESHOLD = 31
# Local path the index is downloaded to before it is deserialized.
LOCAL_INDEX_FILENAME = "/tmp/hashes.index"

# Required environment configuration: a missing variable raises KeyError at
# import time.
INDEXES_BUCKET_NAME = os.environ["INDEXES_BUCKET_NAME"]
PDQ_INDEX_KEY = os.environ["PDQ_INDEX_KEY"]
OUTPUT_TOPIC_ARN = os.environ["PDQ_MATCHES_TOPIC_ARN"]

DYNAMODB_TABLE = os.environ["DYNAMODB_TABLE"]
HMA_CONFIG_TABLE = os.environ["HMA_CONFIG_TABLE"]
# Point HMAConfig at the config table as soon as the module is imported.
HMAConfig.initialize(HMA_CONFIG_TABLE)


@lru_cache(maxsize=None)
def get_index(bucket_name, key):
    """
    Load the given index from the s3 bucket and deserialize it.

    The object is first downloaded to LOCAL_INDEX_FILENAME and then
    unpickled from that file. Results are memoized per (bucket_name, key)
    for the lifetime of the process by ``lru_cache``.

    Args:
        bucket_name: Name of the S3 bucket holding the index.
        key: Object key of the pickled index inside that bucket.

    Returns:
        The deserialized index object.
    """
    # TODO Cache this index for a period of time to reduce S3 calls and bandwidth.
    with metrics.timer(metrics.names.pdq_matcher_lambda.download_index):
        with open(LOCAL_INDEX_FILENAME, "wb") as index_file:
            s3_client.download_fileobj(bucket_name, key, index_file)

    with metrics.timer(metrics.names.pdq_matcher_lambda.parse_index):
        # SECURITY: unpickling executes arbitrary code embedded in the data;
        # the bucket/key must only ever contain indexes written by trusted
        # code.
        # Use a context manager so the file handle is closed deterministically
        # instead of being leaked until garbage collection.
        with open(LOCAL_INDEX_FILENAME, "rb") as index_file:
            result = pickle.load(index_file)

    # Hand the deserialized index back to the caller (and to the cache);
    # without this the function would cache and return None.
    return result
示例#14
0
 def __init__(self, migration: str, config_table: str):
     """
     Remember the migration identifier and initialize HMAConfig.

     Args:
         migration: Stored unchanged on ``self.migration``.
         config_table: Passed to ``HMAConfig.initialize``.
     """
     self.migration = migration
     HMAConfig.initialize(config_table)
示例#15
0
def bottle_init_once() -> t.Tuple[bottle.AppStack, t.Callable[
    [t.Dict[str, t.Any], t.Any], t.Dict[str, t.Any]]]:
    """
    Meant to be called once per lambda instance. Returns a bottle app and an
    api_wsgi_handler that can be plugged into a lambda handler.

    The method also serves as a closure for all dependencies that need to be
    resolved at startup.

    The returned app has the '/root/' health endpoint plus one mounted
    sub-app per API area: action-rules, matches, content, submit, datasets,
    stats, actions, banks, indexes and lcc.

    Returns:
        A ``(app, apig_wsgi_handler)`` tuple, where the handler is produced
        by ``make_lambda_handler(app)``.
    """
    app = bottle.default_app()

    # Initialize hmaconfig once here, before any sub-app is mounted. Mounted
    # SubApps need not initialize their own HMAConfigs.
    HMAConfig.initialize(HMA_CONFIG_TABLE)

    # Shared by every sub-app that needs the signal/content type mapping.
    functionality_mapping = get_pytx_functionality_mapping()

    @app.get("/root/")
    def root():
        """
        root endpoint to make sure the API is live and check when it was last updated
        """
        # NOTE(review): .get() yields None when the key is absent, in which
        # case the attribute access below raises -- presumably the key is
        # always set when running behind the lambda WSGI adapter; confirm.
        context = bottle.request.environ.get("apig_wsgi.context")
        invoked_function_arn = context.invoked_function_arn
        client = boto3.client("lambda")
        last_modified = client.get_function_configuration(
            FunctionName=invoked_function_arn)["LastModified"]

        return {
            "message": "Welcome to the HMA API!",
            "last_modified": last_modified,
        }

    app.mount(
        "/action-rules/",
        get_action_rules_api(hma_config_table=HMA_CONFIG_TABLE),
    )

    app.mount(
        "/matches/",
        get_matches_api(
            datastore_table=dynamodb.Table(DYNAMODB_TABLE),
            hma_config_table=HMA_CONFIG_TABLE,
            indexes_bucket_name=INDEXES_BUCKET_NAME,
            writeback_queue_url=WRITEBACK_QUEUE_URL,
            bank_table=dynamodb.Table(BANKS_TABLE),
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    app.mount(
        "/content/",
        get_content_api(
            dynamodb_table=dynamodb.Table(DYNAMODB_TABLE),
            image_bucket=IMAGE_BUCKET_NAME,
            image_prefix=IMAGE_PREFIX,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    app.mount(
        "/submit/",
        get_submit_api(
            dynamodb_table=dynamodb.Table(DYNAMODB_TABLE),
            image_bucket=IMAGE_BUCKET_NAME,
            image_prefix=IMAGE_PREFIX,
            submissions_queue_url=SUBMISSIONS_QUEUE_URL,
            hash_queue_url=HASHES_QUEUE_URL,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    app.mount(
        "/datasets/",
        get_datasets_api(
            hma_config_table=HMA_CONFIG_TABLE,
            datastore_table=dynamodb.Table(DYNAMODB_TABLE),
            threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
            threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
        ),
    )

    app.mount("/stats/",
              get_stats_api(counts_table=dynamodb.Table(COUNTS_TABLE_NAME)))

    app.mount(
        "/actions/",
        get_actions_api(hma_config_table=HMA_CONFIG_TABLE),
    )

    app.mount(
        "/banks/",
        get_bank_api(
            bank_table=dynamodb.Table(BANKS_TABLE),
            bank_user_media_bucket=BANKS_MEDIA_BUCKET_NAME,
            submissions_queue_url=SUBMISSIONS_QUEUE_URL,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    app.mount(
        "/indexes/",
        get_indexes_api(
            indexes_bucket_name=INDEXES_BUCKET_NAME,
            indexer_function_name=INDEXER_FUNCTION_NAME,
        ),
    )

    app.mount(
        "/lcc/",
        get_lcc_api(
            storage_path=LCC_DURABLE_FS_PATH,
            signal_type_mapping=functionality_mapping.signal_and_content,
        ),
    )

    # Wrap the fully assembled app so a lambda handler can invoke it.
    apig_wsgi_handler = make_lambda_handler(app)
    return (app, apig_wsgi_handler)
def lambda_handler(event, context):
    """
    Scheduled index rebuild.

    On every run, loads all data files for ALL_INDEXABLE_SIGNAL_TYPES from
    s3, merges them with the bank hash rows, converts the merged data into
    indexes and writes those to an output S3 bucket.

    As per the default configuration, the bucket must be
    - the hashing data bucket eg. dipanjanm-hashing-<...>
    - the key name must be in the ThreatExchange folder (eg.
      threat_exchange_data/)
    - the key name must return a signal_type in
      ThreatUpdateS3Store.get_signal_type_from_object_key
    """
    # Note: threatexchange indexes cannot yet take new entries incrementally,
    # so even though we know which files were updated we must do a full
    # rebuild. Only the signal types that were updated end up being used, not
    # the actual files that changed.

    s3_config = S3ThreatDataConfig(
        threat_exchange_data_bucket_name=THREAT_EXCHANGE_DATA_BUCKET_NAME,
        threat_exchange_data_folder=THREAT_EXCHANGE_DATA_FOLDER,
    )

    banks_table = BanksTable(dynamodb.Table(BANKS_TABLE))

    HMAConfig.initialize(HMA_CONFIG_TABLE)
    signal_content_mapping = get_pytx_functionality_mapping()

    for signal_type in ALL_INDEXABLE_SIGNAL_TYPES:
        adapter = _ADAPTER_MAPPING[signal_type](
            config=s3_config,
            metrics_logger=metrics.names.indexer,
        )
        rows_by_file = adapter.load_data()

        with metrics.timer(metrics.names.indexer.get_bank_data):
            bank_data = get_all_bank_hash_rows(signal_type, banks_table)

        with metrics.timer(metrics.names.indexer.merge_datafiles):
            logger.info(f"Merging {signal_type} Hash files")

            # go from dict[filename, list<hash rows>] → list<hash rows>
            all_rows = []
            for file_rows in rows_by_file.values():
                all_rows.extend(file_rows)

            merged_by_hash = functools.reduce(
                merge_hash_rows_on_hash_value,
                all_rows + bank_data,
                {},
            )
            merged_data = merged_by_hash.values()

        with metrics.timer(metrics.names.indexer.build_index):
            logger.info(f"Rebuilding {signal_type} Index")

            # Resolve every index class up front, before any index is built.
            known_signal_types = (signal_content_mapping.signal_and_content.
                                  signal_type_by_name.values())
            index_classes = [
                known_type.get_index_cls() for known_type in known_signal_types
            ]

            for index_class in index_classes:
                index: S3BackedInstrumentedIndexMixin = index_class.build(
                    merged_data)

                logger.info(
                    f"Putting {signal_type} index in S3 for index {index.get_index_class_name()}"
                )
                index.save(bucket_name=INDEXES_BUCKET_NAME)
            metrics.flush()

    logger.info("Index updates complete")
示例#17
0
def _submission_log_id(media):
    """
    Return the identifier used for `media` in log messages: the bank_id for
    bank submissions, the content_id for every other submission type.
    """
    if isinstance(media, BankSubmissionMessage):
        return media.bank_id
    return media.content_id


def lambda_handler(event, context):
    """
    SQS Events generated by the submissions API or by files being added to S3.
    Downloads files to temp-storage, identifies content_type and generates
    allowed signal_types from it.

    Saves hash output to DynamoDB, sends a message on an output queue.
    Signals hashed from bank submissions are routed to the bank datastore
    only; no hash record is written or published for them.

    Note that this brings the contents of a file into memory. This is subject to
    the resource limitation on the lambda. Potentially extendable until 10GB, but
    that would be super-expensive. [1]

    [1]: https://docs.aws.amazon.com/lambda/latest/dg/configuration-console.html

    Args:
        event: SQS event; each entry of ``event["Records"]`` carries a JSON
            ``"body"``.
        context: Lambda context object (unused).
    """
    records_table = get_dynamodb().Table(DYNAMODB_TABLE)
    HMAConfig.initialize(HMA_CONFIG_TABLE)
    banks_table = BanksTable(
        get_dynamodb().Table(BANKS_TABLE),
        _get_signal_type_mapping(),
    )
    sqs_client = get_sqs_client()

    hasher = _get_hasher(_get_signal_type_mapping())

    for sqs_record in event["Records"]:
        message = json.loads(sqs_record["body"])

        # S3 sends a test event when notifications are configured; ignore it.
        if message.get("Event") == "s3:TestEvent":
            continue

        media_to_process: t.List[t.Union[S3ImageSubmission,
                                         URLSubmissionMessage,
                                         BankSubmissionMessage]] = []

        if URLSubmissionMessage.could_be(message):
            media_to_process.append(
                URLSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        elif S3ImageSubmissionBatchMessage.could_be(message):
            # S3 submissions can only be images for now.
            media_to_process.extend(
                S3ImageSubmissionBatchMessage.from_sqs_message(
                    message, image_prefix=IMAGE_PREFIX).image_submissions)
        elif BankSubmissionMessage.could_be(message):
            media_to_process.append(
                BankSubmissionMessage.from_sqs_message(
                    message, _get_signal_type_mapping()))
        else:
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning(f"Unprocessable Message: {message}")

        for media in media_to_process:
            if not hasher.supports(media.content_type):
                logger.warning(
                    f"Unprocessable content type: {media.content_type}, id: {_submission_log_id(media)}"
                )
                continue

            with metrics.timer(metrics.names.hasher.download_file):
                try:
                    if hasattr(media, "key") and hasattr(media, "bucket"):
                        # Classic duck-typing. If it has key and bucket, must be an
                        # S3 submission.
                        media = t.cast(S3ImageSubmission, media)
                        bytes_: bytes = S3BucketContentSource(
                            media.bucket,
                            IMAGE_PREFIX).get_bytes(media.content_id)
                    else:
                        media = t.cast(URLSubmissionMessage, media)
                        bytes_ = URLContentSource().get_bytes(media.url)
                except Exception:
                    # Best effort: log with traceback and move on to the next
                    # piece of media instead of failing the whole batch.
                    logger.exception(
                        f"Encountered exception while trying to get_bytes for id: {_submission_log_id(media)}. Unable to hash content."
                    )
                    continue

            for signal in hasher.get_hashes(media.content_type, bytes_):
                if isinstance(media, BankSubmissionMessage):
                    # route signals to bank datastore only.
                    bank_operations.add_bank_member_signal(
                        banks_table=banks_table,
                        bank_id=media.bank_id,
                        bank_member_id=media.bank_member_id,
                        signal_type=signal.signal_type,
                        signal_value=signal.signal_value,
                    )
                    # don't write hash records etc.
                    continue

                hash_record = PipelineHashRecord(
                    content_id=media.content_id,
                    signal_type=signal.signal_type,
                    content_hash=signal.signal_value,
                    updated_at=datetime.datetime.now(),
                )

                hasher.write_hash_record(records_table, hash_record)
                hasher.publish_hash_message(sqs_client, hash_record)

    metrics.flush()
示例#18
0
def get_datasets_api(
    hma_config_table: str,
    datastore_table: Table,
    threat_exchange_data_bucket_name: str,
    threat_exchange_data_folder: str,
) -> bottle.Bottle:
    """
    ToDo / FixMe: this file is probably more about privacy groups than datasets...
    """
    # The documentation below expects prefix to be '/datasets/'
    datasets_api = SubApp()
    HMAConfig.initialize(hma_config_table)

    @datasets_api.get("/", apply=[jsoninator])
    def get_all_dataset_summaries() -> DatasetSummariesResponse:
        """
        List a summary for every dataset.

        A summary carries the facts about a dataset that are not
        configurable, e.g. its name, how many hashes it holds and how many
        matches it has caused.
        """
        # The helper is handed the datastore table and the ThreatExchange
        # bucket/folder that this sub-app was constructed with.
        summaries = _get_threat_exchange_datasets(
            datastore_table,
            threat_exchange_data_bucket_name,
            threat_exchange_data_folder,
        )
        return DatasetSummariesResponse(threat_exchange_datasets=summaries)

    @datasets_api.post("/update", apply=[jsoninator(UpdateDatasetRequest)])
    def update_dataset(request: UpdateDatasetRequest) -> Dataset:
        """
        Apply new values to an existing dataset.

        Writes fetcher_active, write_back and matcher_active onto the privacy
        group's config, then creates, updates or deletes the group's
        additional match settings depending on whether pdq_match_threshold
        is supplied in the request.
        """
        pg_id = str(request.privacy_group_id)

        te_config = ThreatExchangeConfig.getx(pg_id)
        te_config.fetcher_active = request.fetcher_active
        te_config.write_back = request.write_back
        te_config.matcher_active = request.matcher_active

        # The response is built from the saved config's attribute dict, with
        # the config name echoed back under "privacy_group_id".
        updated_config = vars(hmaconfig.update_config(te_config))
        updated_config["privacy_group_id"] = updated_config["name"]

        match_settings = AdditionalMatchSettingsConfig.get(pg_id)
        if not request.pdq_match_threshold:
            if match_settings:
                # pdq_match_threshold was set and now should be removed
                hmaconfig.delete_config(match_settings)
        elif match_settings:
            match_settings.pdq_match_threshold = int(
                request.pdq_match_threshold)
            hmaconfig.update_config(match_settings)
        else:
            hmaconfig.create_config(
                AdditionalMatchSettingsConfig(
                    pg_id, int(request.pdq_match_threshold)))

        return Dataset.from_dict(updated_config)

    @datasets_api.post("/create", apply=[jsoninator(CreateDatasetRequest)])
    def create_dataset(request: CreateDatasetRequest) -> CreateDatasetResponse:
        """
        Create a local dataset; defaults are defined in CreateDatasetRequest.

        The privacy group is always created with in_use=True; every other
        field is taken from the request.
        """
        assert isinstance(request, CreateDatasetRequest)

        pg_id = request.privacy_group_id
        create_privacy_group_if_not_exists(
            privacy_group_id=str(pg_id),
            privacy_group_name=request.privacy_group_name,
            description=request.description,
            in_use=True,
            fetcher_active=request.fetcher_active,
            matcher_active=request.matcher_active,
            write_back=request.write_back,
        )

        message = f"Created dataset {pg_id}"
        return CreateDatasetResponse(response=message)

    @datasets_api.post("/sync", apply=[jsoninator])
    def sync_datasets() -> SyncDatasetResponse:
        """
        Pull new collaborations from ThreatExchange and sync them with the
        configs stored in DynamoDB.
        """
        sync_privacy_groups()
        status = "Privacy groups are up to date"
        return SyncDatasetResponse(response=status)

    @datasets_api.post("/delete/<key>", apply=[jsoninator])
    def delete_dataset(key=None) -> DeleteDatasetResponse:
        """
        Delete the dataset whose config is stored under key=<key>.
        """
        # getx() looks the config up by the stringified URL key; the result
        # is passed straight to delete_config().
        hmaconfig.delete_config(ThreatExchangeConfig.getx(str(key)))
        status = "The privacy group is deleted"
        return DeleteDatasetResponse(response=status)

    @datasets_api.get("/match-settings", apply=[jsoninator])
    def get_all_match_settings() -> MatchSettingsResponse:
        """
        Return every match settings config.
        """
        # Wrap each stored config in its response body, preserving the order
        # in which get_all() yields them.
        bodies = list(
            map(MatchSettingsResponseBody,
                AdditionalMatchSettingsConfig.get_all()))
        return MatchSettingsResponse(match_settings=bodies)

    @datasets_api.get("/match-settings/<key>", apply=[jsoninator])
    def get_match_settings(key=None) -> MatchSettingsResponseBody:
        """
        Return the match settings config for a given privacy_group_id.

        Aborts the request with status 400 when no config is found for the
        key.
        """
        found = AdditionalMatchSettingsConfig.get(str(key))
        if not found:
            return bottle.abort(400, f"No match_settings for pg_id {key} found")
        return MatchSettingsResponseBody(found)