def lambda_handler(event, context):
    """
    Combine the main dataset with the three aggregation outputs.

    The main dataset and the "ent_ref_agg", "cell_agg" and "top2_agg" files are
    read from s3. Each aggregation output is left-merged onto the main dataset
    in turn, keyed on the aggregation column(s), and the result is saved to s3
    as JSON records. Outside of the "development" run environment the three
    aggregation files are then deleted. An SNS message is sent on completion
    and a "DONE" status is sent to BPM.

    :param event: { "RuntimeVariables": {
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
            An empty string means it is not used as a merge key.
        aggregation_files - Holds the "ent_ref_agg", "cell_agg" and "top2_agg"
            file names.
        in_file_name / out_file_name - Input and output file names in s3.
        bpm_queue_url, environment, run_id, sns_topic_arn, survey, total_steps
    }}
    :param context: Lambda context, handed to the exception handler.
    :return: {"success": True}
    :raises LambdaFailure: If any stage of the module fails.
    """

    current_module = "Aggregation_Combiner"
    current_step_num = 5
    bpm_queue_url = None
    error_message = ""

    # run_id must exist before the try block: the exception handler is given it
    # even when the event itself cannot be read.
    run_id = 0
    try:
        run_id = event["RuntimeVariables"]["run_id"]

        env_settings = EnvironmentSchema().load(os.environ)
        run_settings = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the environment.
        bucket_name = env_settings["bucket_name"]
        run_environment = env_settings["run_environment"]

        # Values taken from the event.
        additional_aggregated_column = run_settings["additional_aggregated_column"]
        aggregated_column = run_settings["aggregated_column"]
        aggregation_files = run_settings["aggregation_files"]
        bpm_queue_url = run_settings["bpm_queue_url"]
        environment = run_settings["environment"]
        in_file_name = run_settings["in_file_name"]
        out_file_name = run_settings["out_file_name"]
        sns_topic_arn = run_settings["sns_topic_arn"]
        survey = run_settings["survey"]
        total_steps = run_settings["total_steps"]

        # A failure to build the logger is reported exactly like a
        # configuration failure, so it shares this handler.
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - Retrieved configuration variables.")

        main_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Retrieved data from s3")

        # Resolve all three file names first, then load them in merge order.
        agg_file_names = [
            aggregation_files[file_key]
            for file_key in ("ent_ref_agg", "cell_agg", "top2_agg")
        ]
        agg_frames = [
            aws_functions.read_dataframe_from_s3(bucket_name, file_name)
            for file_name in agg_file_names
        ]
        logger.info("Successfully retrievied aggragation data from s3")

        merge_keys = [aggregated_column]
        if additional_aggregated_column != "":
            merge_keys.append(additional_aggregated_column)

        # Left-merge each aggregation output onto the running result.
        combined_df = main_df
        for agg_frame in agg_frames:
            combined_df = pd.merge(
                combined_df, agg_frame, on=merge_keys, how="left")

        logger.info("Successfully merged dataframes")

        # Serialise and pass the combined data onwards.
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 combined_df.to_json(orient="records"))
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            for file_name in agg_file_names:
                logger.info(aws_functions.delete_data(bucket_name, file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Combiner.")
        logger.info("Successfully sent data to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)

    finally:
        # A non-empty message means the except branch above ran.
        module_failed = len(error_message) > 0
        if module_failed:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Tell BPM that this module has finished.
    aws_functions.send_bpm_status(bpm_queue_url, current_module, "DONE", run_id,
                                  current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    Wrangler that prepares data for the calculate top two statistical method.

    The input file is read from s3, converted to JSON records and passed to the
    method lambda together with the column settings. The data returned by the
    method is saved to s3 and an SNS message is sent.

    :param event: {"RuntimeVariables":{
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        total_columns - The names of the columns to produce aggregations for.
        top1_column - The prefix for the largest_contibutor column
        top2_column - The prefix for the second_largest_contibutor column
        in_file_name / out_file_name - Input and output file names in s3.
        bpm_queue_url, environment, run_id, sns_topic_arn, survey, total_steps
    }}
    :param context: Lambda context, handed to the exception handler.
    :return: {"success": True}
    :raises LambdaFailure: If any stage fails, including the method lambda
        reporting that it was not successful.
    """
    current_module = "Aggregation Calc Top Two - Wrangler."
    current_step_num = 5
    bpm_queue_url = None
    error_message = ""

    # run_id must exist before the try block: the exception handler is given it
    # even when the event itself cannot be read.
    run_id = 0
    try:
        run_id = event["RuntimeVariables"]["run_id"]

        # The client is created inside the handler rather than at module level.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        env_settings = EnvironmentSchema().load(os.environ)
        run_settings = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the environment.
        bucket_name = env_settings["bucket_name"]
        method_name = env_settings["method_name"]

        # Values taken from the event.
        additional_aggregated_column = run_settings["additional_aggregated_column"]
        aggregated_column = run_settings["aggregated_column"]
        bpm_queue_url = run_settings["bpm_queue_url"]
        environment = run_settings["environment"]
        in_file_name = run_settings["in_file_name"]
        out_file_name = run_settings["out_file_name"]
        sns_topic_arn = run_settings["sns_topic_arn"]
        survey = run_settings["survey"]
        top1_column = run_settings["top1_column"]
        top2_column = run_settings["top2_column"]
        total_columns = run_settings["total_columns"]
        total_steps = run_settings["total_steps"]

        # A failure to build the logger is reported exactly like a
        # configuration failure, so it shares this handler.
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables")

        # Tell BPM that this module has started.
        aws_functions.send_bpm_status(bpm_queue_url, current_module,
                                      "IN PROGRESS", run_id,
                                      current_step_num, total_steps)

        input_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Retrieved data from s3")

        logger.info("Converting dataframe to json.")
        records_json = input_df.to_json(orient="records")

        logger.info("Invoking the statistical method.")

        # Key order is kept as-is so the serialised payload is unchanged.
        method_arguments = {
            "additional_aggregated_column": additional_aggregated_column,
            "aggregated_column": aggregated_column,
            "bpm_queue_url": bpm_queue_url,
            "data": records_json,
            "environment": environment,
            "run_id": run_id,
            "survey": survey,
            "top1_column": top1_column,
            "top2_column": top2_column,
            "total_columns": total_columns
        }
        request_body = json.dumps({"RuntimeVariables": method_arguments})

        invocation = lambda_client.invoke(FunctionName=method_name,
                                          Payload=request_body)

        raw_response = invocation.get("Payload").read()
        json_response = json.loads(raw_response.decode("utf-8"))

        method_succeeded = json_response["success"]
        if not method_succeeded:
            raise exception_classes.MethodFailure(json_response["error"])

        # Output goes to s3, then the notification goes to SNS.
        logger.info("Sending function response downstream.")
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent the data to S3")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Top 2.")
        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above ran.
        module_failed = len(error_message) > 0
        if module_failed:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
# Example no. 3
# 0
def lambda_handler(event, context):
    """
    Wrangler that prepares the data so the IQRS method can be applied.

    The input file is read from s3, one zero-filled column per name returned
    by imp_func.produce_columns("iqrs_", questions_list) is added, and the
    data is passed to the method lambda. The data returned by the method is
    saved to s3. Outside of the "development" run environment the input file
    is then deleted. An SNS message is sent on completion.

    :param event: Contains all the variables which are required for the
        specific run, under "RuntimeVariables".
    :param context: Lambda context, handed to the exception handler.
    :return: {"success": True}
    :raises LambdaFailure: If any stage fails, including the method lambda
        reporting that it was not successful.
    """
    current_module = "Imputation IQRS - Wrangler."
    error_message = ""

    # Handed to the exception handler; stays None until the event is parsed.
    bpm_queue_url = None

    # run_id must exist before the try block: the exception handler is given it
    # even when the event itself cannot be read.
    run_id = 0

    try:
        run_id = event["RuntimeVariables"]["run_id"]

        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        env_settings = EnvironmentSchema().load(os.environ)
        run_settings = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the environment.
        bucket_name = env_settings["bucket_name"]
        method_name = env_settings["method_name"]
        run_environment = env_settings["run_environment"]

        # Values taken from the event.
        bpm_queue_url = run_settings["bpm_queue_url"]
        distinct_values = run_settings["distinct_values"]
        environment = run_settings["environment"]
        in_file_name = run_settings["in_file_name"]
        out_file_name = run_settings["out_file_name"]
        questions_list = run_settings["questions_list"]
        sns_topic_arn = run_settings["sns_topic_arn"]
        survey = run_settings["survey"]

        # A failure to build the logger is reported exactly like a
        # configuration failure, so it shares this handler.
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        input_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data.")

        # Add the columns the method fills in, initialised to 0.
        for iqrs_column in imp_func.produce_columns("iqrs_", questions_list):
            input_df[iqrs_column] = 0

        logger.info("IQRS columns successfully added")

        records_json = input_df.to_json(orient="records")

        logger.info("Dataframe converted to JSON")

        # Key order is kept as-is so the serialised payload is unchanged.
        method_arguments = {
            "bpm_queue_url": bpm_queue_url,
            "data": json.loads(records_json),
            "distinct_values": distinct_values,
            "environment": environment,
            "questions_list": questions_list,
            "run_id": run_id,
            "survey": survey
        }
        request_body = json.dumps({"RuntimeVariables": method_arguments})

        invocation = lambda_client.invoke(FunctionName=method_name,
                                          Payload=request_body)
        logger.info("Successfully invoked method.")

        raw_response = invocation.get("Payload").read()
        json_response = json.loads(raw_response.decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        method_succeeded = json_response["success"]
        if not method_succeeded:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data from s3.")

        aws_functions.send_sns_message(sns_topic_arn, "Imputation - IQRs.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above ran.
        module_failed = len(error_message) > 0
        if module_failed:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
# Example no. 4
# 0
def lambda_handler(event, context):
    """
    Wrangler that prepares data for enrichment and calls the enrichment method.

    The input file is read from s3, converted to JSON records and passed to the
    method lambda. The data returned by the method is saved to s3. If the
    returned anomalies are anything other than the string "[]" they are saved
    to "Enrichment_Anomalies", and the SNS message records whether there were
    any. "IN PROGRESS" and "DONE" statuses are sent to BPM at the start and end.

    :param event: Input for the run, with its settings under "RuntimeVariables".
    :param context: Lambda context, handed to the exception handler.
    :return: {"success": True}
    :raises LambdaFailure: If any stage fails, including the method lambda
        reporting that it was not successful.
    """

    current_module = "Enrichment - Wrangler"
    current_step_num = 2
    error_message = ""

    # Handed to the exception handler; stays None until the event is parsed.
    bpm_queue_url = None

    # run_id must exist before the try block: the exception handler is given it
    # even when the event itself cannot be read.
    run_id = 0
    try:
        run_id = event["RuntimeVariables"]["run_id"]

        env_settings = EnvironmentSchema().load(os.environ)
        run_settings = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the environment.
        bucket_name = env_settings["bucket_name"]
        identifier_column = env_settings["identifier_column"]
        method_name = env_settings["method_name"]

        # Values taken from the event.
        bpm_queue_url = run_settings["bpm_queue_url"]
        environment = run_settings["environment"]
        lookups = run_settings["lookups"]
        in_file_name = run_settings["in_file_name"]
        out_file_name = run_settings["out_file_name"]
        marine_mismatch_check = run_settings["marine_mismatch_check"]
        period_column = run_settings["period_column"]
        sns_topic_arn = run_settings["sns_topic_arn"]
        survey = run_settings["survey"]
        survey_column = run_settings["survey_column"]
        total_steps = run_settings["total_steps"]

        # A failure to build the logger is reported exactly like a
        # configuration failure, so it shares this handler.
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        # Tell BPM that this module has started.
        aws_functions.send_bpm_status(bpm_queue_url, current_module,
                                      "IN PROGRESS", run_id,
                                      current_step_num, total_steps)

        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        input_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                        in_file_name)

        logger.info("Started - retrieved data from s3")
        records_json = input_df.to_json(orient="records")

        # Key order is kept as-is so the serialised payload is unchanged.
        method_arguments = {
            "bpm_queue_url": bpm_queue_url,
            "environment": environment,
            "data": records_json,
            "lookups": lookups,
            "marine_mismatch_check": marine_mismatch_check,
            "survey": survey,
            "survey_column": survey_column,
            "period_column": period_column,
            "identifier_column": identifier_column,
            "run_id": run_id
        }
        request_body = json.dumps({"RuntimeVariables": method_arguments})

        invocation = lambda_client.invoke(FunctionName=method_name,
                                          Payload=request_body)

        logger.info("Successfully invoked method.")
        raw_response = invocation.get("Payload").read()
        json_response = json.loads(raw_response.decode("utf-8"))
        logger.info("JSON extracted from method response.")

        method_succeeded = json_response["success"]
        if not method_succeeded:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Successfully sent data to s3.")

        anomalies = json_response["anomalies"]

        # The anomalies value is compared against the literal string "[]".
        have_anomalies = anomalies != "[]"
        if have_anomalies:
            aws_functions.save_to_s3(bucket_name, "Enrichment_Anomalies",
                                     anomalies)

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn,
                                                      "Enrichment.")

        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)

    finally:
        # A non-empty message means the except branch above ran.
        module_failed = len(error_message) > 0
        if module_failed:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Tell BPM that this module has finished.
    aws_functions.send_bpm_status(bpm_queue_url, current_module, "DONE",
                                  run_id, current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    Wrangler that prepares data for the calculate movements statistical method.

    The method requires a column per question to store the movements, named
    'movement_questionNameAndNumber'. The wrangler checks the current period
    for non-responders (response type 1):

    - If there are any, the previous and current period data are saved to s3,
      both periods are reduced to returned rows (response type 2) whose
      reference appears in both periods, the movement columns are added and
      the method lambda is invoked. Its output is saved to out_file_name.
    - If there are none, calculate movements is skipped and the current period
      data is saved unchanged to out_file_name_skip.

    Exactly one SNS message is sent in either case; its text says whether
    imputation ran.

    :param event: Contains "RuntimeVariables", which holds the movement_type
        along with the other settings for the run.
    :param context: Lambda context, handed to the exception handler.
    :return: {"success": True, "impute": bool} - "impute" is False when the
        calculation was skipped.
    :raises LambdaFailure: If any stage fails, including no data being left
        after filtering or the method lambda reporting failure.
    """
    to_be_imputed = True
    current_module = "Imputation Movement - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        movement_type = runtime_variables["movement_type"]
        out_file_name = runtime_variables["out_file_name"]
        out_file_name_skip = runtime_variables["out_file_name_skip"]
        period = runtime_variables["period"]
        period_column = runtime_variables["period_column"]
        periodicity = runtime_variables["periodicity"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        # Only the first unique identifier is used to match rows across periods.
        reference = runtime_variables["unique_identifier"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send in progress status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        previous_period = general_functions.calculate_adjacent_periods(
            period, periodicity)
        logger.info("Completed reading data from s3")

        # Cast the period column once and reuse it for both splits.
        period_values = data[period_column].astype("str")
        previous_period_data = data[period_values == str(previous_period)]
        data = data[period_values == str(period)]
        logger.info("Split input data")

        # Create a Dataframe where the response column
        # value is set as 1 i.e non responders.
        # data only holds the current period by now, so no period mask is needed.
        filtered_non_responders = data[data[response_type] == 1]

        logger.info("Successfully created filtered non responders DataFrame")

        response_check = len(filtered_non_responders.index)

        # If greater than 0 it means there is non-responders so Imputation need to be run
        if response_check > 0:

            # Save previous period data to s3 for apply to pick up later
            aws_functions.save_to_s3(
                bucket_name, previous_data,
                previous_period_data.to_json(orient="records"))
            # Save raw data to s3 for apply to pick up later
            aws_functions.save_to_s3(bucket_name, current_data,
                                     data.to_json(orient="records"))
            logger.info("Successfully sent data.")

            # Ensure that only responder_ids with a response
            # type of 2 (returned) get picked up
            data = data[data[response_type] == 2]
            previous_period_data = \
                previous_period_data[previous_period_data[response_type] == 2]

            # Ensure that only rows that exist in both current and previous get picked up.
            # dropna() also removes any row that has a missing value.
            data = data[data[reference].isin(
                previous_period_data[reference])].dropna()
            previous_period_data = previous_period_data[
                previous_period_data[reference].isin(
                    data[reference])].dropna()

            # Merged together so it can be sent via the payload to the method
            merged_data = pd.concat([data, previous_period_data])

            # Make sure there is some data, non-responders were removed at this stage
            if len(merged_data.index) == 0:
                raise exception_classes.LambdaFailure(
                    "No data left after filtering")

            logger.info(
                "Successfully filtered and merged the previous period data"
            )

            for question in questions_list:
                merged_data["movement_" + question] = 0.0

            json_ordered_data = merged_data.to_json(orient="records")

            json_payload = {
                "RuntimeVariables": {
                    "bpm_queue_url": bpm_queue_url,
                    "current_period": period,
                    "data": json.loads(json_ordered_data),
                    "environment": environment,
                    "movement_type": movement_type,
                    "period_column": period_column,
                    "previous_period": previous_period,
                    "questions_list": questions_list,
                    "run_id": run_id,
                    "survey": survey
                }
            }

            logger.info("Successfully created movement columns on the data")

            imputed_data = lambda_client.invoke(
                FunctionName=method_name, Payload=json.dumps(json_payload))

            logger.info("Successfully invoked method.")

            json_response = json.loads(
                imputed_data.get("Payload").read().decode("UTF-8"))
            logger.info("JSON extracted from method response.")

            if not json_response["success"]:
                raise exception_classes.MethodFailure(json_response["error"])

            imputation_run_type = "Calculate Movement."
            aws_functions.save_to_s3(bucket_name, out_file_name,
                                     json_response["data"])

            logger.info("Successfully sent the data to s3")

        else:

            to_be_imputed = False
            imputation_run_type = "Has Not Run."

            aws_functions.save_to_s3(bucket_name, out_file_name_skip,
                                     data.to_json(orient="records"))

            logger.info("Successfully sent the unchanged data to s3")

        # Single notification shared by both branches. The skip branch used to
        # send its own message as well, so one event produced two notifications.
        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - " + imputation_run_type)

        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above ran.
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True, "impute": to_be_imputed}
def lambda_handler(event, context):
    """
    Wrangler for the recalculate-means stage of imputation.
    - Reads the input data from S3.
    - For every question, drops the movement count/sum, atyp and iqrs columns
      and adds a zero-filled mean column.
    - Invokes the means method lambda with the prepared data.
    - Saves the data returned by the method to S3 and sends an SNS message.

    :param event: Dict holding the run configuration under "RuntimeVariables".
    :param context: Passed through to the exception handler.
    :return: {"success": True} - Type: JSON
    :raises LambdaFailure: If any stage of the wrangler fails.
    """
    current_module = "Imputation Recalculate Means - Wrangler."
    error_message = ""

    # Fallback values so the exception handling works even when the
    # configuration could not be loaded.
    run_id = 0
    bpm_queue_url = None

    try:
        # Read ahead of schema validation because exception handling reports it.
        run_id = event["RuntimeVariables"]["run_id"]

        lambda_client = boto3.client("lambda", "eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the lambda's environment.
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Values supplied for this particular run.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]

        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        means_frame = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        # Replace each question's working columns with an empty mean column.
        for question in questions_list:
            stale_column_groups = (
                ["movement_" + question + "_count"],
                ["movement_" + question + "_sum"],
                ["atyp_" + question, "iqrs_" + question],
            )
            for column_group in stale_column_groups:
                means_frame.drop(column_group, axis=1, inplace=True)
            means_frame["mean_" + question] = 0.0

        records = json.loads(means_frame.to_json(orient="records"))
        method_request = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": records,
                "distinct_values": distinct_values,
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "survey": survey
            }
        }

        invocation = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(method_request))
        logger.info("Successfully invoked method.")

        raw_reply = invocation.get("Payload").read()
        method_output = json.loads(raw_reply.decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not method_output["success"]:
            raise exception_classes.MethodFailure(method_output["error"])

        aws_functions.save_to_s3(
            bucket_name, out_file_name, method_output["data"])
        logger.info("Successfully sent data to s3.")

        # The input file is only kept around in development.
        if run_environment != "development":
            deletion_report = aws_functions.delete_data(
                bucket_name, in_file_name)
            logger.info(deletion_report)
            logger.info("Successfully deleted input data from s3.")

        aws_functions.send_sns_message(
            sns_topic_arn, "Imputation - Recalculate Means.")
        logger.info("Successfully sent message to sns.")

    except Exception as err:
        error_message = general_functions.handle_exception(
            err,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above was taken.
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info(f"Successfully completed module: {current_module}")

    return {"success": True}
# Example no. 7
# 0
def lambda_handler(event, context):
    """
    This method is used to prepare data for the calculation of column totals.

    Reads the input data from S3, passes it (as a JSON string) to the method
    lambda together with the aggregation settings, saves the method's output
    to S3 and sends an SNS message.

    :param event: {"RuntimeVariables":{
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        aggregation_type - How we wish to do the aggregation. e.g. sum, count, nunique.
        total_columns - The names of the columns to produce aggregations for.
        cell_total_column - Name of column to rename each total_column.
                        Is concatenated to the front of the total_column name.
    }}

    :param context: Passed through to the exception handler.
    :return: {"success": True}
            or LambdaFailure exception
    :raises LambdaFailure: If configuration loading, the method invocation or
            any S3/SNS step fails.
    """
    current_module = "Aggregation by column - Wrangler"
    error_message = ""

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Needs to be declared inside the lambda_handler
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        aggregation_type = runtime_variables["aggregation_type"]
        cell_total_column = runtime_variables["cell_total_column"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_columns = runtime_variables["total_columns"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Read from S3 bucket
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Started - retrieved data from s3")

        # The data is handed to the method as a JSON string of records.
        formatted_data = data.to_json(orient="records")
        logger.info("Formatted disaggregated_data")

        json_payload = {
            "RuntimeVariables": {
                "additional_aggregated_column": additional_aggregated_column,
                "aggregated_column": aggregated_column,
                "aggregation_type": aggregation_type,
                "cell_total_column": cell_total_column,
                "data": formatted_data,
                "environment": environment,
                "run_id": run_id,
                "survey": survey,
                "total_columns": total_columns
            }
        }

        by_column = lambda_client.invoke(FunctionName=method_name,
                                         Payload=json.dumps(json_payload))

        json_response = json.loads(
            by_column.get("Payload").read().decode("utf-8"))

        logger.info("Successfully invoked the method lambda")

        # The method reports failure in its response body rather than raising.
        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent the data to S3")

        aws_functions.send_sns_message(
            sns_topic_arn, "Aggregation - " + aggregated_column + ".")

        logger.info("Successfully sent the SNS message")

    except Exception as e:
        # context is passed by keyword, matching every other
        # handle_exception call in this handler.
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
    finally:
        # A non-empty message means the except branch above was taken.
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the apply factors statistical method.
    The method requires a column per question to store the factors.

    Steps performed:
    - Reads the factors, current period and previous period data from S3.
    - Joins the previous period responder values (as "prev_" columns) and the
      imputation factors onto the current period non-responders.
    - Non-responders without a matching factor are joined to the factors of
      the regionless code instead.
    - Invokes the method lambda, joins its output with the current period
      responders, drops the factor and "prev_" columns and saves to S3.
    - Sends an SNS message and a "DONE" status to BPM.

    :param event:  Contains all the variables which are required for the specific run.
    :param context: Passed through to the exception handler.
    :return: Success & None/Error - Type: JSON
    :raises LambdaFailure: If any stage of the wrangler fails.
    """
    current_module = "Imputation Apply Factors - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        sum_columns = runtime_variables["sum_columns"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get factors data from calculate_factors
        factors_dataframe = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved factors data from s3")

        # Get data from module that preceded imputation
        input_data = aws_functions.read_dataframe_from_s3(
            bucket_name, current_data)

        # Split out non responder data from input
        non_responder_dataframe = input_data[input_data[response_type] == 1]
        logger.info("Successfully retrieved raw-input data from s3")

        # Read in previous period data for current period non-responders
        prev_period_data = aws_functions.read_dataframe_from_s3(
            bucket_name, previous_data)
        logger.info("Successfully retrieved previous period data from s3")
        # Filter so we only have those that responded in prev
        prev_period_data = prev_period_data[prev_period_data[response_type] ==
                                            2]

        prev_questions_list = produce_columns("prev_", questions_list,
                                              [reference])

        # Prefix the question columns so they do not clash with the
        # current period columns when merged.
        for question in questions_list:
            prev_period_data = prev_period_data.rename(
                index=str, columns={question: "prev_" + question})
        logger.info("Successfully renamed previous period data")

        non_responder_dataframe_with_prev = pd.merge(
            non_responder_dataframe,
            prev_period_data[prev_questions_list],
            on=reference,
        )
        logger.info(
            "Successfully merged previous period data with non-responder df")

        # Merge the factors onto the non responders
        non_responders_with_factors = pd.merge(
            non_responder_dataframe_with_prev,
            factors_dataframe[produce_columns("imputation_factor_",
                                              questions_list,
                                              distinct_values)],
            on=distinct_values,
            how="inner",
        )
        logger.info("Successfully merged non-responders with factors")

        # Collects all rows where an imputation factor doesn't exist.
        dropped_rows = non_responder_dataframe_with_prev[
            ~non_responder_dataframe_with_prev[reference].
            isin(non_responders_with_factors[reference])].dropna()

        if len(dropped_rows) > 0:
            # Work on a copy so the distinct_values list held in
            # runtime_variables is not altered as a side effect.
            merge_values = list(distinct_values)
            merge_values.remove(region_column)

            # Collect the GB region imputation factors if they exist.
            # The region column is deliberately left out of the selection
            # (merge_values, not distinct_values) so it is not duplicated
            # when merged onto dropped_rows.
            regionless_factors = \
                factors_dataframe[
                    produce_columns("imputation_factor_",
                                    questions_list,
                                    merge_values)
                ][factors_dataframe[region_column] == regionless_code]

            if len(merge_values) != 0:
                # Basic merge where we have values to merge on.
                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on=merge_values, how="inner")
            else:
                # Nothing left to merge on: give both sides a constant key.
                # assign() returns new frames instead of writing a column
                # onto a filtered slice of another dataframe.
                dropped_rows_with_factors = \
                    pd.merge(dropped_rows.assign(Temp_Key=0),
                             regionless_factors.assign(Temp_Key=0),
                             on="Temp_Key", how="inner")

                dropped_rows_with_factors = dropped_rows_with_factors.drop(
                    "Temp_Key", axis=1)

            non_responders_with_factors = \
                pd.concat([non_responders_with_factors, dropped_rows_with_factors])
            logger.info("Successfully merged missing rows with non_responders")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url":
                bpm_queue_url,
                "data":
                json.loads(
                    non_responders_with_factors.to_json(orient="records")),
                "environment":
                environment,
                "questions_list":
                questions_list,
                "run_id":
                run_id,
                "sum_columns":
                sum_columns,
                "survey":
                survey
            }
        }

        # Non responder data should now contain all previous values
        #   and the imputation columns
        imputed_data = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(payload),
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            imputed_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        # The method reports failure in its response body rather than raising.
        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        imputed_non_responders = pd.read_json(json_response["data"],
                                              dtype=False)

        # retrieve current responders from input data..
        current_responders = input_data[input_data[response_type] == 2]

        # Joining Datasets Together.
        final_imputed = pd.concat([current_responders, imputed_non_responders])
        logger.info("Successfully joined imputed data with responder data")

        # Create A List Of Factor Columns To Drop
        cols_to_drop = produce_columns(
            "imputation_factor_", questions_list,
            produce_columns("prev_", questions_list))

        filtered_data = final_imputed.drop(cols_to_drop, axis=1)

        message = filtered_data.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name, message)
        logger.info("Successfully sent data to s3.")

        # Input files are only kept around in development.
        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, current_data))
            logger.info(aws_functions.delete_data(bucket_name, previous_data))
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Apply Factors.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above was taken.
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    Wrangler for the step that adds regionless (all-GB) records.

    Reads the input data from S3, sends it to the method lambda along with
    the region column name and the regionless code, saves the method's output
    to S3 and sends an SNS message.

    :param event: Contains all the variables which are required for the specific run.
    :param context: Passed through to the exception handler.
    :return: {"success": True} - Type: JSON
    :raises LambdaFailure: If any stage of the wrangler fails.
    """
    current_module = "Add an all-GB region - Wrangler."
    error_message = ""

    # Fallback values so the exception handling works even when the
    # configuration could not be loaded.
    run_id = 0
    bpm_queue_url = None

    try:
        # Read ahead of schema validation because exception handling reports it.
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the lambda's environment.
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Values supplied for this particular run.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        factors_parameters = \
            runtime_variables["factors_parameters"]["RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]

        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        # Output of the module that ran before this step.
        source_frame = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved input data from s3")

        records = json.loads(source_frame.to_json(orient="records"))
        method_request = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": records,
                "environment": environment,
                "regionless_code": regionless_code,
                "region_column": region_column,
                "run_id": run_id,
                "survey": survey
            }
        }

        # The method lambda does the actual adding of the regionless region.
        invocation = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(method_request))
        logger.info("Successfully invoked method.")

        raw_reply = invocation.get("Payload").read()
        method_output = json.loads(raw_reply.decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not method_output["success"]:
            raise exception_classes.MethodFailure(method_output["error"])

        aws_functions.save_to_s3(
            bucket_name, out_file_name, method_output["data"])
        logger.info("Successfully sent data to s3.")

        # The input file is only kept around in development.
        if run_environment != "development":
            deletion_report = aws_functions.delete_data(
                bucket_name, in_file_name)
            logger.info(deletion_report)
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn, "Add a all-GB region.")
        logger.info("Successfully sent message to sns.")

    except Exception as err:
        error_message = general_functions.handle_exception(
            err,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above was taken.
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info(f"Successfully completed module: {current_module}")
    return {"success": True}
def lambda_handler(event, context):
    """
    Wrangler for the brick type ingest step.

    Reads the bricks survey data from the results S3 bucket, sends it to the
    method lambda together with the brick questions, brick types and brick
    type column from the ingestion parameters, and writes the method's output
    back to the results bucket. BPM is sent an "IN PROGRESS" status at the
    start and a "DONE" status at the end.

    :param event: Event object
    :param context: Context object, passed through to the exception handler.
    :return: {"success": True} - Type: JSON
    :raises LambdaFailure: If any stage of the wrangler fails.
    """
    current_module = "Results Ingest - Brick Type - Wrangler"
    error_message = ""

    # Fallback values so the exception handling works even when the
    # configuration could not be loaded.
    bpm_queue_url = None
    run_id = 0

    try:
        # Read ahead of schema validation because exception handling reports it.
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Values taken from the lambda's environment.
        method_name = environment_variables["method_name"]
        results_bucket_name = environment_variables["results_bucket_name"]

        # Values supplied for this particular run.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        ingestion_parameters = runtime_variables["ingestion_parameters"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Tell BPM this step has started.
        current_step_num = 1
        aws_functions.send_bpm_status(
            bpm_queue_url, current_module, "IN PROGRESS",
            run_id, current_step_num, total_steps)

        lambda_client = boto3.client("lambda", region_name="eu-west-2")
        bricks_frame = aws_functions.read_dataframe_from_s3(
            results_bucket_name, in_file_name)
        logger.info("Retrieved data from S3.")

        records = json.loads(bricks_frame.to_json(orient="records"))
        method_request = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "brick_questions": ingestion_parameters["brick_questions"],
                "brick_types": ingestion_parameters["brick_types"],
                "brick_type_column": ingestion_parameters["brick_type_column"],
                "data": records,
                "environment": environment,
                "run_id": run_id,
                "survey": survey
            },
        }

        invocation = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(method_request))
        logger.info("Successfully invoked method.")

        raw_reply = invocation.get("Payload").read()
        method_output = json.loads(raw_reply.decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not method_output["success"]:
            raise exception_classes.MethodFailure(method_output["error"])

        aws_functions.save_to_s3(
            results_bucket_name, out_file_name, method_output["data"])
        logger.info("Data ready for Results pipeline. Written to S3.")

        aws_functions.send_sns_message(sns_topic_arn, "Ingest.")
    except Exception as err:
        error_message = general_functions.handle_exception(
            err,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above was taken.
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Tell BPM this step has finished.
    aws_functions.send_bpm_status(
        bpm_queue_url, current_module, "DONE",
        run_id, current_step_num, total_steps)

    return {"success": True}
# Example no. 11
# 0
def lambda_handler(event, context):
    """
    Prepares the data for the Strata method.
    - Read the input data from S3.
    - Invoke the Strata method.
    - Save the returned data (and any anomalies) to S3, then notify SNS.

    :param event: Dict with a "RuntimeVariables" entry holding bpm_queue_url,
        distinct_values, environment, in_file_name, out_file_name, period, run_id,
        sns_topic_arn, survey, survey_column and total_steps.
    :param context: AWS context object, passed on to the exception handler.
    :return: {"success": True} when the module completes.
    :raises LambdaFailure: If loading the configuration, creating the logger or
        any processing step fails.
    """
    current_module = "Strata - Wrangler"
    error_message = ""
    bpm_queue_url = None
    current_step_num = 3

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]
        # Set up clients
        var_lambda = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        period_column = environment_variables["period_column"]
        segmentation = environment_variables["segmentation"]
        reference = environment_variables["reference"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_period = runtime_variables["period"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        # Only the first distinct value is used; it is sent on as region_column.
        region_column = runtime_variables["distinct_values"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        survey_column = runtime_variables["survey_column"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Started - retrieved configuration variables.")

        # Send start of module status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                       in_file_name)
        logger.info("Successfully retrieved data from s3")

        data_json = data_df.to_json(orient="records")
        json_payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "current_period": current_period,
                "data": data_json,
                "environment": environment,
                "period_column": period_column,
                "reference": reference,
                "region_column": region_column,
                "run_id": run_id,
                "segmentation": segmentation,
                "survey": survey,
                "survey_column": survey_column
            }
        }
        returned_data = var_lambda.invoke(FunctionName=method_name,
                                          Payload=json.dumps(json_payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            returned_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Push current period data onwards
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3")

        anomalies = json_response["anomalies"]

        # An anomalies value of "[]" is treated as "nothing to report", so the
        # anomalies file is only written when there is something else in it.
        have_anomalies = anomalies != "[]"
        if have_anomalies:
            aws_functions.save_to_s3(bucket_name, "Strata_Anomalies",
                                     anomalies)
        logger.info("Successfully sent anomalies to s3")

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn, "Strata.")

        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            # Log the message built by the exception handler before failing.
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end of module status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)
    return {"success": True}
def lambda_handler(event, context):
    """
    Responsible for executing specified disclosure methods, masking values which could
    be used to identify specific responders.

    The requested stages are run in sorted order; the data returned by one stage
    is the input data for the next. The output of the last stage is saved to S3
    as JSON and also as a CSV.
    :param event: JSON payload containing:
    RuntimeVariables:{
        bpm_queue_url: Queue url to send BPM status message.
        cell_total_column: The name of the column holding the cell total.
        disclosivity_marker: The name of the column to put "disclosive" marker.
        disclosure_stages: The stages of disclosure you wish to run, as a
            whitespace separated string e.g. (1 2 5). At least one is required.
        environment: The operating environment to use in the spp logger.
        explanation: The name of the column to put reason for pass/fail.
        final_output_location: Where the CSV copy of the final output is saved.
        in_file_name: Input file specified.
        out_file_name: Output file specified.
        parent_column: The name of the column holding the count of parent company.
        publishable_indicator: The name of the column to put "publish" marker.
        run_id: Identifier of the current run, used in logging/error handling.
        sns_topic_arn: The SNS topic that is notified on completion.
        stage5_threshold: The threshold used in the disclosure calculation.
        survey: The survey selected to be used in the logger.
        threshold: The threshold used in the disclosure steps.
        top1_column: The name of the column largest contributor to the cell.
        top2_column: The name of the column second largest contributor to the cell.
        total_columns: Passed on unchanged to every disclosure method
            (presumably the names of the total columns - confirm with the method).
        total_steps: The total number of steps in the system.
        unique_identifier: A list of the column names to specify a unique cell.
    }
    :param context: AWS Context Object.
    :return final_output: Dict - {"success": True}
    :raises LambdaFailure: If loading the configuration, creating the logger or
        any processing step (including a disclosure method) fails.
    """
    current_module = "Disclosure Wrangler"
    error_message = ""
    # Set-up variables for status message
    current_step_num = 6
    bpm_queue_url = None
    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        cell_total_column = runtime_variables["cell_total_column"]
        disclosivity_marker = runtime_variables["disclosivity_marker"]
        disclosure_stages = runtime_variables["disclosure_stages"]
        environment = runtime_variables["environment"]
        explanation = runtime_variables["explanation"]
        final_output_location = runtime_variables["final_output_location"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        parent_column = runtime_variables["parent_column"]
        publishable_indicator = runtime_variables["publishable_indicator"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        stage5_threshold = runtime_variables["stage5_threshold"]
        survey = runtime_variables["survey"]
        threshold = runtime_variables["threshold"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]
        total_steps = runtime_variables["total_steps"]
        unique_identifier = runtime_variables["unique_identifier"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send start of method status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                      current_step_num, total_steps)

        # Set up clients
        lambda_client = boto3.client("lambda", "eu-west-2")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        formatted_data = data.to_json(orient="records")

        disclosure_stages_list = disclosure_stages.split()
        if not disclosure_stages_list:
            # With no stages the loop below never runs and there is no method
            # response to save, so fail here with a message that says why.
            raise ValueError("No disclosure stages were specified.")
        disclosure_stages_list.sort()

        # Values sent to every stage; "data" is replaced after each stage.
        generic_json_payload = {
            "bpm_queue_url": bpm_queue_url,
            "data": formatted_data,
            "disclosivity_marker": disclosivity_marker,
            "environment": environment,
            "explanation": explanation,
            "publishable_indicator": publishable_indicator,
            "run_id": run_id,
            "survey": survey,
            "total_columns": total_columns,
            "unique_identifier": unique_identifier
        }

        # Stage specific values, in stage order (stage 1 first).
        # Stages 3 and 4 take no extra values.
        stage_payloads = [
            {"cell_total_column": cell_total_column},
            {"parent_column": parent_column, "threshold": threshold},
            {},
            {},
            {
                "top1_column": top1_column,
                "top2_column": top2_column,
                "cell_total_column": cell_total_column,
                "threshold": stage5_threshold
            }
        ]

        # Find the specific location where the stage number needs to be inserted
        # into the method name. It is the same for every stage.
        index = method_name.find("-method")

        for disclosure_step in disclosure_stages_list:

            # Index 0 is the generic payload so that the stage number can be used
            # directly as the index of the stage specific payload.
            payload_array = [generic_json_payload, *stage_payloads]

            # Constructs the relevant method name using the disclosure stage number.
            lambda_name = method_name[:index] + disclosure_step + method_name[index:]

            # Combines the generic payload and the stage specific payload.
            combined_input = {**payload_array[0], **(payload_array[int(disclosure_step)])}
            combined_input = {"RuntimeVariables": combined_input}

            formatted_data = invoke_method(lambda_name,
                                           combined_input,
                                           lambda_client)

            if not formatted_data["success"]:
                raise exception_classes.MethodFailure(formatted_data["error"])

            logger.info("Successfully invoked stage " + disclosure_step + " lambda")

            # The next stage must work on the data returned by this one.
            generic_json_payload = {**generic_json_payload,
                                    "data": formatted_data["data"]}

        aws_functions.save_to_s3(bucket_name, out_file_name, formatted_data["data"])

        logger.info("Successfully sent data to s3")

        output_data = formatted_data["data"]

        aws_functions.save_dataframe_to_csv(pd.read_json(output_data, dtype=False),
                                            bucket_name, final_output_location)

        aws_functions.send_sns_message(sns_topic_arn, "Disclosure")
        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of method status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
Exemplo n.º 13
0
def lambda_handler(event, context):
    """
    Wrangler for the Calculate imputation factors method.

    Reads the input data from S3, adds a zero-filled column for every name that
    imp_func.produce_columns returns for the "imputation_factor_" prefix, invokes
    the method, then saves the de-duplicated factor columns to S3 and notifies SNS.
    Outside of the "development" run environment the input file is deleted.
    :param event: Contains all the variables which are required for the specific run,
        under the "RuntimeVariables" key.
    :param context: lambda context
    :return: {"success": True} - Type: Dict
    :raises LambdaFailure: If loading the configuration, creating the logger or
        any processing step fails.
    """
    current_module = "Imputation Calculate Factors - Wrangler."
    error_message = ""

    # Both of these are used by the exception handling, so they need a value
    # before anything in the try block can fail.
    run_id = 0
    bpm_queue_url = None

    try:
        # Taken from the raw event, ahead of validation, for use in error handling.
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        env_vars = EnvironmentSchema().load(os.environ)
        run_vars = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = env_vars["bucket_name"]
        method_name = env_vars["method_name"]
        run_environment = env_vars["run_environment"]

        # Runtime Variables
        bpm_queue_url = run_vars["bpm_queue_url"]
        distinct_values = run_vars["distinct_values"]
        environment = run_vars["environment"]
        factors_parameters = run_vars["factors_parameters"]
        in_file_name = run_vars["in_file_name"]
        out_file_name = run_vars["out_file_name"]
        period_column = run_vars["period_column"]
        questions_list = run_vars["questions_list"]
        sns_topic_arn = run_vars["sns_topic_arn"]
        survey = run_vars["survey"]

    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        input_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        # The method needs the factor columns to be present, so add each of
        # them to the dataframe with a value of 0.
        new_columns = imp_func.produce_columns("imputation_factor_", questions_list)
        for column_name in new_columns:
            input_df[column_name] = 0

        records = json.loads(input_df.to_json(orient="records"))
        method_input = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": records,
                "environment": environment,
                "questions_list": questions_list,
                "distinct_values": distinct_values,
                "factors_parameters": factors_parameters,
                "run_id": run_id,
                "survey": survey
            }
        }

        # Call the method that calculates the factors.
        method_return = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(method_input))
        logger.info("Successfully invoked method.")

        response_text = method_return.get("Payload").read().decode("UTF-8")
        json_response = json.loads(response_text)
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        output_df = pd.read_json(json_response["data"], dtype=False)

        # The period column is kept in the output alongside the distinct values.
        distinct_values.append(period_column)
        columns_to_keep = imp_func.produce_columns(
            "imputation_factor_", questions_list, distinct_values)

        unique_rows = output_df[columns_to_keep].drop_duplicates()
        final_df = unique_rows.to_json(orient="records")
        aws_functions.save_to_s3(bucket_name, out_file_name, final_df)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            deletion_result = aws_functions.delete_data(bucket_name, in_file_name)
            logger.info(deletion_result)
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(
            sns_topic_arn, "Imputation - Calculate Factors.")
        logger.info("Successfully sent message to sns.")

    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id,
            context=context, bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above ran.
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
Exemplo n.º 14
0
def lambda_handler(event, context):
    """
    Ingests a Take On snapshot for the results pipeline.

    Reads the snapshot addressed by snapshot_s3_uri from S3, passes it to the
    ingest method and writes the data the method returns to the Results S3
    bucket, then notifies SNS.
    :param event: Event object; the run configuration is under "RuntimeVariables".
    :param context: Context object
    :return: Dict - {"success": True}
    :raises LambdaFailure: If loading the configuration, creating the logger or
        any processing step fails.
    """
    current_module = "Results Ingest - Takeon Data - Wrangler"
    error_message = ""
    # Step number reported in the BPM status messages.
    current_step_num = 1
    # Variables required for error handling.
    bpm_queue_url = None
    run_id = 0
    try:
        # Taken from the raw event, ahead of validation, for use in error handling.
        run_id = event["RuntimeVariables"]["run_id"]

        # Load variables.
        env_vars = EnvironmentSchema().load(os.environ)
        run_vars = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        method_name = env_vars["method_name"]
        results_bucket_name = env_vars["results_bucket_name"]

        # Runtime Variables.
        bpm_queue_url = run_vars["bpm_queue_url"]
        environment = run_vars["environment"]
        ingestion_parameters = run_vars["ingestion_parameters"]
        out_file_name = run_vars["out_file_name"]
        period = run_vars["period"]
        periodicity = run_vars["periodicity"]
        snapshot_s3_uri = run_vars["snapshot_s3_uri"]
        sns_topic_arn = run_vars["sns_topic_arn"]
        survey = run_vars["survey"]
        total_steps = run_vars["total_steps"]
    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id,
            context=context, bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id,
            context=context, bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Send in progress status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(
            bpm_queue_url, current_module, status, run_id,
            current_step_num, total_steps)

        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        # Split the S3 URI into bucket + name; the path part starts with a '/'
        # which is not part of the file name.
        parsed_uri = urlparse(snapshot_s3_uri)
        snapshot_bucket = parsed_uri.netloc
        snapshot_file = parsed_uri.path[1:]

        # Get the file from S3
        input_file = aws_functions.read_from_s3(
            snapshot_bucket, snapshot_file, file_extension="")

        logger.info(f"Read Snapshot {snapshot_file} from S3 bucket {snapshot_bucket}")

        snapshot_data = json.loads(input_file)
        method_variables = {
            "bpm_queue_url": bpm_queue_url,
            "data": snapshot_data,
            "environment": environment,
            "period": period,
            "periodicity": periodicity,
            "question_labels": ingestion_parameters["question_labels"],
            "run_id": run_id,
            "statuses": ingestion_parameters["statuses"],
            "survey": survey,
            "survey_codes": ingestion_parameters["survey_codes"]
        }
        payload = {"RuntimeVariables": method_variables}

        method_return = lambda_client.invoke(FunctionName=method_name,
                                             Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        response_text = method_return.get("Payload").read().decode("utf-8")
        json_response = json.loads(response_text)
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(
            results_bucket_name, out_file_name, json_response["data"])

        logger.info("Data ready for Results pipeline. Written to S3.")

        aws_functions.send_sns_message(sns_topic_arn, "Ingest.")
    except Exception as exc:
        error_message = general_functions.handle_exception(
            exc, current_module, run_id,
            context=context, bpm_queue_url=bpm_queue_url)
    finally:
        # A non-empty message means the except branch above ran.
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(
        bpm_queue_url, current_module, status, run_id,
        current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    Wrangle the input data ahead of aggregation and write two outputs to S3.

    The input dataframe holds one column per (brick type, total column) pair,
    named "<brick>_<column>". This handler:

    1. removes the rows flagged by do_check,
    2. stores the result of calculate_row_type in the first unique_identifier
       column (the brick type column),
    3. passes every row through sum_columns and drops the "<brick>_<column>"
       columns,
    4. invokes the configured method lambda on the data, then groups the
       response by unique_identifier[1:] and saves it as the region output,
    5. appends a copy of every non-concrete row re-labelled with the combined
       brick type code, groups by unique_identifier[0:2] and saves it as the
       bricks output.

    :param event: Contains all the variables which are required for the specific run
        (validated against RuntimeSchema under the "RuntimeVariables" key).
    :param context: Lambda context - only forwarded to the exception handler.

    :return: {"success": True} when the module completes - Type: JSON
    :raises exception_classes.LambdaFailure: If configuration cannot be loaded, the
        logger cannot be created, or any processing step fails.
    """
    current_module = "Pre Aggregation Data Wrangler."
    error_message = ""

    # Declared up front so the name always exists for exception handling,
    # matching the other handlers in this file.
    bpm_queue_url = None

    # Define run_id outside of try block
    run_id = 0

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        column_list = runtime_variables["total_columns"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name_bricks = runtime_variables["out_file_name_bricks"]
        out_file_name_region = runtime_variables["out_file_name_region"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        unique_identifier = runtime_variables["unique_identifier"]

        # Factors Parameters
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)
    try:
        logger.info("Started - retrieved configuration variables.")
        # Send start of module status to BPM.
        # (NB: Current step and total steps omitted to display as "-" in bpm.)
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id)

        # Pulls In Data.
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Retrieved data from s3")
        new_type = 1  # This number represents Clay & Sandlime Combined
        brick_type = {"clay": 3, "concrete": 2, "sandlime": 4}

        # The first unique identifier is the column that holds the brick type.
        brick_type_column = unique_identifier[0]

        # Prune rows that contain no data
        questions_list = [
            brick + "_" + column for column in column_list
            for brick in brick_type
        ]
        data["zero_data"] = data.apply(lambda x: do_check(x, questions_list),
                                       axis=1)
        # Filter and drop in one non-inplace step so no filtered slice is
        # mutated afterwards (avoids pandas' chained-assignment warning).
        data = data[~data["zero_data"]].drop(["zero_data"], axis=1)

        # Identify The Brick Type Of The Row.
        data[brick_type_column] = data.apply(
            lambda x: calculate_row_type(x, brick_type, column_list), axis=1)

        # Collate each row's populated brick type columns and its empty ones
        # down into a single set of columns named after column_list.
        data = data.apply(lambda x: sum_columns(x, brick_type, column_list,
                                                unique_identifier),
                          axis=1)

        # Old Columns With Brick Type In The Name Are Dropped (in one call
        # rather than one in-place drop per column).
        data = data.drop(questions_list, axis=1)

        # Add GB Region For Aggregation By Region.
        logger.info("Creating File For Aggregation By Region.")
        data_region = data.to_json(orient="records")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data_region),
                "environment": environment,
                "region_column": region_column,
                "regionless_code": regionless_code,
                "run_id": run_id,
                "survey": survey
            }
        }

        # Pass the data for processing (adding of the regionless region).
        gb_region_data = lambda_client.invoke(FunctionName=method_name,
                                              Payload=json.dumps(payload))
        logger.info("Succesfully invoked method.")

        json_response = json.loads(
            gb_region_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # The method returns its data as a JSON string, hence the second parse.
        region_dataframe = pd.DataFrame(json.loads(json_response["data"]))

        # Every total column is summed within each group, for both outputs.
        totals_dict = {total_column: "sum" for total_column in column_list}

        # The region output is grouped by everything except the brick type.
        data_region = region_dataframe.groupby(
            unique_identifier[1:]).agg(totals_dict).reset_index()

        region_output = data_region.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name_region,
                                 region_output)

        logger.info("Successfully sent data to s3")

        # Collate Brick Types Clay And Sand Lime Into A Single Type And Add To Data
        # For Aggregation By Brick Type.
        logger.info("Creating File For Aggregation By Brick Type.")

        # Explicit copy of the non-concrete rows so that re-labelling them
        # never writes through to (or warns about) the original frame.
        combined_rows = data[
            data[brick_type_column] != brick_type["concrete"]].copy()
        combined_rows[brick_type_column] = new_type

        # pd.concat builds a new frame, so "data" itself is left untouched.
        data_brick = pd.concat([data, combined_rows])

        brick_dataframe = data_brick.groupby(
            unique_identifier[0:2]).agg(totals_dict).reset_index()

        brick_output = brick_dataframe.to_json(orient="records")
        aws_functions.save_to_s3(bucket_name, out_file_name_bricks,
                                 brick_output)

        logger.info("Successfully sent data to s3")

        logger.info(
            aws_functions.send_sns_message(sns_topic_arn, "Pre Aggregation."))

        logger.info("Succesfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # error_message is only populated by the except branch above, so this
        # re-raises a handled failure as LambdaFailure after logging it.
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of module status to BPM.
    # (NB: Current step and total steps omitted to display as "-" in bpm.)
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id)

    return {"success": True}