def handle_exception(exception,
                     module,
                     run_id,
                     context=None,
                     bpm_queue_url=None):
    """
    Description: Generates an error message from an exception.
    Returns an error message detailing exception type, arguments, and line number.
    :param exception: Exception that has occurred - Type: Exception
    :param module: Name of current module - Type: String
    :param run_id: The current run's ID - Type: String
    :param context: AWS Context object
    (has default so that moving to glue will not require lots of changes)
    :param bpm_queue_url: The url of the queue to send the BPM status message to.
    :return error_message: Error message generated for exception - Type: String
    """
    exc_type, exc_obj, exc_tb = sys.exc_info()
    tb = traceback.extract_tb(exc_tb)[-1]
    error_message = str(exc_type) + " in " + module + " | RunID: " + str(run_id) + \
        " |- " + str(exception.args)
    if context:
        error_message += " | Request ID: " + str(context.aws_request_id)
    error_message += " | Outer line number: " + str(
        exception.__traceback__.tb_lineno)
    error_message += " | Inner Line number: " + str(tb[1]) + " in: " + str(
        tb[0])

    if bpm_queue_url:
        status_msg = "ERROR"
        aws_functions.send_bpm_status(bpm_queue_url, module, status_msg,
                                      run_id)

    return error_message
def send_status(status, module_name, current_step_num=None):
    aws_functions.send_bpm_status(
        bpm_queue_url,
        module_name,
        status,
        run_id,
        survey=pipeline,
        current_step_num=current_step_num,
        total_steps=num_methods,
    )
Пример #3
0
    def send_status(self, status, module_name, current_step_num=None):
        """
        Send a status message for the pipeline
        :param current_step_num: the number of the step in the pipeline - Int or None
        :param module_name: the name of the module to be reported - String
        :param status: The status to send - String
        :return:
        """
        if self.bpm_queue_url is None:
            return

        aws_functions.send_bpm_status(
            self.bpm_queue_url,
            module_name,
            status,
            self.run_id,
            survey="RSI",
            current_step_num=current_step_num,
            total_steps=len(self.methods),
        )
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the calculate top two
    statistical method.
    The method requires a dataframe which must contain the columns specified by:
    - aggregated column
    - additional aggregated column
    - total columns

    :param event: {"RuntimeVariables":{
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        total_columns - The names of the columns to produce aggregations for.
        top1_column - The prefix for the largest_contibutor column
        top2_column - The prefix for the second_largest_contibutor column
    }}
    :param context: N/A
    :return: {"success": True}
            or LambdaFailure exception
    """
    current_module = "Aggregation Calc Top Two - Wrangler."

    error_message = ""
    bpm_queue_url = None
    current_step_num = 5

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Needs to be declared inside of the lambda handler
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables")
        # Send start of module status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        # Read from S3 bucket
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Retrieved data from s3")

        # Serialise data
        logger.info("Converting dataframe to json.")
        prepared_data = data.to_json(orient="records")

        # Invoke aggregation top2 method
        logger.info("Invoking the statistical method.")

        json_payload = {
            "RuntimeVariables": {
                "additional_aggregated_column": additional_aggregated_column,
                "aggregated_column": aggregated_column,
                "bpm_queue_url": bpm_queue_url,
                "data": prepared_data,
                "environment": environment,
                "run_id": run_id,
                "survey": survey,
                "top1_column": top1_column,
                "top2_column": top2_column,
                "total_columns": total_columns
            }
        }

        top2 = lambda_client.invoke(FunctionName=method_name,
                                    Payload=json.dumps(json_payload))

        json_response = json.loads(top2.get("Payload").read().decode("utf-8"))

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Sending output to S3, notice to SNS
        logger.info("Sending function response downstream.")
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent the data to S3")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Top 2.")
        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
def lambda_handler(event, context):
    """
    This method takes the new columns and adds them all onto the main dataset.

    :param event: { "RuntimeVariables": {
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
    }}
    :param context:
    :return:
    """

    current_module = "Aggregation_Combiner"
    error_message = ""
    bpm_queue_url = None
    current_step_num = 5

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables["additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        aggregation_files = runtime_variables["aggregation_files"]
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - Retrieved configuration variables.")
        # Get file from s3
        imp_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Retrieved data from s3")

        # Receive the 3 aggregation outputs.
        ent_ref_agg = aggregation_files["ent_ref_agg"]
        cell_agg = aggregation_files["cell_agg"]
        top2_agg = aggregation_files["top2_agg"]

        # Load file content.
        ent_ref_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, ent_ref_agg)
        cell_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, cell_agg)
        top2_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, top2_agg)
        logger.info("Successfully retrievied aggragation data from s3")

        to_aggregate = [aggregated_column]
        if additional_aggregated_column != "":
            to_aggregate.append(additional_aggregated_column)

        # merge the imputation output from s3 with the 3 aggregation outputs
        first_merge = pd.merge(
            imp_df, ent_ref_agg_df, on=to_aggregate, how="left")

        second_merge = pd.merge(
            first_merge, cell_agg_df, on=to_aggregate, how="left")

        third_merge = pd.merge(
            second_merge, top2_agg_df, on=to_aggregate, how="left")

        logger.info("Successfully merged dataframes")

        # convert output to json ready to return
        final_output = third_merge.to_json(orient="records")

        # send output onwards
        aws_functions.save_to_s3(bucket_name, out_file_name, final_output)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, ent_ref_agg))
            logger.info(aws_functions.delete_data(bucket_name, cell_agg))
            logger.info(aws_functions.delete_data(bucket_name, top2_agg))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Combiner.")
        logger.info("Successfully sent data to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)

    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of module status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
Пример #6
0
def lambda_handler(event, context):
    """
    Lambda function preparing data for enrichment and then calling the enrichment method.
    :param event: Json string representing input - String
    :param context:
    :return Json: success and/or indication of error message.
    """

    # Set up logger.
    current_module = "Enrichment - Wrangler"
    error_message = ""

    bpm_queue_url = None
    current_step_num = 2

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        bucket_name = environment_variables["bucket_name"]
        identifier_column = environment_variables["identifier_column"]
        method_name = environment_variables["method_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables['environment']
        lookups = runtime_variables["lookups"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        marine_mismatch_check = runtime_variables["marine_mismatch_check"]
        period_column = runtime_variables["period_column"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables['survey']
        survey_column = runtime_variables["survey_column"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)

    try:

        # Send start of method status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        data_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                       in_file_name)

        logger.info("Started - retrieved data from s3")
        data_json = data_df.to_json(orient="records")
        json_payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "environment": environment,
                "data": data_json,
                "lookups": lookups,
                "marine_mismatch_check": marine_mismatch_check,
                "survey": survey,
                "survey_column": survey_column,
                "period_column": period_column,
                "identifier_column": identifier_column,
                "run_id": run_id
            }
        }
        response = lambda_client.invoke(FunctionName=method_name,
                                        Payload=json.dumps(json_payload))

        logger.info("Successfully invoked method.")
        json_response = json.loads(
            response.get("Payload").read().decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Successfully sent data to s3.")

        anomalies = json_response["anomalies"]

        if anomalies != "[]":
            aws_functions.save_to_s3(bucket_name, "Enrichment_Anomalies",
                                     anomalies)
            have_anomalies = True
        else:
            have_anomalies = False

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn,
                                                      "Enrichment.")

        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)

    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end of method status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the calculate movements statistical method.
    The method requires a column per question to store the movements, named as follows:
    'movement_questionNameAndNumber'. The wrangler checks for non response and if everyone
    has responded the calculate movements is skipped.
    :param event: Contains Runtime_variables, which contains the movement_type
    :param context: N/A
    :return: Success & Impute/Error - Type: JSON
    """
    to_be_imputed = True
    current_module = "Imputation Movement - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        movement_type = runtime_variables["movement_type"]
        out_file_name = runtime_variables["out_file_name"]
        out_file_name_skip = runtime_variables["out_file_name_skip"]
        period = runtime_variables["period"]
        period_column = runtime_variables["period_column"]
        periodicity = runtime_variables["periodicity"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send in progress status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        previous_period = general_functions.calculate_adjacent_periods(
            period, periodicity)
        logger.info("Completed reading data from s3")
        previous_period_data = data[data[period_column].astype("str") == str(
            previous_period)]
        data = data[data[period_column].astype("str") == str(period)]
        logger.info("Split input data")

        # Create a Dataframe where the response column
        # value is set as 1 i.e non responders
        filtered_non_responders = data.loc[(data[response_type] == 1) & (
            data[period_column].astype("str") == str(period))]

        logger.info("Successfully created filtered non responders DataFrame")

        response_check = len(filtered_non_responders.index)

        # If greater than 0 it means there is non-responders so Imputation need to be run
        if response_check > 0:

            # Save previous period data to s3 for apply to pick up later
            aws_functions.save_to_s3(
                bucket_name, previous_data,
                previous_period_data.to_json(orient="records"))
            # Save raw data to s3 for apply to pick up later
            aws_functions.save_to_s3(bucket_name, current_data,
                                     data.to_json(orient="records"))
            logger.info("Successfully sent data.")

            # Ensure that only responder_ids with a response
            # type of 2 (returned) get picked up
            data = data[data[response_type] == 2]
            previous_period_data = \
                previous_period_data[previous_period_data[response_type] == 2]

            # Ensure that only rows that exist in both current and previous get picked up.
            data = data[data[reference].isin(
                previous_period_data[reference])].dropna()
            previous_period_data = previous_period_data[
                previous_period_data[reference].isin(
                    data[reference])].dropna()  # noqa e501

            # Merged together so it can be sent via the payload to the method
            merged_data = pd.concat([data, previous_period_data])

            # Make sure there is some data, non-responders were removed at this stage
            if len(merged_data.index) > 0:
                logger.info(
                    "Successfully filtered and merged the previous period data"
                )
            else:
                raise exception_classes.LambdaFailure(
                    "No data left after filtering")

            for question in questions_list:
                merged_data["movement_" + question] = 0.0

            json_ordered_data = merged_data.to_json(orient="records")

            json_payload = {
                "RuntimeVariables": {
                    "bpm_queue_url": bpm_queue_url,
                    "current_period": period,
                    "data": json.loads(json_ordered_data),
                    "environment": environment,
                    "movement_type": movement_type,
                    "period_column": period_column,
                    "previous_period": previous_period,
                    "questions_list": questions_list,
                    "run_id": run_id,
                    "survey": survey
                }
            }

            logger.info("Successfully created movement columns on the data")

            imputed_data = lambda_client.invoke(
                FunctionName=method_name, Payload=json.dumps(json_payload))

            logger.info("Successfully invoked method.")

            json_response = json.loads(
                imputed_data.get("Payload").read().decode("UTF-8"))
            logger.info("JSON extracted from method response.")

            if not json_response["success"]:
                raise exception_classes.MethodFailure(json_response["error"])

            imputation_run_type = "Calculate Movement."
            aws_functions.save_to_s3(bucket_name, out_file_name,
                                     json_response["data"])

            logger.info("Successfully sent the data to s3")

        else:

            to_be_imputed = False
            imputation_run_type = "Has Not Run."

            aws_functions.save_to_s3(bucket_name, out_file_name_skip,
                                     data.to_json(orient="records"))

            logger.info("Successfully sent the unchanged data to s3")

            aws_functions.send_sns_message(sns_topic_arn,
                                           "Imputation - Did not run")
            logger.info("Successfully sent message to sns")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - " + imputation_run_type)

        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True, "impute": to_be_imputed}
Пример #8
0
def lambda_handler(event, context):
    error_message = ''
    current_module = "Success Handler."

    bpm_queue_url = None
    run_id = 0

    try:
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        run_id = runtime_variables["run_id"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        outcome = "PASS"
        jsonresponse = """ {"resultFlag": \"""" + str(outcome) \
                       + """\", "id": \"""" + run_id + """\"}"""
        jsonresponse = json.loads(jsonresponse)

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")
    # Send end status to BPM.
    status = "RUN COMPLETE"
    current_module = "BMI Results Processing Complete."
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id)

    return jsonresponse
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the apply factors statistical method.
    The method requires a column per question to store the factors.
    :param event:  Contains all the variables which are required for the specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Apply Factors - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        sum_columns = runtime_variables["sum_columns"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get factors data from calculate_factors
        factors_dataframe = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved factors data from s3")

        # Get data from module that preceded imputation
        input_data = aws_functions.read_dataframe_from_s3(
            bucket_name, current_data)

        # Split out non responder data from input
        non_responder_dataframe = input_data[input_data[response_type] == 1]
        logger.info("Successfully retrieved raw-input data from s3")

        # Read in previous period data for current period non-responders
        prev_period_data = aws_functions.read_dataframe_from_s3(
            bucket_name, previous_data)
        logger.info("Successfully retrieved previous period data from s3")
        # Filter so we only have those that responded in prev
        prev_period_data = prev_period_data[prev_period_data[response_type] ==
                                            2]

        prev_questions_list = produce_columns("prev_", questions_list,
                                              [reference])

        for question in questions_list:
            prev_period_data = prev_period_data.rename(
                index=str, columns={question: "prev_" + question})
        logger.info("Successfully renamed previous period data")

        non_responder_dataframe_with_prev = pd.merge(
            non_responder_dataframe,
            prev_period_data[prev_questions_list],
            on=reference,
        )
        logger.info(
            "Successfully merged previous period data with non-responder df")

        # Merge the factors onto the non responders
        non_responders_with_factors = pd.merge(
            non_responder_dataframe_with_prev,
            factors_dataframe[produce_columns("imputation_factor_",
                                              questions_list,
                                              distinct_values)],
            on=distinct_values,
            how="inner",
        )
        logger.info("Successfully merged non-responders with factors")

        # Collects all rows where an imputation factor doesn't exist.
        dropped_rows = non_responder_dataframe_with_prev[
            ~non_responder_dataframe_with_prev[reference].
            isin(non_responders_with_factors[reference])].dropna()

        if len(dropped_rows) > 0:
            merge_values = distinct_values
            merge_values.remove(region_column)

            # Collect the GB region imputation factors if they exist.
            regionless_factors = \
                factors_dataframe[
                    produce_columns("imputation_factor_",
                                    questions_list,
                                    distinct_values)
                ][factors_dataframe[region_column] == regionless_code]

            if len(merge_values) != 0:
                # Basic merge where we have values to merge on.
                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on=merge_values, how="inner")
            else:
                # Added a column to both dataframes to use for the merge.
                dropped_rows["Temp_Key"] = 0
                regionless_factors["Temp_Key"] = 0

                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on="Temp_Key", how="inner")

                dropped_rows_with_factors = dropped_rows_with_factors.drop(
                    "Temp_Key", axis=1)

            non_responders_with_factors = \
                pd.concat([non_responders_with_factors, dropped_rows_with_factors])
            logger.info("Successfully merged missing rows with non_responders")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url":
                bpm_queue_url,
                "data":
                json.loads(
                    non_responders_with_factors.to_json(orient="records")),
                "environment":
                environment,
                "questions_list":
                questions_list,
                "run_id":
                run_id,
                "sum_columns":
                sum_columns,
                "survey":
                survey
            }
        }

        # Non responder data should now contain all previous values
        #   and the imputation columns
        imputed_data = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(payload),
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            imputed_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        imputed_non_responders = pd.read_json(json_response["data"],
                                              dtype=False)

        # retrieve current responders from input data..
        current_responders = input_data[input_data[response_type] == 2]

        # Joining Datasets Together.
        final_imputed = pd.concat([current_responders, imputed_non_responders])
        logger.info("Successfully joined imputed data with responder data")

        # Create A List Of Factor Columns To Drop
        cols_to_drop = produce_columns(
            "imputation_factor_", questions_list,
            produce_columns("prev_", questions_list))

        filtered_data = final_imputed.drop(cols_to_drop, axis=1)

        message = filtered_data.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name, message)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, current_data))
            logger.info(aws_functions.delete_data(bucket_name, previous_data))
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Apply Factors.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    This method will take the simple bricks survey data and expand it to have seperate
     column for each brick type as expceted by the results pipeline. It'll then send it
     to the Results S3 bucket for further processing.
    :param event: Event object
    :param context: Context object
    :return: JSON String - {"success": boolean, "error": string}
    """
    current_module = "Results Ingest - Brick Type - Wrangler"
    error_message = ""
    # Variables required for error handling.
    bpm_queue_url = None
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Load variables.
        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        method_name = environment_variables["method_name"]
        results_bucket_name = environment_variables["results_bucket_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        ingestion_parameters = runtime_variables["ingestion_parameters"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Send in progress status to BPM.
        status = "IN PROGRESS"
        current_step_num = 1
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)
        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")
        data_df = aws_functions.read_dataframe_from_s3(results_bucket_name,
                                                       in_file_name)

        logger.info("Retrieved data from S3.")
        data_json = data_df.to_json(orient="records")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "brick_questions": ingestion_parameters["brick_questions"],
                "brick_types": ingestion_parameters["brick_types"],
                "brick_type_column": ingestion_parameters["brick_type_column"],
                "data": json.loads(data_json),
                "environment": environment,
                "run_id": run_id,
                "survey": survey
            },
        }

        method_return = lambda_client.invoke(FunctionName=method_name,
                                             Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            method_return.get("Payload").read().decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(results_bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Data ready for Results pipeline. Written to S3.")

        aws_functions.send_sns_message(sns_topic_arn, "Ingest.")
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
Пример #11
0
def lambda_handler(event, context):
    """
    prepares the data for the Strata method.
    - Read in data from the SQS queue.
    - Invoke the Strata Method.
    - Send data from the Strata method to the SQS queue.

    :param event:
    :param context:
    :return: string - Json string to send to the SNS topic upon completion
    """
    current_module = "Strata - Wrangler"
    error_message = ""
    log_message = ""
    bpm_queue_url = None
    current_step_num = 3

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]
        # Set up clients
        var_lambda = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        period_column = environment_variables["period_column"]
        segmentation = environment_variables["segmentation"]
        reference = environment_variables["reference"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_period = runtime_variables["period"]
        environment = runtime_variables['environment']
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        region_column = runtime_variables["distinct_values"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        survey_column = runtime_variables["survey_column"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Started - retrieved configuration variables.")

        # Send start of module status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                       in_file_name)
        logger.info("Successfully retrieved data from s3")

        data_json = data_df.to_json(orient="records")
        json_payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "current_period": current_period,
                "data": data_json,
                "environment": environment,
                "period_column": period_column,
                "reference": reference,
                "region_column": region_column,
                "run_id": run_id,
                "segmentation": segmentation,
                "survey": survey,
                "survey_column": survey_column
            }
        }
        returned_data = var_lambda.invoke(FunctionName=method_name,
                                          Payload=json.dumps(json_payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            returned_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Push current period data onwards
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3")

        anomalies = json_response["anomalies"]

        if anomalies != "[]":
            aws_functions.save_to_s3(bucket_name, "Strata_Anomalies",
                                     anomalies)
            have_anomalies = True
        else:
            have_anomalies = False
        logger.info("Successfully sent anomalies to s3")

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn, "Strata.")

        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(log_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end of module status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)
    return {"success": True}
def lambda_handler(event, context):
    """
    Responsible for executing specified disclosure methods, masking values which could
    be used to identify specific responders.
    :param event: JSON payload containing:
    RuntimeVariables:{
        bpm_queue_url: Queue url to send BPM status message.
        cell_total_column: The name of the column holding the cell total.
        disclosivity_marker: The name of the column to put "disclosive" marker.
        disclosure_stages: The stages of disclosure you wish to run e.g. (1 2 5)
        environment: The operating environment to use in the spp logger.
        explanation: The name of the column to put reason for pass/fail.
        in_file_name: Input file specified.
        out_file_name: Output file specified.
        parent_column: The name of the column holding the count of parent company.
        publishable_indicator: The name of the column to put "publish" marker.
        stage5_threshold: The threshold used in the disclosure calculation.
        survey: The survey selected to be used in the logger.
        threshold: The threshold used in the disclosure steps.
        top1_column: The name of the column largest contributor to the cell.
        top2_column: The name of the column second largest contributor to the cell.
        total_column: The name of the column holding the cell total.
        total_steps: The total number of steps in the system.
        unique_identifier: A list of the column names to specify a unique cell.
    }
    :param context: AWS Context Object.
    :return final_output: Dict containing either:
        {"success": True}
        {"success": False, "error": <error message - Type: String>}
    """
    current_module = "Disclosure Wrangler"
    error_message = ""
    # Set-up variables for status message
    current_step_num = 6
    bpm_queue_url = None
    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        cell_total_column = runtime_variables["cell_total_column"]
        disclosivity_marker = runtime_variables["disclosivity_marker"]
        disclosure_stages = runtime_variables["disclosure_stages"]
        environment = runtime_variables["environment"]
        explanation = runtime_variables["explanation"]
        final_output_location = runtime_variables["final_output_location"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        parent_column = runtime_variables["parent_column"]
        publishable_indicator = runtime_variables["publishable_indicator"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        stage5_threshold = runtime_variables["stage5_threshold"]
        survey = runtime_variables["survey"]
        threshold = runtime_variables["threshold"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]
        total_steps = runtime_variables["total_steps"]
        unique_identifier = runtime_variables["unique_identifier"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send start of method status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                      current_step_num, total_steps)

        # Set up clients
        lambda_client = boto3.client("lambda", "eu-west-2")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        formatted_data = data.to_json(orient="records")

        disclosure_stages_list = disclosure_stages.split()
        disclosure_stages_list.sort()

        generic_json_payload = {
            "bpm_queue_url": bpm_queue_url,
            "data": formatted_data,
            "disclosivity_marker": disclosivity_marker,
            "environment": environment,
            "explanation": explanation,
            "publishable_indicator": publishable_indicator,
            "run_id": run_id,
            "survey": survey,
            "total_columns": total_columns,
            "unique_identifier": unique_identifier
        }

        stage1_payload = {
            "cell_total_column": cell_total_column
        }

        stage2_payload = {
            "parent_column": parent_column,
            "threshold": threshold
        }

        stage3_payload = {
        }

        stage4_payload = {
        }

        stage5_payload = {
            "top1_column": top1_column,
            "top2_column": top2_column,
            "cell_total_column": cell_total_column,
            "threshold": stage5_threshold
        }

        for disclosure_step in disclosure_stages_list:

            payload_array = [generic_json_payload, stage1_payload, stage2_payload,
                             stage3_payload, stage4_payload, stage5_payload]

            # Find the specific location where the stage number need to be inserted and
            # constructs the relevant method name using the disclosure stage number.
            index = method_name.find("-method")
            lambda_name = method_name[:index] + disclosure_step + method_name[index:]

            # Combines the generic payload and the stage specific payload.
            combined_input = {**payload_array[0], **(payload_array[int(disclosure_step)])}
            combined_input = {"RuntimeVariables": combined_input}

            formatted_data = invoke_method(lambda_name,
                                           combined_input,
                                           lambda_client)

            if not formatted_data["success"]:
                raise exception_classes.MethodFailure(formatted_data["error"])

            logger.info("Successfully invoked stage " + disclosure_step + " lambda")

            # Located here as after the first loop it requires formatted data to be
            # referenced with "data" and the JSON needs to be reset to use the right data.
            generic_json_payload = {
                "bpm_queue_url": bpm_queue_url,
                "data": formatted_data["data"],
                "disclosivity_marker": disclosivity_marker,
                "environment": environment,
                "explanation": explanation,
                "publishable_indicator": publishable_indicator,
                "run_id": run_id,
                "survey": survey,
                "total_columns": total_columns,
                "unique_identifier": unique_identifier
            }

        aws_functions.save_to_s3(bucket_name, out_file_name, formatted_data["data"])

        logger.info("Successfully sent data to s3")

        output_data = formatted_data["data"]

        aws_functions.save_dataframe_to_csv(pd.read_json(output_data, dtype=False),
                                            bucket_name, final_output_location)

        aws_functions.send_sns_message(sns_topic_arn, "Disclosure")
        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send start of method status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
Пример #13
0
def lambda_handler(event, context):
    """
    This method will ingest data from Take On S3 bucket, transform it so that it fits
    in the results pipeline, and send it to the Results S3 bucket for further processing.
    :param event: Event object
    :param context: Context object
    :return: JSON String - {"success": boolean, "error": string}
    """
    current_module = "Results Ingest - Takeon Data - Wrangler"
    error_message = ""
    # Variables required for error handling.
    bpm_queue_url = None
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling.
        run_id = event["RuntimeVariables"]["run_id"]

        # Load variables.
        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        method_name = environment_variables["method_name"]
        results_bucket_name = environment_variables["results_bucket_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        ingestion_parameters = runtime_variables["ingestion_parameters"]
        out_file_name = runtime_variables["out_file_name"]
        period = runtime_variables["period"]
        periodicity = runtime_variables["periodicity"]
        snapshot_s3_uri = runtime_variables["snapshot_s3_uri"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Send in progress status to BPM.
        current_step_num = 1
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                      current_step_num, total_steps)

        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        # Wrangle the S3 URI into bucket + name.
        snapshot_parsed_uri = urlparse(snapshot_s3_uri)
        snapshot_bucket = snapshot_parsed_uri.netloc
        snapshot_file = snapshot_parsed_uri.path
        snapshot_file = snapshot_file[1:]  # Remove the leading '/'

        # Get the file from S3
        input_file = aws_functions.read_from_s3(snapshot_bucket,
                                                snapshot_file,
                                                file_extension="")

        logger.info(f"Read Snapshot {snapshot_file} from S3 bucket {snapshot_bucket}")

        payload = {

            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(input_file),
                "environment": environment,
                "period": period,
                "periodicity": periodicity,
                "question_labels": ingestion_parameters["question_labels"],
                "run_id": run_id,
                "statuses": ingestion_parameters["statuses"],
                "survey": survey,
                "survey_codes": ingestion_parameters["survey_codes"]
            },
        }

        method_return = lambda_client.invoke(
            FunctionName=method_name, Payload=json.dumps(payload)
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(method_return.get("Payload").read().decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(results_bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Data ready for Results pipeline. Written to S3.")

        aws_functions.send_sns_message(sns_topic_arn, "Ingest.")
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    The wrangler converts the data from JSON format into a dataframe and then edits data.
    This process consolidates 36 columns of data down to 12 and adds brick_type, then
    creates two outputs. One with the GB region added and one with a
    consolidated brick_type.

    :param event: Contains all the variables which are required for the specific run.
    :param context: N/A

    :return:  Success & None/Error - Type: JSON
    """
    current_module = "Pre Aggregation Data Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        column_list = runtime_variables["total_columns"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name_bricks = runtime_variables["out_file_name_bricks"]
        out_file_name_region = runtime_variables["out_file_name_region"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        unique_identifier = runtime_variables["unique_identifier"]

        # Factors Parameters
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)
    try:
        logger.info("Started - retrieved configuration variables.")
        # Send start of module status to BPM.
        # (NB: Current step and total steps omitted to display as "-" in bpm.)
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id)

        # Pulls In Data.
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Retrieved data from s3")
        new_type = 1  # This number represents Clay & Sandlime Combined
        brick_type = {"clay": 3, "concrete": 2, "sandlime": 4}

        # Prune rows that contain no data
        questions_list = [
            brick + "_" + column for column in column_list
            for brick in brick_type.keys()
        ]
        data["zero_data"] = data.apply(lambda x: do_check(x, questions_list),
                                       axis=1)
        data = data[~data["zero_data"]]
        data.drop(["zero_data"], axis=1, inplace=True)

        # Identify The Brick Type Of The Row.
        data[unique_identifier[0]] = data.apply(
            lambda x: calculate_row_type(x, brick_type, column_list), axis=1)

        # Collate Each Rows 12 Good Brick Type Columns And 24 Empty Columns Down
        # Into 12 With The Same Name.
        data = data.apply(lambda x: sum_columns(x, brick_type, column_list,
                                                unique_identifier),
                          axis=1)

        # Old Columns With Brick Type In The Name Are Dropped.
        for question in questions_list:
            data.drop([question], axis=1, inplace=True)

        # Add GB Region For Aggregation By Region.
        logger.info("Creating File For Aggregation By Region.")
        data_region = data.to_json(orient="records")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data_region),
                "environment": environment,
                "region_column": region_column,
                "regionless_code": regionless_code,
                "run_id": run_id,
                "survey": survey
            }
        }

        # Pass the data for processing (adding of the regionless region.
        gb_region_data = lambda_client.invoke(FunctionName=method_name,
                                              Payload=json.dumps(payload))
        logger.info("Succesfully invoked method.")

        json_response = json.loads(
            gb_region_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        region_dataframe = pd.DataFrame(json.loads(json_response["data"]))

        totals_dict = {total_column: "sum" for total_column in column_list}

        data_region = region_dataframe.groupby(
            unique_identifier[1:]).agg(totals_dict).reset_index()

        region_output = data_region.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name_region,
                                 region_output)

        logger.info("Successfully sent data to s3")

        # Collate Brick Types Clay And Sand Lime Into A Single Type And Add To Data
        # For Aggregation By Brick Type.
        logger.info("Creating File For Aggregation By Brick Type.")
        data_brick = data.copy()

        data = data[data[unique_identifier[0]] != brick_type["concrete"]]
        data[unique_identifier[0]] = new_type

        data_brick = pd.concat([data_brick, data])

        brick_dataframe = data_brick.groupby(
            unique_identifier[0:2]).agg(totals_dict).reset_index()

        brick_output = brick_dataframe.to_json(orient="records")
        aws_functions.save_to_s3(bucket_name, out_file_name_bricks,
                                 brick_output)

        logger.info("Successfully sent data to s3")

        logger.info(
            aws_functions.send_sns_message(sns_topic_arn, "Pre Aggregation."))

        logger.info("Succesfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of module status to BPM.
    # (NB: Current step and total steps omitted to display as "-" in bpm.)
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id)

    return {"success": True}