Example #1
def lambda_handler(event, context):
    current_module = "Error Capture"
    # Define run_id outside of try block.
    run_id = 0
    error_message = ''
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling.
        run_id = event['RuntimeVariables']["run_id"]

        runtime_variables = RuntimeSchema().load(event['RuntimeVariables'])
        # Runtime variables.
        environment = runtime_variables["environment"]
        error = runtime_variables["error"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Take the error message from event.
        runtime_error_message = error["Cause"]
        logger.info("Retrieved error message.")

        # Call send_sns_message.
        send_sns_message(runtime_error_message, sns_topic_arn)
        logger.info("Sent error to sns topic.")

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")
Example #2
def handler(payload, context):
    # We only have environment and our module name for logging
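    # (current_module, environment, emr_glue_name and spark_glue_job_capacity
    # are assumed to be module-level configuration in the original source.)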
    logger = general_functions.get_logger(None, current_module, environment,
                                          None)

    try:
        glue = boto3.client("glue")
        response = glue.start_job_run(
            JobName=emr_glue_name,
            Arguments=payload,
            MaxCapacity=spark_glue_job_capacity,
        )
        logger.info(
            f"Started job {emr_glue_name} with Id {response['JobRunId']}")
        return {'statusCode': 200, 'body': 'Glue job successfully started.'}

    except Exception:
        logger.exception("Error starting glue job")
        return {'statusCode': 500, 'body': 'Failed to start glue job.'}
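# For reference, Glue's start_job_run takes Arguments as a string-to-string
# map, and Glue job arguments are conventionally keyed with a "--" prefix.
# A sketch of a payload this handler might receive (keys are hypothetical):
example_glue_payload = {
    "--run_id": "run-0001",
    "--in_file_name": "input.json"
}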
Example #3
def lambda_handler(event, context):
    """
    Responsible for executing specified disclosure methods, masking values which could
    be used to identify specific responders.
    :param event: JSON payload containing:
    RuntimeVariables:{
        bpm_queue_url: Queue url to send BPM status message.
        cell_total_column: The name of the column holding the cell total.
        disclosivity_marker: The name of the column to put "disclosive" marker.
        disclosure_stages: The stages of disclosure you wish to run e.g. (1 2 5)
        environment: The operating environment to use in the spp logger.
        explanation: The name of the column to put reason for pass/fail.
        final_output_location: Location where the final output is written.
        in_file_name: Input file specified.
        out_file_name: Output file specified.
        parent_column: The name of the column holding the count of parent company.
        publishable_indicator: The name of the column to put "publish" marker.
        sns_topic_arn: The ARN of the SNS topic used for notifications.
        stage5_threshold: The threshold used in the disclosure calculation.
        survey: The survey selected to be used in the logger.
        threshold: The threshold used in the disclosure steps.
        top1_column: The name of the column holding the largest contributor to the cell.
        top2_column: The name of the column holding the second largest contributor to the cell.
        total_columns: The names of the columns holding the cell totals.
        total_steps: The total number of steps in the system.
        unique_identifier: A list of the column names to specify a unique cell.
    }
    :param context: AWS Context Object.
    :return final_output: Dict containing either:
        {"success": True}
        {"success": False, "error": <error message - Type: String>}
    """
    current_module = "Disclosure Wrangler"
    error_message = ""
    # Set up variables for status message
    current_step_num = 6
    bpm_queue_url = None
    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        cell_total_column = runtime_variables["cell_total_column"]
        disclosivity_marker = runtime_variables["disclosivity_marker"]
        disclosure_stages = runtime_variables["disclosure_stages"]
        environment = runtime_variables["environment"]
        explanation = runtime_variables["explanation"]
        final_output_location = runtime_variables["final_output_location"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        parent_column = runtime_variables["parent_column"]
        publishable_indicator = runtime_variables["publishable_indicator"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        stage5_threshold = runtime_variables["stage5_threshold"]
        survey = runtime_variables["survey"]
        threshold = runtime_variables["threshold"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]
        total_steps = runtime_variables["total_steps"]
        unique_identifier = runtime_variables["unique_identifier"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send start of method status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                      current_step_num, total_steps)

        # Set up clients
        lambda_client = boto3.client("lambda", "eu-west-2")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        formatted_data = data.to_json(orient="records")

        disclosure_stages_list = disclosure_stages.split()
        disclosure_stages_list.sort()
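        # e.g. a disclosure_stages value of "1 2 5" (see the docstring) becomes
        # ["1", "2", "5"]; lexicographic sorting matches numeric order here
        # because the stage numbers are single digits.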

        generic_json_payload = {
            "bpm_queue_url": bpm_queue_url,
            "data": formatted_data,
            "disclosivity_marker": disclosivity_marker,
            "environment": environment,
            "explanation": explanation,
            "publishable_indicator": publishable_indicator,
            "run_id": run_id,
            "survey": survey,
            "total_columns": total_columns,
            "unique_identifier": unique_identifier
        }

        stage1_payload = {
            "cell_total_column": cell_total_column
        }

        stage2_payload = {
            "parent_column": parent_column,
            "threshold": threshold
        }

        stage3_payload = {}

        stage4_payload = {}

        stage5_payload = {
            "top1_column": top1_column,
            "top2_column": top2_column,
            "cell_total_column": cell_total_column,
            "threshold": stage5_threshold
        }

        for disclosure_step in disclosure_stages_list:

            payload_array = [generic_json_payload, stage1_payload, stage2_payload,
                             stage3_payload, stage4_payload, stage5_payload]

            # Find the location where the stage number needs to be inserted and
            # construct the relevant method name using the disclosure stage number.
            index = method_name.find("-method")
            lambda_name = method_name[:index] + disclosure_step + method_name[index:]
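            # For example, with a hypothetical method_name of
            # "disclosure-method" and disclosure_step "1", lambda_name
            # becomes "disclosure1-method".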

            # Combines the generic payload and the stage specific payload.
            combined_input = {**payload_array[0], **(payload_array[int(disclosure_step)])}
            combined_input = {"RuntimeVariables": combined_input}

            formatted_data = invoke_method(lambda_name,
                                           combined_input,
                                           lambda_client)

            if not formatted_data["success"]:
                raise exception_classes.MethodFailure(formatted_data["error"])

            logger.info("Successfully invoked stage " + disclosure_step + " lambda")

            # Rebuilt here because, after the first loop, the formatted data must be
            # referenced via its "data" key and the payload reset to carry the latest data.
            generic_json_payload = {
                "bpm_queue_url": bpm_queue_url,
                "data": formatted_data["data"],
                "disclosivity_marker": disclosivity_marker,
                "environment": environment,
                "explanation": explanation,
                "publishable_indicator": publishable_indicator,
                "run_id": run_id,
                "survey": survey,
                "total_columns": total_columns,
                "unique_identifier": unique_identifier
            }

        aws_functions.save_to_s3(bucket_name, out_file_name, formatted_data["data"])

        logger.info("Successfully sent data to s3")

        output_data = formatted_data["data"]

        aws_functions.save_dataframe_to_csv(pd.read_json(output_data, dtype=False),
                                            bucket_name, final_output_location)

        aws_functions.send_sns_message(sns_topic_arn, "Disclosure")
        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of method status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
Example #4
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the calculate top two
    statistical method.
    The method requires a dataframe which must contain the columns specified by:
    - aggregated column
    - additional aggregated column
    - total columns

    :param event: {"RuntimeVariables":{
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        total_columns - The names of the columns to produce aggregations for.
        top1_column - The prefix for the largest_contributor column
        top2_column - The prefix for the second_largest_contributor column
    }}
    :param context: N/A
    :return: {"success": True}
            or LambdaFailure exception
    """
    current_module = "Aggregation Calc Top Two - Wrangler."

    error_message = ""
    bpm_queue_url = None
    current_step_num = 5

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Needs to be declared inside of the lambda handler
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables")
        # Send start of module status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        # Read from S3 bucket
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Retrieved data from s3")

        # Serialise data
        logger.info("Converting dataframe to json.")
        prepared_data = data.to_json(orient="records")

        # Invoke aggregation top2 method
        logger.info("Invoking the statistical method.")

        json_payload = {
            "RuntimeVariables": {
                "additional_aggregated_column": additional_aggregated_column,
                "aggregated_column": aggregated_column,
                "bpm_queue_url": bpm_queue_url,
                "data": prepared_data,
                "environment": environment,
                "run_id": run_id,
                "survey": survey,
                "top1_column": top1_column,
                "top2_column": top2_column,
                "total_columns": total_columns
            }
        }

        top2 = lambda_client.invoke(FunctionName=method_name,
                                    Payload=json.dumps(json_payload))

        json_response = json.loads(top2.get("Payload").read().decode("utf-8"))
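        # (The Payload above is a botocore streaming body that can only be
        # read once, which is why it is read, decoded and parsed in one step.)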

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Sending output to S3, notice to SNS
        logger.info("Sending function response downstream.")
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent the data to S3")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Top 2.")
        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
Example #5
def lambda_handler(event, context):
    """
    Performs enrichment process, joining 2 lookups onto data and detecting anomalies.
    :param event: event object.
    :param context: Context object.
    :return final_output: Dict with "success",
            "data" and "anomalies", or "success" and "error".
    """
    # Set up logger.
    current_module = "Enrichment - Method"
    error_message = ''

    bpm_queue_url = None

    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event['RuntimeVariables']['run_id']

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        bucket_name = environment_variables["bucket_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        data = runtime_variables['data']
        environment = runtime_variables['environment']
        identifier_column = runtime_variables["identifier_column"]
        lookups = runtime_variables['lookups']
        marine_mismatch_check = runtime_variables["marine_mismatch_check"]
        period_column = runtime_variables["period_column"]
        survey = runtime_variables['survey']
        survey_column = runtime_variables["survey_column"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        input_data = pd.read_json(data, dtype=False)

        logger.info("JSON converted to Pandas DF(s).")

        enriched_df, anomalies = data_enrichment(input_data,
                                                 marine_mismatch_check,
                                                 survey_column, period_column,
                                                 bucket_name, lookups,
                                                 identifier_column)

        logger.info("Enrichment function ran successfully.")

        json_out = enriched_df.to_json(orient="records")

        anomaly_out = anomalies.to_json(orient="records")

        logger.info("DF(s) converted back to JSON.")

        final_output = {"data": json_out, "anomalies": anomaly_out}
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output['success'] = True
    return final_output
Example #6
def lambda_handler(event, context):
    """
    This method is responsible for creating the movements for each question and then
    recording them in the respective columns.
    :param event: JSON payload that contains: movement_type, data, questions_list
                  Type: JSON.
    :param context: N/A
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Imputation Movement - Method"
    error_message = ""
    final_output = {}

    # Define run_id outside of try block
    run_id = 0

    # Set up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_period = runtime_variables["current_period"]
        environment = runtime_variables["environment"]
        json_data = runtime_variables["data"]
        movement_type = runtime_variables["movement_type"]
        period_column = runtime_variables["period_column"]
        previous_period = runtime_variables["previous_period"]
        questions_list = runtime_variables["questions_list"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get relative calculation function
        calculation = getattr(imp_func, movement_type)

        df = pd.DataFrame(json_data)

        sorted_current = df[df[period_column].astype("str") == str(current_period)].copy()
        sorted_previous = df[df[period_column].astype("str") == str(previous_period)]

        for question in questions_list:

            # Converted to list due to issues with Numpy dtypes and math operations.
            current_list = sorted_current[question].tolist()
            previous_list = sorted_previous[question].tolist()

            result_list = []

            # len is used to get the correct number of iterations for the loop.
            for i in range(0, len(sorted_current)):

                # This check is to prevent a ZeroDivisionError.
                if previous_list[i] != 0:
                    number = calculation(current_list[i], previous_list[i])
                else:
                    number = 0.0

                result_list.append(number)

            sorted_current["movement_" + question] = result_list

        filled_dataframe = sorted_current.fillna(0.0)
        logger.info("Successfully finished calculations of movement.")

        final_output = {"data": filled_dataframe.to_json(orient="records")}

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
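# The movement_type value names a calculation function looked up on imp_func
# above, taking (current, previous) and returning a number. A minimal sketch
# of such a function; the name and formula are assumptions for illustration,
# not the pipeline's actual definition:
def movement_calculation_a(current, previous):
    # Relative change between the previous and current period values.
    # The caller guards against previous == 0 before invoking this.
    return (current - previous) / previous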
Example #7
def lambda_handler(event, context):
    """
    Lambda function preparing data for enrichment and then calling the enrichment method.
    :param event: JSON string representing input - String
    :param context: AWS Context object.
    :return: JSON - success and/or indication of error message.
    """

    # Set up logger.
    current_module = "Enrichment - Wrangler"
    error_message = ""

    bpm_queue_url = None
    current_step_num = 2

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        bucket_name = environment_variables["bucket_name"]
        identifier_column = environment_variables["identifier_column"]
        method_name = environment_variables["method_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables['environment']
        lookups = runtime_variables["lookups"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        marine_mismatch_check = runtime_variables["marine_mismatch_check"]
        period_column = runtime_variables["period_column"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables['survey']
        survey_column = runtime_variables["survey_column"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)

    try:

        # Send start of method status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        data_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                       in_file_name)

        logger.info("Started - retrieved data from s3")
        data_json = data_df.to_json(orient="records")
        json_payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "environment": environment,
                "data": data_json,
                "lookups": lookups,
                "marine_mismatch_check": marine_mismatch_check,
                "survey": survey,
                "survey_column": survey_column,
                "period_column": period_column,
                "identifier_column": identifier_column,
                "run_id": run_id
            }
        }
        response = lambda_client.invoke(FunctionName=method_name,
                                        Payload=json.dumps(json_payload))

        logger.info("Successfully invoked method.")
        json_response = json.loads(
            response.get("Payload").read().decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Successfully sent data to s3.")

        anomalies = json_response["anomalies"]

        if anomalies != "[]":
            aws_functions.save_to_s3(bucket_name, "Enrichment_Anomalies",
                                     anomalies)
            have_anomalies = True
        else:
            have_anomalies = False

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn,
                                                      "Enrichment.")

        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)

    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end of method status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
Example #8
def lambda_handler(event, context):
    """
    Returns JSON data with new IQR columns and respective values.
    :param event: JSON payload that contains: data, questions_list, distinct_values.
                  Type: JSON.
    :param context: N/A.
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "IQRS - Method"
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        input_data = pd.DataFrame(runtime_variables["data"])
        questions_list = runtime_variables["questions_list"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        return {"success": False, "error": error_message}

    try:

        logger.info("Started - retrieved configuration variables.")

        movement_columns = produce_columns("movement_", questions_list)
        iqrs_columns = produce_columns("iqrs_", questions_list)

        iqrs_df = calc_iqrs(
            input_data,
            movement_columns,
            iqrs_columns,
            distinct_values
        )

        logger.info("Successfully finished calculations of IQRS.")

        json_out = iqrs_df.to_json(orient="records")
        final_output = {"data": json_out}

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
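# produce_columns evidently prefixes each question name (compare the
# "movement_" + question columns built in the movement method above); e.g.,
# with hypothetical question names:
# produce_columns("movement_", ["Q601", "Q602"])
#     -> ["movement_Q601", "movement_Q602"]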
Example #9
def lambda_handler(event, context):
    """
    Generates a JSON dataset, grouped by the given aggregated_column (e.g. county),
     with the given total_column (e.g. Q608_total) aggregated by the given
     aggregation_type (e.g. sum) as a new column called cell_total_column
     (e.g. county_total).

    :param event: {
        data - JSON String of the data.
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        aggregation_type - How we wish to do the aggregation. e.g. sum, count, nunique.
        total_columns - The names of the columns to produce aggregations for.
        cell_total_column - Name of column to rename total_column.
    }

    :param context: N/A
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Aggregation by column - Method"
    error_message = ""

    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        aggregation_type = runtime_variables["aggregation_type"]
        cell_total_column = runtime_variables["cell_total_column"]
        data = json.loads(runtime_variables["data"])
        environment = runtime_variables["environment"]
        survey = runtime_variables["survey"]
        total_columns = runtime_variables["total_columns"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info(
            "Started - retrieved configuration variables from wrangler.")
        input_dataframe = pd.DataFrame(data)
        totals_dict = {
            total_column: aggregation_type
            for total_column in total_columns
        }

        logger.info("JSON data converted to DataFrame.")

        to_aggregate = [aggregated_column]
        if additional_aggregated_column != "":
            to_aggregate.append(additional_aggregated_column)

        county_agg = input_dataframe.groupby(to_aggregate)

        agg_by_county_output = county_agg.agg(totals_dict) \
            .reset_index()

        for total_column in total_columns:
            if "total" not in cell_total_column:
                column_cell_total_column = cell_total_column
            else:
                column_cell_total_column = cell_total_column + "_" + total_column
            agg_by_county_output = agg_by_county_output.rename(
                columns={total_column: column_cell_total_column},
                inplace=False)

        logger.info("Column totals successfully calculated.")

        output_json = agg_by_county_output.to_json(orient="records")
        final_output = {"data": output_json}
        logger.info("DataFrame converted to JSON for output.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module.")
    final_output["success"] = True
    return final_output
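# A toy run of the groupby/agg/rename flow above, reusing the docstring's own
# example names (the data values are hypothetical):
import pandas as pd

toy = pd.DataFrame({"county": [1, 1, 2], "Q608_total": [10, 20, 5]})
toy_agg = toy.groupby(["county"]).agg({"Q608_total": "sum"}).reset_index()
toy_agg = toy_agg.rename(columns={"Q608_total": "county_total"})
# toy_agg now holds one row per county:
#    county  county_total
# 0       1            30
# 1       2             5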
Example #10
def lambda_handler(event, context):
    """
    This method will take the simple bricks survey data and expand it to have a separate
     column for each brick type as expected by the results pipeline. It'll then send it
     to the Results S3 bucket for further processing.
    :param event: Event object
    :param context: Context object
    :return: JSON String - {"success": boolean, "error": string}
    """
    current_module = "Results Ingest - Brick Type - Wrangler"
    error_message = ""
    # Variables required for error handling.
    bpm_queue_url = None
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Load variables.
        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables.
        method_name = environment_variables["method_name"]
        results_bucket_name = environment_variables["results_bucket_name"]

        # Runtime Variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        ingestion_parameters = runtime_variables["ingestion_parameters"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Send in progress status to BPM.
        status = "IN PROGRESS"
        current_step_num = 1
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)
        # Set up client.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")
        data_df = aws_functions.read_dataframe_from_s3(results_bucket_name,
                                                       in_file_name)

        logger.info("Retrieved data from S3.")
        data_json = data_df.to_json(orient="records")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "brick_questions": ingestion_parameters["brick_questions"],
                "brick_types": ingestion_parameters["brick_types"],
                "brick_type_column": ingestion_parameters["brick_type_column"],
                "data": json.loads(data_json),
                "environment": environment,
                "run_id": run_id,
                "survey": survey
            },
        }

        method_return = lambda_client.invoke(FunctionName=method_name,
                                             Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            method_return.get("Payload").read().decode("utf-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(results_bucket_name, out_file_name,
                                 json_response["data"])

        logger.info("Data ready for Results pipeline. Written to S3.")

        aws_functions.send_sns_message(sns_topic_arn, "Ingest.")
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
Example #11
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the addition of regionless records.
    :param event: Contains all the variables which are required for the specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Add an all-GB region - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables['environment']
        factors_parameters = runtime_variables["factors_parameters"]["RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables['survey']

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Started - retrieved configuration variables.")

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        # Get data from module that preceded this step
        input_data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Successfully retrieved input data from s3")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(
                    input_data.to_json(orient="records")),
                "environment": environment,
                "regionless_code": regionless_code,
                "region_column": region_column,
                "run_id": run_id,
                "survey": survey
            }
        }

        # Pass the data for processing (adding of the regionless region)
        imputed_data = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(payload),
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(imputed_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Save
        aws_functions.save_to_s3(bucket_name, out_file_name, json_response["data"])
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn, "Add a all-GB region.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)
    return {"success": True}
Example #12
def lambda_handler(event, context):
    """
    Applies Calculate strata function to row of DataFrame.
    :param event: Event Object.
    :param context: Context object.
    :return: strata_out - Dict with "success" and "data" or "success" and "error".
    """
    current_module = "Strata - Method"
    error_message = ""
    bpm_queue_url = None
    # Define run_id outside of try block
    run_id = 0

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        strata_column = environment_variables["strata_column"]
        value_column = environment_variables["value_column"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_period = runtime_variables["current_period"]
        data = runtime_variables["data"]
        environment = runtime_variables['environment']
        period_column = runtime_variables["period_column"]
        reference = runtime_variables["reference"]
        region_column = runtime_variables["region_column"]
        segmentation = runtime_variables["segmentation"]
        survey = runtime_variables['survey']
        survey_column = runtime_variables["survey_column"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")
        input_data = pd.read_json(data, dtype=False)
        post_strata = input_data.apply(
            calculate_strata,
            strata_column=strata_column,
            value_column=value_column,
            survey_column=survey_column,
            region_column=region_column,
            axis=1,
        )
        logger.info("Successfully ran calculation")

        # Perform mismatch detection
        strata_check, anomalies = strata_mismatch_detector(
            post_strata,
            current_period,
            period_column,
            reference,
            segmentation,
            "good_" + segmentation,
            "current_" + period_column,
            "previous_" + period_column,
            "current_" + segmentation,
            "previous_" + segmentation)

        json_out = strata_check.to_json(orient="records")
        anomalies_out = anomalies.to_json(orient="records")

        final_output = {"data": json_out, "anomalies": anomalies_out}

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
Example #13
def lambda_handler(event, context):
    """
    This method will take the simple bricks survey data and expand it to have a separate
     column for each brick type as expected by the results pipeline. It'll then send it
     to the Results S3 bucket for further processing.
    :param event: Event object
    :param context: Context object
    :return: Dict with "success" and "data" or "success" and "error".
    """
    current_module = "Results Ingest - Brick Type - Method"
    error_message = ""
    # Variables required for error handling.
    run_id = 0
    bpm_queue_url = None
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event['RuntimeVariables']['run_id']
        # Extract runtime variables.
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        bpm_queue_url = runtime_variables["bpm_queue_url"]
        brick_questions = runtime_variables['brick_questions']
        brick_type_column = runtime_variables['brick_type_column']
        brick_types = runtime_variables['brick_types']
        data = runtime_variables['data']
        environment = runtime_variables["environment"]
        survey = runtime_variables["survey"]
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved wrangler configuration variables.")
        # Apply changes to every responder and every brick type
        for respondent in data:
            for this_type in brick_types:

                # When it's not the brick type this responder supplied, fill with 0s.
                if respondent[brick_type_column] != this_type:
                    for this_question in brick_questions[str(this_type)]:
                        respondent[brick_questions[str(this_type)]
                                   [this_question]] = 0

                # When it's the brick type the responder supplied, use their data.
                else:
                    for this_question in brick_questions[str(this_type)]:
                        respondent[brick_questions[str(this_type)][this_question]] = \
                            respondent[this_question]

            # Remove the 'shared' questions for respondents
            if str(respondent[brick_type_column]) in str(brick_types):
                for this_question in brick_questions[str(
                        respondent[brick_type_column])]:
                    respondent.pop(this_question, None)
            # Remove the 'shared' question for non-respondents
            else:
                # Use the first brick type's question mapping to identify the
                # raw question names to drop.
                for this_question in brick_questions[next(iter(brick_questions))]:
                    respondent.pop(this_question, None)

        logger.info("Successfully expanded brick data.")
        final_output = {"data": json.dumps(data)}
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module.")
    final_output['success'] = True
    return final_output
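# A sketch of the input shapes the expansion loop above assumes (type codes,
# question names and column names are all hypothetical):
# - brick_types: a list of brick type codes, e.g. [2, 3]
# - brick_questions: maps each type code (as a string) to a dict of
#   {raw_question_name: expanded_column_name}, e.g.
#   {"2": {"q_sold": "clay_sold"}, "3": {"q_sold": "concrete_sold"}}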
Example #14
def lambda_handler(event, context):
    """
    Prepares the data for the Strata method.
    - Read in data from S3.
    - Invoke the Strata Method.
    - Send data from the Strata method to S3.

    :param event: Event object containing RuntimeVariables.
    :param context: AWS Context object.
    :return: string - JSON string to send to the SNS topic upon completion
    """
    current_module = "Strata - Wrangler"
    error_message = ""
    log_message = ""
    bpm_queue_url = None
    current_step_num = 3

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]
        # Set up clients
        var_lambda = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        period_column = environment_variables["period_column"]
        segmentation = environment_variables["segmentation"]
        reference = environment_variables["reference"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_period = runtime_variables["period"]
        environment = runtime_variables['environment']
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        region_column = runtime_variables["distinct_values"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        survey_column = runtime_variables["survey_column"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)

        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Started - retrieved configuration variables.")

        # Send start of module status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data_df = aws_functions.read_dataframe_from_s3(bucket_name,
                                                       in_file_name)
        logger.info("Successfully retrieved data from s3")

        data_json = data_df.to_json(orient="records")
        json_payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "current_period": current_period,
                "data": data_json,
                "environment": environment,
                "period_column": period_column,
                "reference": reference,
                "region_column": region_column,
                "run_id": run_id,
                "segmentation": segmentation,
                "survey": survey,
                "survey_column": survey_column
            }
        }
        returned_data = var_lambda.invoke(FunctionName=method_name,
                                          Payload=json.dumps(json_payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            returned_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        # Push current period data onwards
        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3")

        anomalies = json_response["anomalies"]

        if anomalies != "[]":
            aws_functions.save_to_s3(bucket_name, "Strata_Anomalies",
                                     anomalies)
            have_anomalies = True
        else:
            have_anomalies = False
        logger.info("Successfully sent anomalies to s3")

        aws_functions.send_sns_message_with_anomalies(have_anomalies,
                                                      sns_topic_arn, "Strata.")

        logger.info("Successfully sent message to sns")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end of module status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)
    return {"success": True}
Example #15
def lambda_handler(event, context):
    """
    This method will ingest data from Take On S3 bucket, transform it so that it fits
    in the results pipeline, and send it to the Results S3 bucket for further processing.
    :param event: Event object
    :param context: Context object
    :return: Dict with "success" and "data" or "success and "error".
    """
    current_module = "Results Ingest - Takeon Data - Method"
    error_message = ""
    # Variables required for error handling.
    run_id = 0
    bpm_queue_url = None
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling.
        run_id = event["RuntimeVariables"]["run_id"]

        # Extract runtime variables.
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        input_json = runtime_variables["data"]
        period = runtime_variables["period"]
        periodicity = runtime_variables["periodicity"]
        previous_period = general_functions.calculate_adjacent_periods(
            period, periodicity)
        question_labels = runtime_variables["question_labels"]
        statuses = runtime_variables["statuses"]
        survey = runtime_variables["survey"]
        survey_codes = runtime_variables["survey_codes"]
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved wrangler configuration variables.")
        output_json = []
        for survey_node in input_json["data"]["allSurveys"]["nodes"]:
            if survey_node["survey"] in survey_codes:
                contributors = survey_node["contributorsBySurvey"]["nodes"]
                for contributor in contributors:
                    if contributor["period"] in (period, previous_period):
                        out_contrib = {}

                        # Basic contributor information.
                        out_contrib["survey"] = survey_codes[
                            contributor["survey"]]
                        out_contrib["period"] = str(contributor["period"])
                        out_contrib["responder_id"] = str(
                            contributor["reference"])
                        out_contrib["gor_code"] = contributor["region"]
                        out_contrib["enterprise_reference"] = str(
                            contributor["enterprisereference"])
                        out_contrib["enterprise_name"] = contributor[
                            "enterprisename"]

                        # Pre-populate default question answers.
                        for expected_question in question_labels.keys():
                            out_contrib[question_labels[expected_question]] = 0

                        # Where contributors provided an answer, use it instead.
                        for question in contributor[
                                "responsesByReferenceAndPeriodAndSurvey"][
                                    "nodes"]:  # noqa: E501
                            if question["questioncode"] in question_labels.keys() and \
                                    question["response"].isnumeric():
                                out_contrib[question_labels[question["questioncode"]]] \
                                    = int(question["response"])

                        # Convert the response statuses to types, used by
                        # results to check whether imputation should run.
                        # Assume all unknown statuses need to be imputed
                        # (this may change after further cross-team talks).
                        if contributor["status"] in statuses:
                            out_contrib["response_type"] = statuses[
                                contributor["status"]]
                        else:
                            out_contrib["response_type"] = 1

                        output_json.append(out_contrib)

        logger.info("Successfully extracted data from take on.")
        final_output = {"data": json.dumps(output_json)}
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module.")
    final_output["success"] = True
    return final_output
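
To make the transform above concrete, here is a hypothetical slice of the Take On extract and the flattened record it yields; the field values, survey_codes, question_labels and statuses are illustrative only.

# Hypothetical input slice and mappings (illustrative only).
question_labels = {"0601": "opening_stock"}
statuses = {"Clear": 2}
survey_codes = {"066": "066"}

input_json = {"data": {"allSurveys": {"nodes": [{
    "survey": "066",
    "contributorsBySurvey": {"nodes": [{
        "survey": "066", "period": 201809, "reference": 123456789,
        "region": "E", "enterprisereference": 998877665544,
        "enterprisename": "Bricks Ltd", "status": "Clear",
        "responsesByReferenceAndPeriodAndSurvey": {"nodes": [
            {"questioncode": "0601", "response": "100"}]}}]}}]}}}

# The loop above would emit a single contributor:
# {"survey": "066", "period": "201809", "responder_id": "123456789",
#  "gor_code": "E", "enterprise_reference": "998877665544",
#  "enterprise_name": "Bricks Ltd", "opening_stock": 100,
#  "response_type": 2}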
Example #16
def lambda_handler(event, context):
    """
    This method loops through each county and records the largest and second
    largest values against each record in the group.

    :param event: {
        data - JSON String of the data.
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        total_columns - The names of the columns to produce aggregations for.
        top1_column - The prefix for the largest_contributor column.
        top2_column - The prefix for the second_largest_contributor column.
    }
    :param context: N/A
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Aggregation Calc Top Two - Method"

    error_message = ""
    run_id = 0
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        data = json.loads(runtime_variables["data"])
        environment = runtime_variables["environment"]
        survey = runtime_variables["survey"]
        top1_column = runtime_variables["top1_column"]
        top2_column = runtime_variables["top2_column"]
        total_columns = runtime_variables["total_columns"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info(
            "Started - retrieved configuration variables from wrangler.")
        input_dataframe = pd.DataFrame(data)
        top_two_output = pd.DataFrame()
        logger.info("Invoking calc_top_two function on input dataframe")
        counter = 0
        for total_column in total_columns:
            response = calc_top_two(input_dataframe, total_column,
                                    aggregated_column,
                                    additional_aggregated_column, top1_column,
                                    top2_column)

            response = response.drop_duplicates()
            if counter == 0:
                top_two_output = response
            else:
                to_aggregate = [aggregated_column]
                if additional_aggregated_column != "":
                    to_aggregate.append(additional_aggregated_column)

                top_two_output = top_two_output.merge(response,
                                                      on=to_aggregate,
                                                      how="left")
            counter += 1

        response = top_two_output
        logger.info("Converting output dataframe to json")
        response_json = response.to_json(orient="records")
        final_output = {"data": response_json}
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}
    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
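
calc_top_two itself is not shown in this example. A minimal sketch of what such a helper could look like, using pandas nlargest per group; the real method's column naming and tie handling may well differ.

import pandas as pd


def calc_top_two_sketch(data, total_column, aggregated_column,
                        additional_aggregated_column,
                        top1_column, top2_column):
    """Hypothetical stand-in for calc_top_two: records the largest and
    second largest total_column values against every row in each group."""
    group_columns = [aggregated_column]
    if additional_aggregated_column != "":
        group_columns.append(additional_aggregated_column)

    def top_two(group):
        largest = group[total_column].nlargest(2).tolist()
        group[top1_column + "_" + total_column] = largest[0]
        group[top2_column + "_" + total_column] = \
            largest[1] if len(largest) > 1 else 0
        return group

    return data.groupby(group_columns, group_keys=False).apply(top_two)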
Example #17
def lambda_handler(event, context):
    """
    The wrangler is responsible for preparing the data so the IQRS method can be applied.
    :param event: Contains all the variables which are required for the specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation IQRS - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables['survey']

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Started - retrieved configuration variables.")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data.")
        iqrs_columns = imp_func.produce_columns("iqrs_", questions_list)
        for col in iqrs_columns:
            data[col] = 0

        logger.info("IQRS columns successfully added")

        data_json = data.to_json(orient="records")

        logger.info("Dataframe converted to JSON")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data_json),
                "distinct_values": distinct_values,
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "survey": survey
            }
        }

        wrangled_data = lambda_client.invoke(FunctionName=method_name,
                                             Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            wrangled_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data from s3.")

        aws_functions.send_sns_message(sns_topic_arn, "Imputation - IQRs.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
Example #18
def lambda_handler(event, context):
    """
    This method is used to prepare data for the calculation of column totals.

    :param event: {"RuntimeVariables":{
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
        aggregation_type - How we wish to do the aggregation. e.g. sum, count, nunique.
        total_columns - The names of the columns to produce aggregations for.
        cell_total_column - Name of column to rename each total_column.
                        Is concatenated to the front of the total_column name.
    }}

    :param context: N/A
    :return: {"success": True}
            or LambdaFailure exception
    """
    current_module = "Aggregation by column - Wrangler"
    error_message = ""

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Needs to be declared inside the lambda_handler
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables[
            "additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        aggregation_type = runtime_variables["aggregation_type"]
        cell_total_column = runtime_variables["cell_total_column"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_columns = runtime_variables["total_columns"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        # Read from S3 bucket
        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Started - retrieved data from s3")

        formatted_data = data.to_json(orient="records")
        logger.info("Formatted disaggregated_data")

        json_payload = {
            "RuntimeVariables": {
                "additional_aggregated_column": additional_aggregated_column,
                "aggregated_column": aggregated_column,
                "aggregation_type": aggregation_type,
                "cell_total_column": cell_total_column,
                "data": formatted_data,
                "environment": environment,
                "run_id": run_id,
                "survey": survey,
                "total_columns": total_columns
            }
        }

        by_column = lambda_client.invoke(FunctionName=method_name,
                                         Payload=json.dumps(json_payload))

        json_response = json.loads(
            by_column.get("Payload").read().decode("utf-8"))

        logger.info("Successfully invoked the method lambda")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent the data to S3")

        aws_functions.send_sns_message(
            sns_topic_arn, "Aggregation - " + aggregated_column + ".")

        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
Example #19
def lambda_handler(event, context):
    """
    This module is responsible for loading the config parameters of the survey
    given in the payload. It is also responsible for adding any parameters
    which are supplied at run time, such as period and id. The module also
    triggers the respective StepFunction, providing it with the payload.
    :return
    """
    current_module = "Config Loader"
    error_message = ""

    # Define run_id outside of try block.
    run_id = 0
    try:
        # Retrieve run_id before input validation.
        # Because it is used in exception handling.
        run_id = event['run_id']
        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event)

        bucket_name = environment_variables['bucket_name']
        config_suffix = environment_variables['config_suffix']
        environment = environment_variables['environment']
        folder_id = run_id
        file_path = environment_variables['file_path']
        payload_reference_name = environment_variables[
            'payload_reference_name']
        step_function_arn = environment_variables['step_function_arn']
        survey_arn_prefix = environment_variables['survey_arn_prefix']
        survey_arn_suffix = environment_variables['survey_arn_suffix']
        survey = runtime_variables[payload_reference_name]

    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context)
        raise exception_classes.LambdaFailure(error_message)

    try:

        logger.info("Retrieved configuration variables.")

        client = boto3.client('stepfunctions', region_name='eu-west-2')
        # Prefix the run_id with the survey.
        run_id = str(survey) + "-" + str(run_id)
        runtime_variables['run_id'] = run_id

        # Get the rest of the config from s3.
        config_file_name = file_path + survey + config_suffix
        config_string = aws_functions.read_from_s3(bucket_name,
                                                   config_file_name)
        combined_input = {**json.loads(config_string), **runtime_variables}

        # Setting File Path.
        location = combined_input["location"]
        full_location = location + "/" + folder_id + "/"

        combined_input[
            "final_output_location"] = location + "/0-latest/disclosure_out"
        combined_input["location"] = full_location

        for name, file_name in combined_input["file_names"].items():
            if isinstance(file_name, list):
                combined_input["file_names"][name] = [
                    full_location + this_name for this_name in file_name]
            else:
                combined_input["file_names"][name] = full_location + file_name

        # ARN.
        constructed_arn = creating_survey_arn(
            creating_step_arn(step_function_arn), survey, survey_arn_prefix,
            survey_arn_suffix)

        # Replace file for first checkpoint.
        if 'checkpoint_file' in combined_input:
            combined_input = set_checkpoint_start_file(
                combined_input['checkpoint_file'],
                combined_input['checkpoint'], combined_input)

        sf_response = client.start_execution(stateMachineArn=constructed_arn,
                                             name=run_id + "-" +
                                             str(random.randint(1000, 9999)),
                                             input=json.dumps(combined_input))

    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"execution_id": sf_response["executionArn"].split(":")[-1]}
Example #20
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the apply factors statistical method.
    The method requires a column per question to store the factors.
    :param event:  Contains all the variables which are required for the specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Apply Factors - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        sum_columns = runtime_variables["sum_columns"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get factors data from calculate_factors
        factors_dataframe = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved factors data from s3")

        # Get data from module that preceded imputation
        input_data = aws_functions.read_dataframe_from_s3(
            bucket_name, current_data)

        # Split out non responder data from input
        non_responder_dataframe = input_data[input_data[response_type] == 1]
        logger.info("Successfully retrieved raw-input data from s3")

        # Read in previous period data for current period non-responders
        prev_period_data = aws_functions.read_dataframe_from_s3(
            bucket_name, previous_data)
        logger.info("Successfully retrieved previous period data from s3")
        # Filter so we only have those that responded in prev
        prev_period_data = prev_period_data[
            prev_period_data[response_type] == 2]

        prev_questions_list = produce_columns("prev_", questions_list,
                                              [reference])

        for question in questions_list:
            prev_period_data = prev_period_data.rename(
                index=str, columns={question: "prev_" + question})
        logger.info("Successfully renamed previous period data")

        non_responder_dataframe_with_prev = pd.merge(
            non_responder_dataframe,
            prev_period_data[prev_questions_list],
            on=reference,
        )
        logger.info(
            "Successfully merged previous period data with non-responder df")

        # Merge the factors onto the non responders
        non_responders_with_factors = pd.merge(
            non_responder_dataframe_with_prev,
            factors_dataframe[produce_columns("imputation_factor_",
                                              questions_list,
                                              distinct_values)],
            on=distinct_values,
            how="inner",
        )
        logger.info("Successfully merged non-responders with factors")

        # Collects all rows where an imputation factor doesn't exist.
        dropped_rows = non_responder_dataframe_with_prev[
            ~non_responder_dataframe_with_prev[reference].
            isin(non_responders_with_factors[reference])].dropna()

        if len(dropped_rows) > 0:
            merge_values = distinct_values
            merge_values.remove(region_column)

            # Collect the GB region imputation factors if they exist.
            regionless_factors = \
                factors_dataframe[
                    produce_columns("imputation_factor_",
                                    questions_list,
                                    distinct_values)
                ][factors_dataframe[region_column] == regionless_code]

            if len(merge_values) != 0:
                # Basic merge where we have values to merge on.
                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on=merge_values, how="inner")
            else:
                # Added a column to both dataframes to use for the merge.
                dropped_rows["Temp_Key"] = 0
                regionless_factors["Temp_Key"] = 0

                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on="Temp_Key", how="inner")

                dropped_rows_with_factors = dropped_rows_with_factors.drop(
                    "Temp_Key", axis=1)

            non_responders_with_factors = \
                pd.concat([non_responders_with_factors, dropped_rows_with_factors])
            logger.info("Successfully merged missing rows with non_responders")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(
                    non_responders_with_factors.to_json(orient="records")),
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "sum_columns": sum_columns,
                "survey": survey
            }
        }

        # Non responder data should now contain all previous values
        #   and the imputation columns
        imputed_data = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(payload),
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            imputed_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        imputed_non_responders = pd.read_json(json_response["data"],
                                              dtype=False)

        # Retrieve current responders from input data.
        current_responders = input_data[input_data[response_type] == 2]

        # Joining Datasets Together.
        final_imputed = pd.concat([current_responders, imputed_non_responders])
        logger.info("Successfully joined imputed data with responder data")

        # Create A List Of Factor Columns To Drop
        cols_to_drop = produce_columns(
            "imputation_factor_", questions_list,
            produce_columns("prev_", questions_list))

        filtered_data = final_imputed.drop(cols_to_drop, axis=1)

        message = filtered_data.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name, message)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, current_data))
            logger.info(aws_functions.delete_data(bucket_name, previous_data))
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Apply Factors.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
Example #21
    ])
    bpm_queue_url = args["bpm_queue_url"]
    environment = args["environment"]
    run_id = args["run_id"]
    pipeline = args["pipeline"]
    snapshot_location = args["snapshot_location"]

    s3 = boto3.resource("s3", region_name="eu-west-2")
    config = json.load(
        s3.Object(f"spp-res-{environment}-config",
                  f"{pipeline}.json").get()["Body"])

    methods = config["methods"]
    num_methods = len(methods)

    logger = general_functions.get_logger(pipeline, component, environment,
                                          run_id)

    logger.info("Running pipeline %s with snapshot %s", pipeline,
                snapshot_location)
    # Set up extra params for ingest provided at runtime
    extra_ingest_params = {
        "run_id": run_id,
        "snapshot_location": snapshot_location
    }
    ingest_params = methods[0].get("params", {})
    ingest_params.update(extra_ingest_params)
    methods[0]["params"] = ingest_params

    spark = (pyspark.sql.SparkSession.builder.enableHiveSupport().appName(
        pipeline).getOrCreate())
    spark.sql("SET spark.sql.sources.partitionOverwriteMode=dynamic")
Example #22
def lambda_handler(event, context):
    error_message = ''
    current_module = "Success Handler."

    bpm_queue_url = None
    run_id = 0

    try:
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        run_id = runtime_variables["run_id"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")
        outcome = "PASS"
        jsonresponse = """ {"resultFlag": \"""" + str(outcome) \
                       + """\", "id": \"""" + run_id + """\"}"""
        jsonresponse = json.loads(jsonresponse)

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")
    # Send end status to BPM.
    status = "RUN COMPLETE"
    current_module = "BMI Results Processing Complete."
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id)

    return jsonresponse
Example #23
def lambda_handler(event, context):
    """
    Adds a regionless / all-GB region code
    :param event: JSON payload that contains: json_data, region column and regionless code
                    - Type: JSON.
    :param context: N/A
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Add an all-GB regions - Method"
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        json_data = runtime_variables["data"]
        regionless_code = runtime_variables["regionless_code"]
        region_column = runtime_variables["region_column"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get 2 copies of the data
        original_dataframe = pd.DataFrame(json_data)
        regionless_dataframe = pd.DataFrame(json_data)

        # Replace region in one of the sets
        regionless_dataframe[region_column] = regionless_code

        # Combine the original and region replaced data for output
        final_dataframe = pd.concat([original_dataframe, regionless_dataframe])

        final_output = {"data": final_dataframe.to_json(orient="records")}

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
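
A tiny worked example of the duplication above, with a hypothetical region column and regionless code:

import pandas as pd

# Hypothetical input: two regions, regionless code 14.
json_data = [{"region": 1, "Q601": 10}, {"region": 2, "Q601": 20}]
original_dataframe = pd.DataFrame(json_data)
regionless_dataframe = pd.DataFrame(json_data)
regionless_dataframe["region"] = 14

final_dataframe = pd.concat([original_dataframe, regionless_dataframe])
#    region  Q601
# 0       1    10
# 1       2    20
# 0      14    10
# 1      14    20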
Example #24
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the calculate movements statistical
    method. The method requires a column per question to store the movements,
    named 'movement_questionNameAndNumber'. The wrangler checks for non-response
    and, if everyone has responded, the calculate movements method is skipped.
    :param event: Contains Runtime_variables, which contains the movement_type
    :param context: N/A
    :return: Success & Impute/Error - Type: JSON
    """
    to_be_imputed = True
    current_module = "Imputation Movement - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None
    current_step_num = 4

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        movement_type = runtime_variables["movement_type"]
        out_file_name = runtime_variables["out_file_name"]
        out_file_name_skip = runtime_variables["out_file_name_skip"]
        period = runtime_variables["period"]
        period_column = runtime_variables["period_column"]
        periodicity = runtime_variables["periodicity"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Send in progress status to BPM.
        status = "IN PROGRESS"
        aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                      run_id, current_step_num, total_steps)

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        previous_period = general_functions.calculate_adjacent_periods(
            period, periodicity)
        logger.info("Completed reading data from s3")
        previous_period_data = data[
            data[period_column].astype("str") == str(previous_period)]
        data = data[data[period_column].astype("str") == str(period)]
        logger.info("Split input data")

        # Create a Dataframe where the response column
        # value is set as 1, i.e. non-responders
        filtered_non_responders = data.loc[(data[response_type] == 1) & (
            data[period_column].astype("str") == str(period))]

        logger.info("Successfully created filtered non responders DataFrame")

        response_check = len(filtered_non_responders.index)

        # If greater than 0 there are non-responders, so imputation must run
        if response_check > 0:

            # Save previous period data to s3 for apply to pick up later
            aws_functions.save_to_s3(
                bucket_name, previous_data,
                previous_period_data.to_json(orient="records"))
            # Save raw data to s3 for apply to pick up later
            aws_functions.save_to_s3(bucket_name, current_data,
                                     data.to_json(orient="records"))
            logger.info("Successfully sent data.")

            # Ensure that only responder_ids with a response
            # type of 2 (returned) get picked up
            data = data[data[response_type] == 2]
            previous_period_data = \
                previous_period_data[previous_period_data[response_type] == 2]

            # Ensure that only rows that exist in both current and previous get picked up.
            data = data[data[reference].isin(
                previous_period_data[reference])].dropna()
            previous_period_data = previous_period_data[
                previous_period_data[reference].isin(
                    data[reference])].dropna()  # noqa e501

            # Merged together so it can be sent via the payload to the method
            merged_data = pd.concat([data, previous_period_data])

            # Make sure there is some data, non-responders were removed at this stage
            if len(merged_data.index) > 0:
                logger.info(
                    "Successfully filtered and merged the previous period data"
                )
            else:
                raise exception_classes.LambdaFailure(
                    "No data left after filtering")

            for question in questions_list:
                merged_data["movement_" + question] = 0.0

            json_ordered_data = merged_data.to_json(orient="records")

            json_payload = {
                "RuntimeVariables": {
                    "bpm_queue_url": bpm_queue_url,
                    "current_period": period,
                    "data": json.loads(json_ordered_data),
                    "environment": environment,
                    "movement_type": movement_type,
                    "period_column": period_column,
                    "previous_period": previous_period,
                    "questions_list": questions_list,
                    "run_id": run_id,
                    "survey": survey
                }
            }

            logger.info("Successfully created movement columns on the data")

            imputed_data = lambda_client.invoke(
                FunctionName=method_name, Payload=json.dumps(json_payload))

            logger.info("Successfully invoked method.")

            json_response = json.loads(
                imputed_data.get("Payload").read().decode("UTF-8"))
            logger.info("JSON extracted from method response.")

            if not json_response["success"]:
                raise exception_classes.MethodFailure(json_response["error"])

            imputation_run_type = "Calculate Movement."
            aws_functions.save_to_s3(bucket_name, out_file_name,
                                     json_response["data"])

            logger.info("Successfully sent the data to s3")

        else:

            to_be_imputed = False
            imputation_run_type = "Has Not Run."

            aws_functions.save_to_s3(bucket_name, out_file_name_skip,
                                     data.to_json(orient="records"))

            logger.info("Successfully sent the unchanged data to s3")

            aws_functions.send_sns_message(sns_topic_arn,
                                           "Imputation - Did not run")
            logger.info("Successfully sent message to sns")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - " + imputation_run_type)

        logger.info("Successfully sent the SNS message")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True, "impute": to_be_imputed}
Example #25
def lambda_handler(event, context):
    """
    Applies imputation factors on a question-by-question basis.
    :param event:  JSON payload that contains: json_data and questions_list - Type: JSON.
    :param context: N/A
    :return: Success - {"success": True/False, "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Apply Factors - Method"
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        json_data = runtime_variables["data"]
        questions_list = runtime_variables["questions_list"]
        sum_columns = runtime_variables["sum_columns"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        working_dataframe = pd.DataFrame(json_data)

        for question in questions_list:
            # Impute each question based on its imputation factor and the
            # previous period value; the previous values and factors are
            # dropped later, by the wrangler.
            working_dataframe[question] = working_dataframe.apply(
                lambda x: general_functions.sas_round(x[
                    "prev_" + question] * x["imputation_factor_" + question]),
                axis=1,
            )

            logger.info("Completed imputation of " + str(question))

        working_dataframe = working_dataframe.apply(
            lambda x: sum_data_columns(x, sum_columns), axis=1)

        final_output = {"data": working_dataframe.to_json(orient="records")}

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
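# The core of the Apply Factors method above is a row-wise multiply-and-round.
# A minimal, self-contained sketch of that transform, using the built-in
# round() as a stand-in for general_functions.sas_round and a hypothetical
# question name "q601":
import pandas as pd

df = pd.DataFrame({
    "prev_q601": [10.0, 20.0],
    "imputation_factor_q601": [1.5, 0.5],
})

for question in ["q601"]:
    df[question] = df.apply(
        lambda x: round(x["prev_" + question]
                        * x["imputation_factor_" + question]),
        axis=1,
    )

print(df["q601"].tolist())  # [15, 10]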
def lambda_handler(event, context):
    """
    Prepares the data for the Means method.
    - Reads the data from S3.
    - Invokes the Means method.
    - Saves the data returned by the Means method back to S3.

    :param event:
    :param context:
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Recalculate Means - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Successfully retrieved data")

        # Drop the movement working columns and add empty means columns
        for question in questions_list:
            data.drop(["movement_" + question + "_count"],
                      axis=1,
                      inplace=True)
            data.drop(["movement_" + question + "_sum"], axis=1, inplace=True)
            data.drop(["atyp_" + question, "iqrs_" + question],
                      axis=1,
                      inplace=True)
            data["mean_" + question] = 0.0

        data_json = data.to_json(orient="records")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data_json),
                "distinct_values": distinct_values,
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "survey": survey
            }
        }

        returned_data = lambda_client.invoke(FunctionName=method_name,
                                             Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            returned_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data from s3.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Recalculate Means.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}
def lambda_handler(event, context):
    """
    This method takes the new columns and adds them all onto the main dataset.

    :param event: { "RuntimeVariables": {
        aggregated_column - A column to aggregate by. e.g. Enterprise_Reference.
        additional_aggregated_column - A column to aggregate by. e.g. Region.
    }}
    :param context:
    :return:
    """

    current_module = "Aggregation_Combiner"
    error_message = ""
    bpm_queue_url = None
    current_step_num = 5

    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        additional_aggregated_column = runtime_variables["additional_aggregated_column"]
        aggregated_column = runtime_variables["aggregated_column"]
        aggregation_files = runtime_variables["aggregation_files"]
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module, run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)
    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - Retrieved configuration variables.")
        # Get file from s3
        imp_df = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Retrieved data from s3")

        # Receive the 3 aggregation outputs.
        ent_ref_agg = aggregation_files["ent_ref_agg"]
        cell_agg = aggregation_files["cell_agg"]
        top2_agg = aggregation_files["top2_agg"]

        # Load file content.
        ent_ref_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, ent_ref_agg)
        cell_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, cell_agg)
        top2_agg_df = aws_functions.read_dataframe_from_s3(bucket_name, top2_agg)
        logger.info("Successfully retrievied aggragation data from s3")

        to_aggregate = [aggregated_column]
        if additional_aggregated_column != "":
            to_aggregate.append(additional_aggregated_column)

        # merge the imputation output from s3 with the 3 aggregation outputs
        first_merge = pd.merge(
            imp_df, ent_ref_agg_df, on=to_aggregate, how="left")

        second_merge = pd.merge(
            first_merge, cell_agg_df, on=to_aggregate, how="left")

        third_merge = pd.merge(
            second_merge, top2_agg_df, on=to_aggregate, how="left")

        logger.info("Successfully merged dataframes")

        # convert output to json ready to return
        final_output = third_merge.to_json(orient="records")

        # send output onwards
        aws_functions.save_to_s3(bucket_name, out_file_name, final_output)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, ent_ref_agg))
            logger.info(aws_functions.delete_data(bucket_name, cell_agg))
            logger.info(aws_functions.delete_data(bucket_name, top2_agg))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn, "Aggregation - Combiner.")
        logger.info("Successfully sent data to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context,
                                                           bpm_queue_url=bpm_queue_url)

    finally:
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module.")

    # Send end of module status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status, run_id,
                                  current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """
    Main entry point into method
    :param event: json payload containing:
            bpm_queue_url: Queue url to send BPM status message.
            data: input data.
            disclosivity_marker: The name of the column to put "disclosive" marker.
            environment: The operating environment to use in the spp logger.
            explanation: The name of the column to put reason for pass/fail.
            parent_column: The name of the column holding the count of parent company.
            publishable_indicator: The name of the column to put "publish" marker.
            survey: The survey selected to be used in the logger.
            threshold: The threshold above which a row is not disclosive.
            total_columns: The names of the column holding the cell totals.
                        Included so that correct disclosure columns used.
            unique_identifier: A list of the column names that uniquely
                        identify a contributor.
    :param context: AWS Context Object.
    :return final_output: Dict containing either:
            {"success": True, "data": <stage 2 output - json >}
            {"success": False, "error": <error message - string>}
    """
    current_module = "Disclosure Stage 2 Method"
    error_message = ""
    # Set-up variables for status message
    bpm_queue_url = None
    # Define run_id outside of try block
    run_id = 0
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        disclosivity_marker = runtime_variables["disclosivity_marker"]
        environment = runtime_variables["environment"]
        explanation = runtime_variables["explanation"]
        parent_column = runtime_variables["parent_column"]
        publishable_indicator = runtime_variables["publishable_indicator"]
        survey = runtime_variables["survey"]
        threshold = int(runtime_variables["threshold"])
        total_columns = runtime_variables["total_columns"]
        unique_identifier = runtime_variables["unique_identifier"]

        input_json = json.loads(runtime_variables["data"])
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module, environment,
                                              run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved wrangler configuration variables.")
        input_dataframe = pd.DataFrame(input_json)
        stage_2_output = pd.DataFrame()
        first_loop = True
        for total_column in total_columns:
            this_disclosivity_marker = disclosivity_marker + "_" + total_column
            this_publishable_indicator = publishable_indicator + "_" + total_column
            this_explanation = explanation + "_" + total_column

            disclosure_output = disclosure(input_dataframe,
                                           this_disclosivity_marker,
                                           this_publishable_indicator,
                                           this_explanation,
                                           parent_column,
                                           threshold)
            if first_loop:
                stage_2_output = disclosure_output
                first_loop = False
            else:
                these_disclosure_columns = [this_disclosivity_marker,
                                            this_explanation,
                                            this_publishable_indicator]
                keep_columns = these_disclosure_columns + unique_identifier
                stage_2_output.drop(these_disclosure_columns, axis=1, inplace=True)
                stage_2_output = stage_2_output.merge(disclosure_output[keep_columns],
                                                      on=unique_identifier,
                                                      how="left")

            logger.info("Successfully completed Disclosure stage 2 for:"
                        + str(total_column))

        logger.info("Successfully completed Disclosure")
        final_output = {"data": stage_2_output.to_json(orient="records")}

    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id, context=context,
                                                           bpm_queue_url=bpm_queue_url)
    finally:
        if len(error_message) > 0:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
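# disclosure() is defined elsewhere in the module. Based on the parameters
# documented above (a threshold "above which a row is not disclosive" and a
# parent-company count column), a hypothetical stage-2 rule could look like
# this sketch; the real method's markers and wording may differ:
import pandas as pd

def disclosure(df, disclosivity_marker, publishable_indicator, explanation,
               parent_column, threshold):
    df = df.copy()
    passes = df[parent_column] > threshold
    df[disclosivity_marker] = passes.map({True: "No", False: "Yes"})
    df[publishable_indicator] = passes.map({True: "Publish", False: "No"})
    df[explanation] = passes.map(
        {True: "Parent count above threshold.",
         False: "Parent count at or below threshold."})
    return df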
Example #29
from unittest.mock import patch

from es_aws_functions import general_functions
from spp.engine.pipeline import Pipeline

logger = general_functions.get_logger(survey="rsi", module_name="SPP Engine - Write",
                                      environment="sandbox", run_id="1111.2222")


@patch('es_aws_functions.aws_functions.send_bpm_status')
@patch('spp.engine.pipeline.PipelineMethod')
def test_aws_big_pipeline(mock_class, mock_method):
    test_pipeline = Pipeline("Test", "000001", logger, True)

    test_pipeline.add_pipeline_methods("run_id", "method_a",
                                       "tests.test_methods.bd.big_data",
                                       [{"name": "df", "database": "test_db",
                                         "table": "test_table",
                                         "path": "dummy.json",
                                         "select": ["column_1", "column_2"],
                                         "where": [{"column": "column_1",
                                                    "condition": "=",
                                                    "value": 100}]}],
                                       {
                                           "location": "s3://dtrades-assets/workflows",
                                           "format": "parquet",
                                           "save_mode": "append",
                                           "partition_by": ["run_id"]
                                       }, True,
                                       {"param_1": "col_1", "param_2": "col_2",
                                        "param_3": "col_3"})
Example #30
def lambda_handler(event, context):
    """
    Prepares data for and calls the Calculate imputation factors method by adding on the
    required columns needed by the method.
    :param event: Contains all the variables which are required for the specific run.
    :param context: lambda context
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Calculate Factors - Wrangler."
    error_message = ""

    # Define run_id outside of try block
    run_id = 0

    # Set-up variables for status message
    bpm_queue_url = None

    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        period_column = runtime_variables["period_column"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]

    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e,
                                                           current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)

        logger.info("Successfully retrieved data")

        factor_columns = imp_func.produce_columns("imputation_factor_",
                                                  questions_list)

        # create df columns needed for method
        for factor in factor_columns:
            data[factor] = 0

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data.to_json(orient="records")),
                "environment": environment,
                "questions_list": questions_list,
                "distinct_values": distinct_values,
                "factors_parameters": factors_parameters,
                "run_id": run_id,
                "survey": survey
            }
        }

        # invoke the method to calculate the factors
        calculate_factors = lambda_client.invoke(FunctionName=method_name,
                                                 Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            calculate_factors.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        output_df = pd.read_json(json_response["data"], dtype=False)
        distinct_values.append(period_column)
        columns_to_keep = imp_func.produce_columns("imputation_factor_",
                                                   questions_list,
                                                   distinct_values)

        final_df = output_df[columns_to_keep].drop_duplicates().to_json(
            orient="records")
        aws_functions.save_to_s3(bucket_name, out_file_name, final_df)
        logger.info("Successfully sent data to s3.")

        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Calculate Factors.")
        logger.info("Successfully sent message to sns.")

    except Exception as e:
        error_message = general_functions.handle_exception(
            e,
            current_module,
            run_id,
            context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        if len(error_message) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    return {"success": True}