def __dag_details(self, read_stream):
        """
        :param read_stream: input stream with events from the DAG Kafka topic
        :return: list of aggregated metrics
        """
        details = ".details"

        number_of_unique_tasks_in_the_dags = read_stream \
            .filter("dag is not NULL") \
            .filter("task is not NULL") \
            .aggregate(DistinctCount(group_fields=["dag"],
                                     aggregation_field="task",
                                     aggregation_name=self._component_name + details))

        dag_host_task_count = read_stream \
            .filter("dag is not NULL") \
            .filter("hostname is not NULL") \
            .filter("task is not NULL") \
            .aggregate(Count(group_fields=["dag", "hostname", "task"],
                             aggregation_name=self._component_name + details))

        bbc_dag_subtask_message_itv_generated_with_task_count = read_stream \
            .filter("dag is not NULL") \
            .filter("task is not NULL") \
            .where("dag like '%bbc%' and subtask_message like '%ITV generated%'") \
            .aggregate(Count(group_fields=["dag", "task"],
                             aggregation_name=self._component_name + ".highres.itv_gen"))

        return [
            number_of_unique_tasks_in_the_dags, dag_host_task_count,
            bbc_dag_subtask_message_itv_generated_with_task_count
        ]
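
# DistinctCount is a project-level wrapper, so its exact expansion is not shown
# here; below is a minimal sketch of what it plausibly reduces to in raw
# PySpark (function choice and metric naming are assumptions).
from pyspark.sql import functions as F

def distinct_count_sketch(df, group_fields, aggregation_field, aggregation_name):
    # Hypothetical equivalent of DistinctCount: one approximate distinct
    # count per group; the real wrapper also handles windowing and naming.
    return df.groupBy(*group_fields) \
        .agg(F.approx_count_distinct(aggregation_field).alias(aggregation_name))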
    def _process_pipeline(self, json_stream):

        stream = json_stream \
            .filter(col("VoiceReport.voiceReport.sessionId").isNotNull()) \
            .select(
                col("@timestamp"),
                col("header.viewerID").alias("viewerID"),
                col("VoiceReport.voiceReport.sessionId").alias("sessionId"),
                col("VoiceReport.voiceReport.sessionCreationTime").alias("sessionCreationTime"),
                col("VoiceReport.voiceReport.audioPacketLoss").alias("audioPacketLoss"),
                col("VoiceReport.voiceReport.audioTransferTime").alias("audioTransferTime"),
                col("VoiceReport.voiceReport.transactionResult").alias("transactionResult")
            )

        aggregation_fields = [
            "sessionCreationTime", "audioPacketLoss", "audioTransferTime"
        ]
        aggregations = []

        for field in aggregation_fields:
            kwargs = {'aggregation_field': field}

            aggregations.extend([
                Count(**kwargs),
                Max(**kwargs),
                Min(**kwargs),
                P01(**kwargs),
                P05(**kwargs),
                P10(**kwargs),
                P25(**kwargs),
                P50(**kwargs),
                P75(**kwargs),
                P90(**kwargs),
                P95(**kwargs),
                P99(**kwargs)
            ])

        return [
            stream.aggregate(
                CompoundAggregation(aggregations=aggregations,
                                    group_fields=self.__dimensions,
                                    aggregation_name=self._component_name)),
            stream.aggregate(
                Count(group_fields=[
                    "viewerID", "sessionId", "transactionResult"
                ],
                      aggregation_name=self._component_name))
        ]
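
# The P01…P99 classes read like thin wrappers over Spark's approximate
# percentile; a hedged sketch of one of them, assuming that mapping.
from pyspark.sql import functions as F

def p50_sketch(df, group_fields, aggregation_field):
    # percentile_approx(field, 0.5) is the assumed backing function for P50.
    return df.groupBy(*group_fields).agg(
        F.expr("percentile_approx({}, 0.5)".format(aggregation_field))
        .alias(aggregation_field + "_p50"))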
    def __tva_backup_deleted(self, read_stream):
        return read_stream.where(
            "message like '%TvaManagementFullOnlineIngest%' and "
            "message like '%TvaBackupHelper%' and "
            "message like '%Deleted%'") \
            .aggregate(Count(aggregation_field="message",
                             aggregation_name=self._component_name + ".tva_backup_deleted"))
    def __count_requests_by_content_source_id_and_methods_and_status(
            self, read_stream):
        # between(200, 299) yields a boolean column; cast("string") turns it
        # into "true"/"false" so it can be used as a grouping dimension.
        return read_stream \
            .where("method is not null") \
            .withColumn("response_successful", col("response_code").between(200, 299).cast("string")) \
            .aggregate(Count(group_fields=["content_source_id", "method", "response_successful"],
                             aggregation_name=self._component_name))
    def __trace_metrics(self, events):
        return events \
            .withColumn("counter",
                        custom_translate_like(
                            source_field=col("message"),
                            mappings_pair=[
                                (["HTTP request received", "Referer: cdvr-bs", "<Result>success</Result>"],
                                 "vrm_success_recorded"),
                                (["HTTP request received", "Referer: cdvr-bs", "<Result>failed</Result>"],
                                 "vrm_failed_recorded"),
                                (["HTTP request", ":8080/RE/", "learnAction"], "reng_success_action"),
                                (["HTTP request received", "IsAuthorized.traxis"], "irdeto_success_request"),
                                (["HTTP request received", "User-Agent", "vod-service"], "vod_service_success"),
                                (["HTTP request received", "x-application-name: purchase-service"],
                                 "purchase_service_success"),
                                (["HTTP request received", "x-application-name: discovery-service"],
                                 "discovery_service_success"),
                                (["HTTP request received", "x-application-name: epg-packager"], "epg_success"),
                                (["HTTP request received", "x-application-name: recording-service"],
                                 "recording_service_success"),
                                (["HTTP request received", "x-application-name: session-service"],
                                 "session_service_success")
                            ],
                            default_value="unclassified")) \
            .where("counter != 'unclassified'") \
            .aggregate(Count(group_fields=["hostname", "counter"],
                             aggregation_name=self._component_name))
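
# custom_translate_like is also project code; a plausible minimal
# implementation, assuming each (substrings, label) pair means "all
# substrings must occur in the message" and that earlier pairs win.
from functools import reduce
from pyspark.sql import functions as F

def custom_translate_like_sketch(source_field, mappings_pair, default_value):
    # Fold the pairs, from last to first, into one nested when() expression
    # so that the first listed mapping is checked first.
    result = F.lit(default_value)
    for substrings, label in reversed(mappings_pair):
        condition = reduce(lambda acc, s: acc & source_field.contains(s),
                           substrings, F.lit(True))
        result = F.when(condition, label).otherwise(result)
    return result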
Example #6
    def __count_content_on_demand_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code") == 3200) |
                   (col("code") == 3300) |
                   (col("code") == 3400)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".content_on_demand_errors"))
    def _process_pipeline(self, read_stream):
        """This define the aggregation fields and re-use statistical functions from aggregation.py"""
        stream = read_stream \
            .withColumn("VMStat_idlePct", col("VMStat_idlePct").cast(IntegerType())) \
            .withColumn("VMStat_systemPct", col("VMStat_systemPct").cast(IntegerType())) \
            .withColumn("VMStat_iowaitPct", col("VMStat_iowaitPct").cast(IntegerType())) \
            .withColumn("VMStat_hwIrqPct", col("VMStat_hwIrqPct").cast(IntegerType())) \
            .withColumn("MemoryUsage_freeKb", col("MemoryUsage_freeKb").cast(IntegerType())) \
            .withColumn("MemoryUsage_cachedKb", col("MemoryUsage_cachedKb").cast(IntegerType())) \
            .withColumn("MemoryUsage_usedKb", col("MemoryUsage_usedKb").cast(IntegerType())) \
            .withColumn("VMStat_nicePct", col("VMStat_nicePct").cast(IntegerType())) \
            .withColumn("VMStat_userPct", col("VMStat_userPct").cast(IntegerType())) \
            .withColumn("VMStat_swIrqPct", col("VMStat_swIrqPct").cast(IntegerType())) \
            .withColumn("VMStat_loadAverage", col("VMStat_loadAverage").cast(IntegerType()))

        aggregation_fields = ["VMStat_idlePct", "VMStat_systemPct", "VMStat_iowaitPct", "VMStat_hwIrqPct",
                              "MemoryUsage_usedKb", "MemoryUsage_freeKb", "MemoryUsage_cachedKb",
                              "VMStat_nicePct","VMStat_userPct", "VMStat_swIrqPct", "VMStat_loadAverage"]

        aggregation_fields_with_sum = ["MemoryUsage_usedKb", "MemoryUsage_freeKb", "MemoryUsage_cachedKb"]

        aggregations = []
        for field in aggregation_fields:
            kwargs = {'aggregation_field': field}

            aggregations.extend([Count(**kwargs), Max(**kwargs), Min(**kwargs), Stddev(**kwargs),
                                 P01(**kwargs), P05(**kwargs), P10(**kwargs), P25(**kwargs), P50(**kwargs),
                                 P75(**kwargs), P90(**kwargs), P95(**kwargs), P99(**kwargs)])

            if kwargs["aggregation_field"] in aggregation_fields_with_sum:
                aggregations.append(Sum(**kwargs))

        return [stream.aggregate(CompoundAggregation(aggregations=aggregations, group_fields=self.__dimensions,
                                                     aggregation_name=self._component_name))]
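
# The eleven withColumn casts above are mechanical; the same preparation can
# be written as a loop over the field list, which keeps the cast list and
# aggregation_fields from drifting apart. A stylistic sketch, behavior unchanged.
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

def cast_to_int(df, fields):
    # Apply the same IntegerType cast to every listed column.
    for field in fields:
        df = df.withColumn(field, col(field).cast(IntegerType()))
    return df

# e.g. stream = cast_to_int(read_stream, aggregation_fields)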
    def _process_pipeline(self, json_stream):
        schema = StructType([
            StructField("TUNER", StringType()),
            StructField("BOARD", StringType()),
            StructField("WIFI", StringType()),
            StructField("CPU", StringType()),
            StructField("HDD", StringType())
        ])

        stream = json_stream \
            .withColumn("jsonHW", from_json(col("TemperatureReport_value"), schema).alias("jsonHW")) \
            .withColumn("TUNER", when(col("jsonHW.TUNER") == "-274", None).otherwise(col("jsonHW.TUNER"))) \
            .withColumn("BOARD", when(col("jsonHW.BOARD") == "-274", None).otherwise(col("jsonHW.BOARD"))) \
            .withColumn("WIFI", when(col("jsonHW.WIFI") == "-274", None).otherwise(col("jsonHW.WIFI"))) \
            .withColumn("CPU", when(col("jsonHW.CPU") == "-274", None).otherwise(col("jsonHW.CPU"))) \
            .withColumn("HDD", when(col("jsonHW.HDD") == "-274", None).otherwise(col("jsonHW.HDD"))) \
            .drop("jsonHW") \
            .drop("TemperatureReport_value")

        aggregation_fields = ["TUNER", "BOARD", "WIFI", "CPU", "HDD"]
        aggregations = []

        for field in aggregation_fields:
            kwargs = {'aggregation_field': field}

            aggregations.extend([Count(**kwargs), Max(**kwargs), Min(**kwargs), Stddev(**kwargs),
                                 P01(**kwargs), P05(**kwargs), P10(**kwargs), P25(**kwargs), P50(**kwargs),
                                 P75(**kwargs), P90(**kwargs), P95(**kwargs), P99(**kwargs)])

        return [stream.aggregate(CompoundAggregation(aggregations=aggregations, group_fields=self.__dimensions,
                                                     aggregation_name=self._component_name))]
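
# -274 sits below absolute zero (-273.15 °C), so it can only be a
# "sensor not present" sentinel, which is why it is nulled out before
# aggregation. A standalone demo of the parse-then-null pattern follows,
# with an invented payload.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, when
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.master("local[1]").appName("temperature-demo").getOrCreate()
schema = StructType([StructField("CPU", StringType()), StructField("HDD", StringType())])
df = spark.createDataFrame([('{"CPU": "61", "HDD": "-274"}',)], ["TemperatureReport_value"])

parsed = df \
    .withColumn("jsonHW", from_json(col("TemperatureReport_value"), schema)) \
    .withColumn("CPU", col("jsonHW.CPU")) \
    .withColumn("HDD", when(col("jsonHW.HDD") == "-274", None).otherwise(col("jsonHW.HDD")))
parsed.select("CPU", "HDD").show()  # CPU=61, HDD=null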
Example #9
    def _process_pipeline(self, read_stream):

        return read_stream\
            .aggregate(Count(
                group_fields=["hardwareVersion", "firmwareVersion", "asVersion", "appVersion", "ErrorReport_level"],
                aggregation_name=self._component_name)
            )
Example #10
    def __count_network_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code").between(9003, 9006)) |
                   (col("code").between(9993, 9996)) |
                   (col("code") == 9031)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".network_errors"))
    def _process_pipeline(self, read_stream):
        # filter out zero-duration requests and the /info and /prometheus health-check endpoints
        filtered_stream = read_stream.where(
            (col("duration_ms").cast("long") != 0) &
            ~ (col("requested_url").startswith("GET /info") | col("requested_url").startswith("GET /prometheus"))
        )

        mapped_stream = filtered_stream \
            .withColumn("country",
                        when(col("stack").isNotNull(),
                             regexp_extract("stack", r".*-(\w+)$", 1))
                        .otherwise("undefined"))

        average_duration = mapped_stream.aggregate(
            Avg(group_fields=["country", "host", "app", "app_version", "api_method"],
                aggregation_field="duration_ms",
                aggregation_name=self._component_name))

        count_by_status = mapped_stream.aggregate(
            Count(group_fields=["country", "host", "app", "app_version", "api_method", "status"],
                  aggregation_name=self._component_name))

        request_stream = read_stream \
            .where(col("header_x-dev").isNotNull()) \
            .withColumn("country",
                        when(col("stack").isNotNull(),
                             regexp_extract("stack", r".*-(\w+)$", 1))
                        .otherwise("undefined"))

        count_by_app = request_stream.aggregate(
            Count(group_fields=["country", "app"],
                  aggregation_name=self._component_name + ".requests"))

        count_by_app_with_status = request_stream \
            .where(col("status").isNotNull()) \
            .withColumn("status", custom_translate_regex(
                source_field=col("status"),
                mapping={r"^2\d\d": "successful"},
                default_value="failure")) \
            .aggregate(Count(group_fields=["country", "app", "status"],
                             aggregation_name=self._component_name + ".requests"))

        count_stb_requests = request_stream \
            .aggregate(Count(group_fields=["country", "header_x-dev"],
                             aggregation_name=self._component_name + ".requests"))

        return [average_duration, count_by_status, count_stb_requests, count_by_app, count_by_app_with_status]
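
# Two details above are worth unpacking. The country expression keeps
# whatever follows the last hyphen of the stack name (an invented value
# "obo-prod-nl" would map to "nl"), and custom_translate_regex plausibly
# reduces to a chained when/rlike; both are sketched under those assumptions.
import re
from pyspark.sql import functions as F

assert re.match(r".*-(\w+)$", "obo-prod-nl").group(1) == "nl"

def custom_translate_regex_sketch(source_field, mapping, default_value):
    # Hypothetical reduction of custom_translate_regex: the first matching
    # pattern wins, everything else falls back to the default value.
    result = F.lit(default_value)
    for pattern, label in reversed(list(mapping.items())):
        result = F.when(source_field.rlike(pattern), label).otherwise(result)
    return result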
Example #12
    def __ring_status_node_errors(self, error_events):
        return error_events \
            .where("message like '%Eventis.Cassandra.Service."
                   "CassandraServiceException+HostRingException%'") \
            .withColumn("host", regexp_extract("message", r".*Eventis\.Cassandra\.Service\.CassandraServiceException\+"
                                                          r"HostRingException.*'(\S+)'.*", 1)) \
            .aggregate(Count(group_fields=["hostname", "host"],
                             aggregation_name=self._component_name + ".ring_status_node_errors"))
Example #13
    def __cassandra_errors(self, error_events):
        return error_events \
            .where("message like '%Exception with cassandra node%'") \
            .withColumn("host", regexp_extract("message",
                                               r".*Exception\s+with\s+cassandra\s+node\s+\'([\d\.]+).*", 1)
                        ) \
            .aggregate(Count(group_fields=["hostname", "host"],
                             aggregation_name=self._component_name + ".cassandra_errors"))
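
# A quick sanity check of the node-extraction regex against an invented
# log line shaped like the messages it targets.
import re

line = "ERROR Exception with cassandra node '10.12.0.5' (connection refused)"
match = re.match(r".*Exception\s+with\s+cassandra\s+node\s+\'([\d\.]+).*", line)
assert match.group(1) == "10.12.0.5"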
    def _process_pipeline(self, read_stream):

        return read_stream.aggregate(
            Count(group_fields=[
                "hardwareVersion", "firmwareVersion", "asVersion",
                "appVersion", "UsageCollectorReport_event_Type"
            ],
                  aggregation_name=self._component_name))
    def __undefined_warnings(self, warn_events):
        return warn_events.where(
            "message not like '%Unable to use alias%because alias is already used by%' and "
            "message not like '%One or more validation errors detected during tva ingest%'"
        ).aggregate(
            Count(group_fields=["hostname"],
                  aggregation_name=self._component_name + ".undefined_warnings"))
    def __ring_status_node_warnings(self, events):
        return events \
            .where("message like '%Unable to determine external address "
                   "of node with internal address %'") \
            .withColumn("host", regexp_extract("message", r".*Unable\s+to\s+determine\s+external\s+address\s+of\s+"
                                                          r"node\s+with\s+internal\s+address\s+'(\S+)'.*", 1)) \
            .aggregate(Count(group_fields=["hostname", "host"],
                             aggregation_name=self._component_name + ".ring_status_node_warnings"))
    def __process_hi_res_events(self, read_stream):
        """
        Aggregation for events with information about loading high-resolution images
        :param read_stream: input stream with events from the DAG Kafka topic
        :return: list of aggregated metrics
        """
        perform_high_res_images_events = read_stream \
            .where("task == 'perform_high_resolution_images_qc'")

        def __process_images_processed_status(column_name, regex_group_number,
                                              component_suffix):
            """
            Calculate an aggregated metric for a specific column
            :param column_name: name of the new column holding the extracted image count
            :param regex_group_number: index of the capture group in the regex pattern
            :param component_suffix: suffix appended to the metric name
            :return: aggregated metric for the specific column
            """
            return perform_high_res_images_events \
                .where("subtask_message like 'Images processed:%'") \
                .withColumn(column_name,
                            regexp_extract("subtask_message",
                                           r"^Images processed: qc_success: (\d+), qc_retry: (\d+), qc_error: (\d+).*",
                                           regex_group_number)) \
                .aggregate(Sum(group_fields=["dag", "task"], aggregation_field=column_name,
                               aggregation_name=self._component_name + "." + component_suffix))

        perform_high_res_images_processed_success_sum = \
            __process_images_processed_status("images_success", 1, "hi_res_images_processed_success")

        perform_high_res_images_processed_retry_sum = \
            __process_images_processed_status("images_retry", 2, "hi_res_images_processed_retry")

        perform_high_res_images_processed_error_sum = \
            __process_images_processed_status("images_error", 3, "hi_res_images_processed_error")

        __mapping_image_type = [
            (["image_type='HighResPortrait'",
              "status='qc_success'"], "hi_res_images_portrait"),
            (["image_type='HighResLandscape'",
              "status='qc_success'"], "hi_res_images_landscape")
        ]

        perform_high_res_images_type_count = perform_high_res_images_events \
            .withColumn("image_type", custom_translate_like(source_field=col("subtask_message"),
                                                            mappings_pair=__mapping_image_type,
                                                            default_value="unclassified")) \
            .where("image_type != 'unclassified'") \
            .aggregate(Count(group_fields=["dag", "task", "image_type"],
                             aggregation_name=self._component_name))

        return [
            perform_high_res_images_processed_success_sum,
            perform_high_res_images_processed_retry_sum,
            perform_high_res_images_processed_error_sum,
            perform_high_res_images_type_count
        ]
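
# The regex group numbers 1, 2 and 3 map onto qc_success, qc_retry and
# qc_error in that order, which the three calls above rely on; checked here
# against an invented subtask_message.
import re

msg = "Images processed: qc_success: 10, qc_retry: 2, qc_error: 1 (batch 7)"
groups = re.match(r"^Images processed: qc_success: (\d+), qc_retry: (\d+), qc_error: (\d+).*",
                  msg).groups()
assert groups == ("10", "2", "1")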
Example #18
    def _process_pipeline(self, read_stream):
        stb_ids = read_stream.withColumn("stb_id", col("header.viewerID"))

        requests_count = stb_ids.aggregate(
            Count(aggregation_name=self._component_name + ".request"))
        stb_ids_distinct_count = stb_ids.aggregate(
            DistinctCount(aggregation_field="stb_id",
                          aggregation_name=self._component_name))
        return [requests_count, stb_ids_distinct_count]
Example #19
    def __count_player_live_tv_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code").between(2000, 2002)) |
                   (col("code") == 2004) |
                   (col("code") == 2006) |
                   (col("code") == 2010) |
                   (col("code") == 2020) |
                   (col("code") == 2050)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".player_live_tv_errors"))
    def _agg_uservice2component_count(self, stream):
        """
        Aggregate uservice - he component call counts
        :param stream:
        :return:
        """
        aggregation = Count(group_fields=["tenant", "app", "dest", "calls"], aggregation_field="requests",
                            aggregation_name=self._component_name)

        return stream.aggregate(aggregation)
    def _agg_count(self, stream, type):
        """
        Aggregate uservice - he component call counts
        :param stream:
        :return:
        """
        aggregation = Count(group_fields=["app", type],
                            aggregation_field="status",
                            aggregation_name=self._component_name)

        return stream.aggregate(aggregation)
    def __memory_flushing(self, events):
        return events \
            .where("message like '%Flushing%'") \
            .withColumn("column_family", custom_translate_like(
                source_field=col("message"),
                mappings_pair=[(["Channels"], "channels"),
                               (["Titles"], "titles"),
                               (["Groups"], "groups")],
                default_value="unclassified")) \
            .where("column_family != 'unclassified'") \
            .aggregate(Count(group_fields=["column_family"],
                             aggregation_name=self._component_name + ".memory_flushing"))
Example #23
    def _process_pipeline(self, json_stream):
        stream = json_stream.withColumn("UsageCollectorReport_missed_events",
                                        col("UsageCollectorReport_missed_events").cast(IntegerType()))

        kwargs = {"aggregation_field": "UsageCollectorReport_missed_events"}

        aggregations = [Sum(**kwargs), Count(**kwargs), Max(**kwargs), Min(**kwargs), Stddev(**kwargs),
                        P01(**kwargs), P05(**kwargs), P10(**kwargs), P25(**kwargs), P50(**kwargs),
                        P75(**kwargs), P90(**kwargs), P95(**kwargs), P99(**kwargs)]

        return [stream.aggregate(CompoundAggregation(aggregations=aggregations, group_fields=self.__dimensions,
                                                     aggregation_name=self._component_name))]
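
# CompoundAggregation presumably fans all of the listed statistics out over
# a single groupBy, so the grouped data is only scanned once; a hedged sketch
# of that reduction for the single-field case above (the wrapper-to-function
# mapping is an assumption).
from pyspark.sql import functions as F

def compound_aggregation_sketch(df, group_fields, field):
    # Emit every statistic in one aggregation pass over the grouped data.
    return df.groupBy(*group_fields).agg(
        F.sum(field).alias(field + "_sum"),
        F.count(field).alias(field + "_count"),
        F.max(field).alias(field + "_max"),
        F.min(field).alias(field + "_min"),
        F.stddev(field).alias(field + "_stddev"),
        F.expr("percentile_approx({}, array(0.01, 0.25, 0.5, 0.75, 0.99))".format(field))
        .alias(field + "_percentiles"))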
Example #24
    def __count_replay_playback_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code") == 2505) |
                   (col("code") == 2507) |
                   (col("code") == 2510) |
                   (col("code") == 2511) |
                   (col("code") == 2512) |
                   (col("code") == 2514) |
                   (col("code") == 2517) |
                   (col("code") == 2518)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".replay_playback_errors"))
Example #25
    def __count_player_recording_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code") == 2200) |
                   (col("code") == 2205) |
                   (col("code") == 2207) |
                   (col("code") == 2211) |
                   (col("code") == 2212) |
                   (col("code") == 2214) |
                   (col("code") == 2217) |
                   (col("code") == 2218)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".player_recording_errors"))
    def __info_metrics(self, events):
        return events \
            .withColumn("counter",
                        custom_translate_like(
                            source_field=col("message"),
                            mappings_pair=[
                                (["Loading tva version", "took"], "metadata_success")
                            ],
                            default_value="unclassified")) \
            .where("counter != 'unclassified'") \
            .aggregate(Count(group_fields=["hostname", "counter"],
                             aggregation_name=self._component_name))
    def __process_hi_res_on_mpx_events(self, read_stream):
        """
        Aggregation for events with information about high-resolution images being uploaded to MPX
        :param read_stream: input stream with events from the DAG Kafka topic
        :return: list of aggregated metrics
        """
        upload_high_res_images_created_on_mpx_count = read_stream \
            .where("task == 'upload_high_resolution_images_to_mpx'") \
            .where("subtask_message like '%Image was created on MPX:%'") \
            .aggregate(Count(group_fields=["dag", "task"],
                             aggregation_name=self._component_name + ".hi_res_images_created_on_mpx"))

        return [upload_high_res_images_created_on_mpx_count]
Example #28
    def __count_player_review_buffer_errors(self, error_report_stream):
        return error_report_stream \
            .where((col("code") == 2100) |
                   (col("code") == 2105) |
                   (col("code") == 2107) |
                   (col("code") == 2111) |
                   (col("code") == 2112) |
                   (col("code") == 2114) |
                   (col("code") == 2117) |
                   (col("code") == 2118) |
                   (col("code") == 2120) |
                   (col("code") == 2130)) \
            .aggregate(Count(group_fields=["code"],
                             aggregation_name=self._component_name + ".player_review_buffer_errors"))
    def __warn_metrics(self, events):
        return events \
            .withColumn("counter",
                        custom_translate_like(
                            source_field=col("message"),
                            mappings_pair=[
                                (["Error", ":8080/RE"], "reng_error_action"),
                                (["Genre", "is not known"], "metadata_warning"),
                                (["Invalid parameter"], "invalid_parameter_warning")
                            ],
                            default_value="unclassified")) \
            .where("counter != 'unclassified'") \
            .aggregate(Count(group_fields=["hostname", "counter"],
                             aggregation_name=self._component_name))
    def _process_pipeline(self, read_stream):

        pre_result_df = self._prepare_input_data_frame(read_stream)

        aggregation_fields_without_sum = TunerPerfReport.get_column_names(
            "TunerReport_SNR")
        aggregation_fields_without_sum.extend(
            TunerPerfReport.get_column_names("TunerReport_signalLevel"))

        aggregation_fields_with_sum = TunerPerfReport.get_column_names(
            "TunerReport_erroreds")
        aggregation_fields_with_sum.extend(
            TunerPerfReport.get_column_names("TunerReport_unerroreds"))
        aggregation_fields_with_sum.extend(
            TunerPerfReport.get_column_names("TunerReport_correcteds"))

        aggregations_ls = []
        aggregations_ls.extend(aggregation_fields_without_sum)
        aggregations_ls.extend(aggregation_fields_with_sum)

        aggregations = []

        for field in aggregations_ls:
            kwargs = {'aggregation_field': field}

            aggregations.extend([
                Count(**kwargs),
                Max(**kwargs),
                Min(**kwargs),
                P01(**kwargs),
                P05(**kwargs),
                P10(**kwargs),
                P25(**kwargs),
                P50(**kwargs),
                P75(**kwargs),
                P90(**kwargs),
                P95(**kwargs),
                P99(**kwargs)
            ])

            if kwargs["aggregation_field"] in aggregation_fields_with_sum:
                aggregations.append(Sum(**kwargs))

        return [
            pre_result_df.aggregate(
                CompoundAggregation(aggregations=aggregations,
                                    group_fields=self.__dimensions,
                                    aggregation_name=self._component_name))
        ]