def pre_process(self, profile):
    """Annotate a parsed query profile with derived counters before analysis.

    Adds synthetic counters to the "Summary" node (per-phase durations parsed
    from its event sequences, zero-valued placeholders for phases that never
    occurred) and then, via the nested ``add_host`` visitor, attaches
    Hosts / Broadcast / MissingStats plus LocalTime / ChildTime virtual
    counters to every node in the tree.

    :param profile: parsed profile tree; must support ``find_by_name`` and
        ``foreach_lambda`` (project type — structure assumed from usage).
    """
    summary = profile.find_by_name("Summary")
    # Bug fix: the original dereferenced summary.val before the
    # `if summary:` guard below, raising AttributeError for profiles
    # without a Summary node.
    exec_summary_json = {}
    if summary and summary.val.info_strings.get('ExecSummary'):
        exec_summary_json = utils.parse_exec_summary(
            summary.val.info_strings.get('ExecSummary'))

    # Maps event-sequence labels to the synthetic counter names under which
    # each phase duration is recorded. Entries are popped once matched so the
    # zero-fill loops below only emit counters for unobserved phases.
    stats_mapping = {
        'Query Compilation': {
            'Metadata load finished': 'MetadataLoadTime',
            'Analysis finished': 'AnalysisTime',
            'Single node plan created': 'SinglePlanTime',
            'Runtime filters computed': 'RuntimeFilterTime',
            'Distributed plan created': 'DistributedPlanTime',
            'Lineage info computed': 'LineageTime'
        },
        'Query Timeline': {
            'Planning finished': 'PlanningTime',
            'Completed admission': 'AdmittedTime',
            'Rows available': 'QueryTime',
            'Unregister query': 'EndTime',
            '((fragment instances)|(remote fragments)|(execution backends).*) started':
                'RemoteFragmentsStarted'
        }
    }

    # Initialized unconditionally so the add_host closure below never sees
    # an undefined name, even when the profile has no Summary node.
    missing_stats = {}

    # Setup Event Sequence
    if summary:
        for s in summary.val.event_sequences:
            sequence = stats_mapping.get(s.name)
            if not sequence:
                continue
            duration = 0
            for i in range(len(s.labels)):
                event_name = s.labels[i]
                # Timestamps are cumulative; the per-phase duration is the
                # delta to the previous event.
                event_duration = s.timestamps[i] - duration
                if sequence.get(event_name):
                    summary.val.counters.append(
                        models.TCounter(name=sequence.get(event_name),
                                        value=event_duration,
                                        unit=5))
                    sequence.pop(event_name)
                else:
                    # Fall back to regex matching for variable labels (e.g.
                    # "N fragment instances started"). Iterate over a copy so
                    # the pop() cannot invalidate the live dict iterator
                    # (was iteritems() + pop, which only worked because of
                    # the immediate break).
                    for key, value in list(sequence.items()):
                        if re.search(key, event_name, re.IGNORECASE):
                            summary.val.counters.append(
                                models.TCounter(name=value,
                                                value=event_duration,
                                                unit=5))
                            sequence.pop(key)
                            break
                duration = s.timestamps[i]

        # Phases never observed were not popped above; record them with a
        # zero duration so downstream consumers always find the counter.
        # (iteritems() replaced with values(): keys were unused and
        # items()/values() work on both Python 2 and 3.)
        for value in stats_mapping.get('Query Compilation').values():
            summary.val.counters.append(
                models.TCounter(name=value, value=0, unit=5))
        for value in stats_mapping.get('Query Timeline').values():
            summary.val.counters.append(
                models.TCounter(name=value, value=0, unit=5))

        # Tables the planner flagged as lacking or having corrupt stats;
        # used by add_host to tag scan nodes with a MissingStats counter.
        for key in ['Tables Missing Stats', 'Tables With Corrupt Table Stats']:
            if summary.val.info_strings.get(key):
                for table in summary.val.info_strings.get(key).split(','):
                    missing_stats[table] = 1

    def add_host(node, exec_summary_json=exec_summary_json):
        # Visitor applied to every profile node: attach exec-summary data
        # (Hosts, Broadcast, MissingStats) and compute the LocalTime /
        # ChildTime virtual counters (unit=5 — presumably a time unit;
        # matches the other duration counters above).
        is_plan_node = node.is_plan_node()
        node_id = node.id()
        nid = int(node_id) if node_id and node.is_regular() else -1

        # Setup Hosts & Broadcast
        if node_id and node.is_regular() and nid in exec_summary_json:
            exec_summary_node = exec_summary_json.get(nid, {})
            node.val.counters.append(
                models.TCounter(name='Hosts',
                                value=exec_summary_node.get('hosts', ''),
                                unit=0))
            broadcast = 1 if exec_summary_json[nid]['broadcast'] else 0
            node.val.counters.append(
                models.TCounter(name='Broadcast', value=broadcast, unit=0))
            # For scan nodes the exec-summary "detail" starts with the table
            # name; expose it and whether its stats were flagged missing.
            if exec_summary_node.get('detail') and re.search(
                    r'\w*_SCAN_NODE', node.name(), re.IGNORECASE):
                details = exec_summary_node['detail'].split()
                node.val.info_strings['Table'] = details[0]
                node.val.counters.append(
                    models.TCounter(name='MissingStats',
                                    value=missing_stats.get(details[0], 0),
                                    unit=0))

        # Setup LocalTime & ChildTime
        if node_id:
            child_time = 0
            for c in node.children:
                if c.is_plan_node():
                    child_time += c.counter_map()['TotalTime'].value

            counter_map = node.counter_map()
            # Load the metric data as if the object would be loaded from the DB
            local_time = counter_map['TotalTime'].value - child_time

            # Make sure to subtract the wait time for the exchange node
            if is_plan_node and re.search(r'EXCHANGE_NODE',
                                          node.val.name) is not None:
                async_time = counter_map.get(
                    'AsyncTotalTime', models.TCounter(value=0)).value
                inactive_time = counter_map['InactiveTotalTime'].value
                if inactive_time == 0:
                    # Newer profiles report the wait on the Dequeue child
                    # instead of InactiveTotalTime.
                    dequeue = node.find_by_name('Dequeue')
                    inactive_time = dequeue.counter_map().get(
                        'DataWaitTime',
                        models.TCounter(value=0)).value if dequeue else 0
                local_time = (counter_map['TotalTime'].value
                              - inactive_time - async_time)
                child_time = counter_map['TotalTime'].value - local_time

            if re.search(r'KrpcDataStreamSender',
                         node.val.name) is not None and node.fragment_instance:
                local_time = counter_map.get(
                    'SerializeBatchTime', models.TCounter(value=0)).value
                child_time = counter_map['TotalTime'].value - local_time

            if re.search(r'HBASE_SCAN_NODE', node.val.name):
                local_time = counter_map['TotalTime'].value - counter_map.get(
                    'TotalRawHBaseReadTime(*)', models.TCounter(value=0)).value
                child_time = counter_map['TotalTime'].value - local_time

            if re.search(r'KUDU_SCAN_NODE', node.val.name):
                # Scan nodes fold the raw client/read time back into
                # TotalTime so Local + Child always sums to TotalTime.
                child_time = counter_map.get(
                    'KuduClientTime', models.TCounter(value=0)).value
                local_time = counter_map['TotalTime'].value
                counter_map['TotalTime'].value = child_time + local_time

            if re.search(r'HDFS_SCAN_NODE', node.val.name):
                child_time = counter_map.get(
                    'TotalRawHdfsReadTime(*)', models.TCounter(value=0)).value
                local_time = counter_map['TotalTime'].value
                counter_map['TotalTime'].value = local_time + child_time

            # For Hash Join, prefer the explicit "LocalTime" metric when
            # present, otherwise derive it from probe + build times.
            if is_plan_node and re.search(r'HASH_JOIN_NODE',
                                          node.val.name) is not None:
                if "LocalTime" in counter_map:
                    local_time = counter_map["LocalTime"].value
                else:
                    local_time = counter_map["ProbeTime"].value + \
                        counter_map["BuildTime"].value

            # Add two virtual metrics for local_time and child_time
            node.val.counters.append(
                models.TCounter(name='LocalTime', value=local_time, unit=5))
            node.val.counters.append(
                models.TCounter(name='ChildTime', value=child_time, unit=5))

    profile.foreach_lambda(add_host)
def pre_process(self, profile):
    """Annotate a parsed query profile with derived counters before analysis.

    Adds synthetic per-phase duration counters to the "Summary" node and,
    via the nested ``add_host`` visitor, attaches Hosts / Broadcast and
    LocalTime / ChildTime virtual counters to every node in the tree.

    :param profile: parsed profile tree; must support ``find_by_name`` and
        ``foreach_lambda`` (project type — structure assumed from usage).
    """
    summary = profile.find_by_name("Summary")
    # Bug fix: the original used summary.val.info_strings['ExecSummary'],
    # which raises AttributeError when no Summary node exists and KeyError
    # when the 'ExecSummary' info string is absent. Fall back to an empty
    # mapping instead (consistent with the sibling variant of this method).
    exec_summary_json = {}
    if summary and summary.val.info_strings.get('ExecSummary'):
        exec_summary_json = utils.parse_exec_summary(
            summary.val.info_strings.get('ExecSummary'))

    # Maps event-sequence labels to the synthetic counter names under which
    # each phase duration is recorded. Entries are popped once matched so the
    # zero-fill loops below only emit counters for unobserved phases.
    stats_mapping = {
        'Query Compilation': {
            'Metadata load finished': 'MetadataLoadTime',
            'Analysis finished': 'AnalysisTime',
            'Single node plan created': 'SinglePlanTime',
            'Runtime filters computed': 'RuntimeFilterTime',
            'Distributed plan created': 'DistributedPlanTime',
            'Lineage info computed': 'LineageTime'
        },
        'Query Timeline': {
            'Planning finished': 'PlanningTime',
            'Completed admission': 'AdmittedTime',
            'Rows available': 'QueryTime',
            'Unregister query': 'EndTime',
            '((fragment instances)|(remote fragments)|(execution backends).*) started':
                'RemoteFragmentsStarted'
        }
    }

    # Setup Event Sequence
    if summary:
        for s in summary.val.event_sequences:
            sequence = stats_mapping.get(s.name)
            if not sequence:
                continue
            duration = 0
            for i in range(len(s.labels)):
                event_name = s.labels[i]
                # Timestamps are cumulative; the per-phase duration is the
                # delta to the previous event. (Removed the unused local
                # `event_value` the original also assigned here.)
                event_duration = s.timestamps[i] - duration
                if sequence.get(event_name):
                    summary.val.counters.append(
                        models.TCounter(name=sequence.get(event_name),
                                        value=event_duration,
                                        unit=5))
                    sequence.pop(event_name)
                else:
                    # Fall back to regex matching for variable labels (e.g.
                    # "N fragment instances started"). Iterate over a copy so
                    # the pop() cannot invalidate the live dict iterator
                    # (was iteritems() + pop, which only worked because of
                    # the immediate break).
                    for key, value in list(sequence.items()):
                        if re.search(key, event_name, re.IGNORECASE):
                            summary.val.counters.append(
                                models.TCounter(name=value,
                                                value=event_duration,
                                                unit=5))
                            sequence.pop(key)
                            break
                duration = s.timestamps[i]

        # Phases never observed were not popped above; record them with a
        # zero duration so downstream consumers always find the counter.
        # (iteritems() replaced with values(): keys were unused and
        # items()/values() work on both Python 2 and 3.)
        for value in stats_mapping.get('Query Compilation').values():
            summary.val.counters.append(
                models.TCounter(name=value, value=0, unit=5))
        for value in stats_mapping.get('Query Timeline').values():
            summary.val.counters.append(
                models.TCounter(name=value, value=0, unit=5))

    def add_host(node, exec_summary_json=exec_summary_json):
        # Visitor applied to every profile node: attach exec-summary data
        # (Hosts, Broadcast) and compute the LocalTime / ChildTime virtual
        # counters (unit=5 — presumably a time unit; matches the duration
        # counters above).
        is_plan_node = node.is_plan_node()
        node_id = node.id()

        # Setup Hosts & Broadcast
        # (int(node_id) hoisted into `nid`: the original converted it four
        # times.)
        if node_id and node.is_regular() and int(node_id) in exec_summary_json:
            nid = int(node_id)
            node.val.counters.append(
                models.TCounter(name='Hosts',
                                value=exec_summary_json[nid]["hosts"],
                                unit=0))
            broadcast = 1 if exec_summary_json[nid]["broadcast"] else 0
            node.val.counters.append(
                models.TCounter(name='Broadcast', value=broadcast, unit=0))

        # Setup LocalTime & ChildTime
        if node_id:
            child_time = 0
            for c in node.children:
                if c.is_plan_node():
                    child_time += c.counter_map()['TotalTime'].value

            counter_map = node.counter_map()
            # Load the metric data as if the object would be loaded from the DB
            local_time = counter_map['TotalTime'].value - child_time

            # Make sure to subtract the wait time for the exchange node
            if is_plan_node and re.search(r'EXCHANGE_NODE',
                                          node.val.name) is not None:
                async_time = counter_map.get(
                    'AsyncTotalTime', models.TCounter(value=0)).value
                local_time = (counter_map['TotalTime'].value
                              - counter_map['InactiveTotalTime'].value
                              - async_time)

            # For Hash Join, prefer the explicit "LocalTime" metric when
            # present, otherwise derive it from probe + build times.
            if is_plan_node and re.search(r'HASH_JOIN_NODE',
                                          node.val.name) is not None:
                if "LocalTime" in counter_map:
                    local_time = counter_map["LocalTime"].value
                else:
                    local_time = counter_map["ProbeTime"].value + \
                        counter_map["BuildTime"].value

            # Add two virtual metrics for local_time and child_time
            node.val.counters.append(
                models.TCounter(name='LocalTime', value=local_time, unit=5))
            node.val.counters.append(
                models.TCounter(name='ChildTime', value=child_time, unit=5))

    profile.foreach_lambda(add_host)