def __init__(self, workload_scale, file_format, queries, ref_queries):
  """Build comparison statistics for one workload/file-format combination.

  Aggregates the run times of every query that appears in BOTH `queries` and
  `ref_queries`, then stores the average, geomean, and their deltas versus the
  reference results.
  """
  time_list = []
  ref_time_list = []
  for query_name, results in queries.items():
    # Only queries present in both the results and the reference results
    # contribute to the average / geomean calculation.
    if query_name not in ref_queries:
      continue
    time_list.extend(
        query_results[TIME_TAKEN] for query_results in results[RESULT_LIST])
    ref_time_list.extend(
        ref_query_results[TIME_TAKEN]
        for ref_query_results in ref_queries[query_name][RESULT_LIST])
  self.workload_name = '{0}({1})'.format(
      workload_scale[0][1].upper(), workload_scale[1][1])
  self.file_format = '{0} / {1} / {2}'.format(
      file_format[0][1], file_format[1][1], file_format[2][1])
  self.avg = calculate_avg(time_list)
  ref_avg = calculate_avg(ref_time_list)
  self.delta_avg = calculate_change(self.avg, ref_avg)
  self.geomean = calculate_geomean(time_list)
  ref_geomean = calculate_geomean(ref_time_list)
  self.delta_geomean = calculate_change(self.geomean, ref_geomean)
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary.

  We are calculating the average runtime and standard deviation for each query
  type. Each query's entry gains AVG, STDDEV, NUM_CLIENTS and ITERATIONS keys.
  """
  def remove_first_run(result_list):
    """Removes the earliest result because performance is much worse on the
    first run.

    Only removes when there is more than one result. Returns True if a result
    was removed, False otherwise.
    """
    if len(result_list) > 1:
      result_list.remove(min(result_list, key=lambda result: result['start_time']))
      return True
    return False

  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        removed = remove_first_run(result_list)
        # Build the list of runtimes once instead of twice (avg + stddev).
        times = [query_results[TIME_TAKEN] for query_results in result_list]
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        # Add the discarded first run back in when computing how many
        # iterations were executed. The previous unconditional "+ 1"
        # overcounted when nothing had been removed (single-result lists).
        iterations = (len(result_list) + (1 if removed else 0)) // num_clients
        results[AVG] = calculate_avg(times)
        results[STDDEV] = calculate_stddev(times)
        results[NUM_CLIENTS] = num_clients
        results[ITERATIONS] = iterations
def calculate_workload_file_format_runtimes(grouped):
  """Calculate average time for each workload and scale factor, for each file
  format and compression. This returns a new dictionary with average times.
  Here's an example of how this dictionary is structured:
  dictionary->
    (('workload', 'tpch'), ('scale', '300gb'))->
      (('file_format','parquet'), ('compression_codec','zip'),
       ('compression_type','block'))->
        'avg'
  We also have access to the list of QueryResult associated with each
  file_format. The difference between this dictionary and grouped_queries is
  that query name is missing after workload.
  """
  new_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
  # First populate the dictionary with query results, merging every query's
  # runs under its workload/file-format pair.
  for workload_scale, workload in grouped.items():
    for query_name, file_formats in workload.items():
      for file_format, results in file_formats.items():
        new_dict[workload_scale][file_format][RESULT_LIST].extend(results[RESULT_LIST])
  # Do the average calculation. Standard deviation could also be calculated here.
  for workload_scale, file_formats in new_dict.items():
    for file_format, entry in file_formats.items():
      entry[AVG] = calculate_avg(
          [query_results[TIME_TAKEN] for query_results in entry[RESULT_LIST]])
  return new_dict
def calculate_time_stats(grouped):
  """Add statistics to the nested dictionary.

  Each query name is supplemented with the average, standard deviation, number
  of clients, iterations, and a sorted list of the time taken to complete each
  run.
  """
  def remove_first_run(result_list):
    """Removes the earliest result because performance is much worse on the
    first run.

    Only removes when there is more than one result. Returns True if a result
    was removed, False otherwise.
    """
    if len(result_list) > 1:
      result_list.remove(min(result_list, key=lambda result: result['start_time']))
      return True
    return False

  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        removed = remove_first_run(result_list)
        # Build the list of runtimes once; it feeds avg, stddev and SORTED.
        times = [query_results[TIME_TAKEN] for query_results in result_list]
        num_clients = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        results[AVG] = calculate_avg(times)
        results[STDDEV] = calculate_stddev(times)
        results[NUM_CLIENTS] = num_clients
        # Add the discarded first run back in when computing how many
        # iterations were executed. The previous unconditional "+ 1"
        # overcounted when nothing had been removed (single-result lists).
        results[ITERATIONS] = (len(result_list) + (1 if removed else 0)) // num_clients
        results[SORTED] = sorted(times)
def create_exec_result(execution_times, iterations, result_data):
  """Build a QueryExecResult from per-iteration execution times.

  The result is marked successful only when every requested iteration produced
  a timing; the standard deviation is filled in only for multiple iterations.
  """
  exec_result = QueryExecResult()
  exec_result.success = False
  if result_data:
    # Just print the first result returned. There may be additional results if
    # there were multiple iterations executed. Pass the value as a lazy %-arg
    # so the message is only formatted when debug logging is enabled.
    LOG.debug('Data:\n%s\n', result_data[0])
    exec_result.data = result_data[0].split('\n')
  if len(execution_times) == iterations:
    exec_result.avg_time = calculate_avg(execution_times)
    if iterations > 1:
      exec_result.std_dev = calculate_stddev(execution_times)
    exec_result.success = True
  return exec_result
def get_summary_str(grouped):
  """Return a human-readable summary: one pretty-printed table per
  workload/scale factor, listing the average runtime for each file format and
  compression combination.
  """
  sections = []
  for workload_scale, workload in grouped.items():
    sections.append("{0} / {1} \n".format(workload_scale[0][1], workload_scale[1][1]))
    table = prettytable.PrettyTable(["File Format", "Compression", "Avg (s)"])
    table.align = 'l'
    table.float_format = '.2'
    for file_format, queries in workload.items():
      # Average over every query's runs for this file format / compression.
      times = [query_results[TIME_TAKEN]
               for results in queries.values()
               for query_results in results[RESULT_LIST]]
      ff = file_format[0][1]
      compression = file_format[1][1] + " / " + file_format[2][1]
      table.add_row([ff, compression, calculate_avg(times)])
    sections.append(str(table) + '\n')
  return ''.join(sections)
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary.

  We are calculating the average runtime and standard deviation for each query
  type.
  """
  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        result_list = results[RESULT_LIST]
        # Single runtimes list feeds both the average and the deviation.
        times = [query_results[TIME_TAKEN] for query_results in result_list]
        results[AVG] = calculate_avg(times)
        results[STDDEV] = calculate_stddev(times)
        results[NUM_CLIENTS] = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        results[ITERATIONS] = len(result_list)
def get_summary_str(grouped):
  """Build a summary string containing one table per workload and scale
  factor, with the average runtime for every file format / compression pair.
  """
  summary = ''
  for workload_scale, workload in grouped.items():
    summary += "{0} / {1} \n".format(workload_scale[0][1], workload_scale[1][1])
    table = prettytable.PrettyTable(["File Format", "Compression", "Avg (s)"])
    table.align = 'l'
    table.float_format = '.2'
    for file_format, queries in workload.items():
      # The average covers all runs of all queries for this combination.
      avg = calculate_avg([query_results[TIME_TAKEN]
                           for results in queries.values()
                           for query_results in results[RESULT_LIST]])
      compression = file_format[1][1] + " / " + file_format[2][1]
      table.add_row([file_format[0][1], compression, avg])
    summary += str(table) + '\n'
  return summary
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary.

  We are calculating the average runtime and standard deviation for each query
  type. Note the nesting order here is workload_scale -> query_name ->
  file_format.
  """
  for workload_scale, workload in grouped.items():
    for query_name, file_formats in workload.items():
      for file_format, results in file_formats.items():
        # Bind the innermost dict once instead of re-walking four dictionary
        # levels on every statement; this also matches the style of the other
        # calculate_time_stats implementations in this file.
        result_list = results[RESULT_LIST]
        times = [query_results[TIME_TAKEN] for query_results in result_list]
        results[AVG] = calculate_avg(times)
        results[STDDEV] = calculate_stddev(times)
        results[NUM_CLIENTS] = max(
            int(query_results[CLIENT_NAME]) for query_results in result_list)
        results[ITERATIONS] = len(result_list)
def __build_rows(self, exec_summaries):
  """Combine the per-run exec summaries row by row and append to self.rows.

  Fixed fields are copied from the first summary; timing and memory fields are
  aggregated across all summaries (avg/stddev of the average times, max of the
  rest).
  """
  first_exec_summary = exec_summaries[0]
  for row_num, row in enumerate(first_exec_summary):
    # Copy fixed values from the first exec summary.
    combined_row = {key: row[key]
                    for key in (PREFIX, OPERATOR, NUM_HOSTS, NUM_ROWS,
                                EST_NUM_ROWS, DETAIL)}
    avg_times = [summary[row_num][AVG_TIME] for summary in exec_summaries]
    # Set the calculated values.
    combined_row[AVG_TIME] = calculate_avg(avg_times)
    combined_row[STDDEV_TIME] = calculate_stddev(avg_times)
    combined_row[MAX_TIME] = max(
        summary[row_num][MAX_TIME] for summary in exec_summaries)
    combined_row[PEAK_MEM] = max(
        summary[row_num][PEAK_MEM] for summary in exec_summaries)
    combined_row[EST_PEAK_MEM] = max(
        summary[row_num][EST_PEAK_MEM] for summary in exec_summaries)
    self.rows.append(combined_row)
def calculate_time_stats(grouped):
  """Adds statistics to the nested dictionary.

  We are calculating the average runtime and standard deviation for each query
  type.
  """
  for workload_scale, workload in grouped.items():
    for file_format, queries in workload.items():
      for query_name, results in queries.items():
        runs = results[RESULT_LIST]
        # Collect the runtimes once and reuse for every statistic.
        taken = [run[TIME_TAKEN] for run in runs]
        results[AVG] = calculate_avg(taken)
        results[STDDEV] = calculate_stddev(taken)
        results[NUM_CLIENTS] = max(int(run[CLIENT_NAME]) for run in runs)
        results[ITERATIONS] = len(runs)
def construct_exec_result(iterations, query, results):
  """Calculate average running time and standard deviation.

  The summary of the first result is used as the summary for the entire
  execution.
  """
  # Use the output from the first result.
  first = results[0]
  exec_result = QueryExecResult()
  exec_result.query = query
  exec_result.data = first.data
  exec_result.beeswax_result = first
  exec_result.set_result_note(first.summary)
  exec_result.runtime_profile = first.runtime_profile
  # If running more than 2 iterations, throw the first result out. Don't throw
  # away the first result if iterations = 2 to preserve the stddev calculation.
  if iterations > 2:
    results = results[1:]
  runtimes = [result.time_taken for result in results]
  exec_result.success = True
  exec_result.avg_time = calculate_avg(runtimes)
  if iterations > 1:
    exec_result.std_dev = calculate_stddev(runtimes)
  return exec_result