# Assumed module-level imports for this method (not shown in this excerpt):
#   import csv
#   from csv import DictWriter
#   from cStringIO import StringIO
#   from datetime import datetime
#   import ssbench
# `Template` comes from whichever templating engine the reporter uses, and
# REPORT_TIME_FORMAT is a module-level strftime format string.
def generate_default_report(self, output_csv=False):
    """Format a default summary report based on calculated statistics for
    an executed scenario.

    :returns: A report (string) suitable for printing, emailing, etc.
    """
    stats = self.stats
    template = Template(self.scenario_template())
    # Stat bundles consumed by both the text template and the CSV branch:
    # (label, aggregate stats, per-size stats) for TOTAL and each CRUD op.
    tmpl_vars = {
        'size_data': [],
        'stat_list': [
            ('TOTAL', stats['agg_stats'], stats['size_stats']),
            ('CREATE', stats['op_stats'][ssbench.CREATE_OBJECT],
             stats['op_stats'][ssbench.CREATE_OBJECT]['size_stats']),
            ('READ', stats['op_stats'][ssbench.READ_OBJECT],
             stats['op_stats'][ssbench.READ_OBJECT]['size_stats']),
            ('UPDATE', stats['op_stats'][ssbench.UPDATE_OBJECT],
             stats['op_stats'][ssbench.UPDATE_OBJECT]['size_stats']),
            ('DELETE', stats['op_stats'][ssbench.DELETE_OBJECT],
             stats['op_stats'][ssbench.DELETE_OBJECT]['size_stats']),
        ],
        'agg_stats': stats['agg_stats'],
        'nth_pctile': stats['nth_pctile'],
        'start_time': datetime.utcfromtimestamp(
            stats['time_series']['start_time']).strftime(REPORT_TIME_FORMAT),
        'stop_time': datetime.utcfromtimestamp(
            stats['time_series']['stop']).strftime(REPORT_TIME_FORMAT),
        'duration': stats['time_series']['stop'] -
            stats['time_series']['start_time'],
        'jobs_per_worker_stats': stats['jobs_per_worker_stats'],
        'weighted_c': 0.0,
        'weighted_r': 0.0,
        'weighted_u': 0.0,
        'weighted_d': 0.0,
    }
    # One row per size category, plus scenario-wide CRUD percentages
    # weighted by each size's share of the initial file population.
    for size_data in self.scenario.sizes_by_name.values():
        if size_data['size_min'] == size_data['size_max']:
            size_range = '%-15s' % (
                self._format_bytes(size_data['size_min']),)
        else:
            size_range = '%s - %s' % (
                self._format_bytes(size_data['size_min']),
                self._format_bytes(size_data['size_max']))
        initial_files = self.scenario._scenario_data['initial_files']
        initial_total = sum(initial_files.values())
        pct_total = (initial_files.get(size_data['name'], 0) /
                     float(initial_total) * 100.0)
        tmpl_vars['size_data'].append({
            'crud_pcts': ' '.join(map(lambda p: '%2.0f' % p,
                                      size_data['crud_pcts'])),
            'size_range': size_range,
            'size_name': size_data['name'],
            'pct_total_ops': '%3.0f%%' % pct_total,
        })
        tmpl_vars['weighted_c'] += \
            pct_total * size_data['crud_pcts'][0] / 100.0
        tmpl_vars['weighted_r'] += \
            pct_total * size_data['crud_pcts'][1] / 100.0
        tmpl_vars['weighted_u'] += \
            pct_total * size_data['crud_pcts'][2] / 100.0
        tmpl_vars['weighted_d'] += \
            pct_total * size_data['crud_pcts'][3] / 100.0
    if output_csv:
        csv_fields = [
            'scenario_name', 'ssbench_version', 'worker_count',
            'concurrency', 'start_time', 'stop_time', 'duration',
            'delete_after']
        csv_data = {
            'scenario_name': self.scenario.name,
            'ssbench_version': self.scenario.version,
            'worker_count': tmpl_vars['agg_stats']['worker_count'],
            'concurrency': self.scenario.user_count,
            'start_time': tmpl_vars['start_time'],
            'stop_time': tmpl_vars['stop_time'],
            'duration': tmpl_vars['duration'],
            'delete_after': str(self.scenario.delete_after),
        }
        # Flatten every stat bundle into additional (field, value) columns.
        for label, stats, sstats in tmpl_vars['stat_list']:
            label_lc = label.lower()
            if stats.get('req_count', 0):
                self._add_csv_kv(csv_fields, csv_data,
                                 '%s_count' % label_lc, stats['req_count'])
                self._add_csv_kv(csv_fields, csv_data,
                                 '%s_errors' % label_lc, stats['errors'])
                self._add_csv_kv(csv_fields, csv_data,
                                 '%s_retries' % label_lc, stats['retries'])
                self._add_csv_kv(csv_fields, csv_data,
                                 '%s_retry_rate' % label_lc,
                                 '%5.2f' % stats['retry_rate'])
                self._add_csv_kv(csv_fields, csv_data,
                                 '%s_avg_req_per_s' % label_lc,
                                 stats['avg_req_per_sec'])
                self._add_stats_for(csv_fields, csv_data, label, 'all',
                                    stats, tmpl_vars['nth_pctile'])
                for size_str, per_size_stats in sstats.iteritems():
                    if per_size_stats:
                        self._add_stats_for(csv_fields, csv_data, label,
                                            size_str, per_size_stats,
                                            tmpl_vars['nth_pctile'])
        csv_file = StringIO()
        csv_writer = DictWriter(csv_file, csv_fields, lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        csv_writer.writeheader()
        csv_writer.writerow(csv_data)
        return csv_file.getvalue()
    else:
        return template.render(scenario=self.scenario, **tmpl_vars)
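# The CSV branch above builds the report entirely in memory: DictWriter
# emits a header row from `csv_fields` and a single wide data row from
# `csv_data`, with QUOTE_NONNUMERIC so string fields are quoted and numeric
# fields are not. A minimal, self-contained sketch of that same pattern --
# the field names below are hypothetical examples, not ssbench's actual
# metric names (works on Python 2.7; on Python 3 use io.StringIO):

import csv
from csv import DictWriter
try:
    from cStringIO import StringIO  # Python 2, as iteritems() above implies
except ImportError:
    from io import StringIO         # Python 3


def _example_one_row_csv():
    fields = ['scenario_name', 'total_count', 'total_avg_req_per_s']
    row = {'scenario_name': 'demo', 'total_count': 1024,
           'total_avg_req_per_s': 87.5}
    buf = StringIO()
    writer = DictWriter(buf, fields, lineterminator='\n',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()  # header row built from `fields`
    writer.writerow(row)  # one data row; only the string value is quoted
    return buf.getvalue()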
import heapq
import math
import operator

import statlib.stats  # the classic `statlib` package (lmean, lstdev)


def parse_to_logn_and_normalize(problem_set):
    """Parse a problem set, impute missing attribute values, z-score
    normalize the continuous attributes, and log(N)-encode the discrete
    ones. `parse` is assumed to be defined elsewhere in this module; a
    hedged sketch of the undefined `tobin` helper follows this function.
    """
    columns, data = parse(problem_set)

    # Append the number of possible values to each column's metadata tuple.
    for key in columns:
        info = columns[key]
        temp = list(info[:])
        temp.append(len(info[1]))
        columns[key] = tuple(temp)

    # Calculate the attribute statistics so we can figure out the most
    # common value of each attribute.
    stats = {}
    for key in data:
        example = data[key]
        for i in range(0, len(example) - 1):
            if not stats.get(i):
                stats[i] = {}
            if example[i]:
                if stats[i].get(example[i]):
                    stats[i][example[i]] += 1
                else:
                    stats[i][example[i]] = 1

    # Calculate the value that should be filled in for any missing values:
    # the count-weighted mean for continuous attributes, the most common
    # value for discrete ones. (`example` here is the last example bound by
    # the loop above; all examples share the same length.)
    missing_values = {}
    for i in range(0, len(example) - 1):
        possibles = stats[i]
        if "continuous" in columns[i + 1][1]:
            temp = [x * y for x, y in possibles.iteritems()]
            temp2 = sum(possibles.itervalues())
            temp = math.fsum(temp)
            missing_values[i] = temp / temp2
        else:
            most_common = heapq.nlargest(1, possibles.iteritems(),
                                         key=operator.itemgetter(1))
            missing_values[i] = most_common[0][0]

    # Fill in any missing values.
    for key in data:
        example = data[key]
        for i in range(0, len(example) - 1):
            if not example[i]:
                example[i] = missing_values[i]

    # Calculate normalization parameters (mean, stdev) for each continuous
    # attribute.
    normalizers = {}
    for i in range(0, len(example) - 1):
        if "continuous" in columns[i + 1][1]:
            temp = [example[i] for example in data.itervalues()]
            normalizers[i] = (statlib.stats.lmean(temp),
                              statlib.stats.lstdev(temp))

    # Create log(N) mappings: a discrete attribute with N possible values
    # becomes ceil(log2(N)) binary inputs.
    mappings = {}
    for i in range(0, len(example) - 1):
        if "continuous" not in columns[i + 1][1]:
            mappings[i] = {}
            values = columns[i + 1][1]
            num_values = columns[i + 1][2]
            num_inputs = math.ceil(math.log(num_values, 2))
            for j in range(0, len(values)):
                mappings[i][values[j]] = [int(x)
                                          for x in tobin(j, num_inputs)]

    # Convert each example to the log(N) encoding and normalize.
    for key in data:
        example = data[key]
        new_example = []
        for i in range(0, len(example) - 1):
            if "continuous" in columns[i + 1][1]:
                val = example[i]
                new_val = (val - normalizers[i][0]) / normalizers[i][1]
                new_example.append(new_val)
            else:
                new_example.extend(mappings[i][example[i]])
        new_example.append(int(example[-1]))
        data[key] = new_example

    return (columns, data)
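# `tobin` is called above as `[int(x) for x in tobin(j, num_inputs)]` but is
# not defined in this excerpt. A hedged reconstruction consistent with that
# usage: it must render a category index as a zero-padded string of binary
# digits. Note that `num_inputs` comes from math.ceil(), which returns a
# float on Python 2, hence the int() coercion on `width`.

def tobin(value, width):
    """Hypothetical helper: zero-padded binary rendering of an index,
    e.g. tobin(5, 4) -> '0101', so category index 5 maps to [0, 1, 0, 1]."""
    return format(int(value), 'b').zfill(int(width))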