def load(self, filename, metadir=None):
    filename = os.path.expandvars(filename)
    if os.path.isdir(filename):
        self.__use_rootfs = False
    elif os.path.isfile(filename):
        self.__use_rootfs = True
    else:
        print "%s does not exist" % filename
        return False
    if self.__use_rootfs:
        if self.verbose:
            print "loading %s" % filename
        data = ropen(filename)
        if not data:
            print "Could not open %s" % filename
            return False
        if self.coreData:
            self.coreData.Close()
        self.coreData = data
        self.coreDataName = filename
    else:
        self.root = filename
        if self.coreData:
            self.coreData.Close()
    dataroot = os.path.dirname(filename)
    # get metadata
    for meta in ["variables", "datasets", "trees"]:
        metafile = "%s.yml" % meta
        if metadir:
            metafile_user = os.path.join(metadir, metafile)
            if os.path.isfile(metafile_user):
                print "loading %s" % metafile_user
                setattr(self, meta, metadata.load(metafile_user))
                continue
        else:
            if os.path.isfile(metafile):
                print "loading %s" % metafile
                setattr(self, meta, metadata.load(metafile))
                continue
        metafile_data = os.path.join(dataroot, metafile)
        if os.path.isfile(metafile_data):
            print "loading %s" % metafile_data
            setattr(self, meta, metadata.load(metafile_data))
            continue
        if os.environ.has_key('DATAROOT'):
            dataroot_central = os.environ['DATAROOT']
            metafile_central = os.path.join(dataroot_central, metafile)
            if os.path.isfile(metafile_central):
                print "loading %s" % metafile_central
                setattr(self, meta, metadata.load(metafile_central))
                continue
        print "Could not find %s.yml in $DATAROOT, %s or current working directory" % (meta, dataroot)
        return False
    return True

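# Hedged usage sketch for load() above. The class that owns load() is not shown
# in this snippet, so the "Analysis" name, the file name and the metadir below
# are purely illustrative:
#
#   analysis = Analysis()  # hypothetical owner of load()
#   if not analysis.load("$DATAROOT/sample.root", metadir="meta"):
#       raise RuntimeError("could not load the ROOT file or its .yml metadata")
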
def draw(path):
    data = metadata.load(path)
    adf_dist_path = os.path.join(path, "adf_distribution.png")
    if os.path.exists(adf_dist_path):
        print("path exists %s, skip" % adf_dist_path)
        #return
    p_values = {'c': [], 'ct': [], 'ctt': []}
    for srv in data["services"]:
        do_adfuller(path, srv, p_values)
    measurement = os.path.dirname(os.path.join(path, ''))
    ax = plt.subplots(1)[1]
    ax.yaxis.grid()
    labels = [
        "constant",
        "constant + trend",
        "constant, and linear and quadratic trend",
    ]
    ax.hist(p_values.values(),
            22,
            histtype='bar',
            align='mid',
            label=labels,
            alpha=0.4)
    ax.set_xlabel(
        "Distribution of p-value for Augmented Dickey-Fuller test for %s" %
        measurement)
    ax.legend()
    plt.savefig(adf_dist_path)
    print(adf_dist_path)

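# Note on the p_values dict above: the 'c', 'ct' and 'ctt' keys are assumed to
# mirror statsmodels' adfuller(regression=...) options (constant,
# constant + trend, and constant plus linear and quadratic trend), which is
# what the histogram labels spell out.
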
def render(self, view, label_points=True):
    return {
        "nrows": self.nrows,
        "ncols": self.ncols,
        "hasColor": bool(self.color),
        "color": self.color,
        "colorCoding": self.color_coding,
        "colorNominal": self.color_nominal,
        "colorOrdinal": self.color_ordinal,
        "cordering": self.cordering,
        "subplots": [
            s.render(label_points=label_points)
            for s in self.subplots.itervalues()
        ],
        "caption": view.caption,
        "data": load(view)
    }

def main(path):
    data = metadata.load(path)
    result = defaultdict(list)
    for srv in data["services"]:
        process_service(path, srv["name"], result)
    n = os.path.join(path, "scores.tsv")
    print(n)
    pd.DataFrame(result).to_csv(n)

def cluster_services(path):
    data = metadata.load(path)

    def _cluster_service(args):
        import cluster
        return cluster.cluster_service(*args)

    ids = []
    for cluster_size in range(1, 8):
        for service in data["services"]:
            res = lview.apply_async(_cluster_service,
                                    (path, service, cluster_size))
            ids.extend(res.msg_ids)
    return ids

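# Assumption behind the snippet above: "lview" is an ipyparallel load-balanced
# view created elsewhere in the module, e.g. (illustrative only):
#
#   from ipyparallel import Client
#   lview = Client().load_balanced_view()
#
# apply_async() returns an AsyncResult whose msg_ids are collected so that
# callers can wait on the clustering jobs.
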
def find_causality(metadata_path, callgraph_file_path, prev_cluster_metadata):
    # extract the service pairs from the callgraph (.dot file)
    callgraph_pairs = extract_callgraph_pairs(callgraph_file_path)
    # load the metadata.json which summarizes the measurement dir info. extract
    # the names of the services.
    data = metadata.load(metadata_path)
    services = {}
    for srv in data["services"]:
        services[srv["name"]] = srv
    # determine granger causality between services
    for srv_a, srv_b in callgraph_pairs.values():
        compare_services(services[srv_a], services[srv_b], metadata_path,
                         prev_cluster_metadata)

def increase_cluster_size(path):
    queue = Queue(connection=Redis("jobqueue.local"))
    data = metadata.load(path)
    best_score = -1
    best = -1
    for service in data["services"]:
        for key, value in service.get("clusters", {}).items():
            score = value.get("silhouette_score", -1)
            if best_score < score:
                best = int(key)
                best_score = score
        if best in [6, 7]:
            for cluster_size in range(
                    8, min(len(service["preprocessed_fields"]), 15)):
                queue.enqueue_call(func=cluster_service,
                                   args=(path, service, cluster_size),
                                   timeout=3600 * 3)

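# The job queue above is assumed to be python-rq backed by Redis; the imports
# (not shown in this snippet) would look like:
#
#   from redis import Redis
#   from rq import Queue
#
# enqueue_call(func=..., args=..., timeout=...) schedules cluster_service with
# an explicit 3-hour timeout per job.
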
def find_causality(callgraph_path, path):
    data = metadata.load(path)
    call_pairs = load_graph(callgraph_path)
    services = {}
    for srv in data["services"]:
        services[srv["name"]] = srv
    ids = []

    def _compare_services(args):
        from grangercausality import compare_services
        compare_services(*args)

    for srv_a, srv_b in call_pairs:
        res = lview.apply_async(_compare_services,
                                (services[srv_a], services[srv_b], path))
        ids.extend(res.msg_ids)
    return ids

def apply(path):
    data = metadata.load(path)
    for service in data["services"]:
        filename = os.path.join(path, service["filename"])
        df = load_timeseries(filename, service)
        print(service)
        df2 = interpolate_missing(df[service["fields"]])
        classes = classify_series(df2)
        preprocessed_series = {}
        for k in classes["other_fields"]:
            # shorten by one value, because we have to shorten the other one!
            preprocessed_series[k] = df2[k][1:]
        for k in classes["monotonic_fields"]:
            preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
        newname = service["name"] + "-preprocessed.tsv.gz"
        df3 = pd.DataFrame(preprocessed_series)
        df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
        service["preprocessed_filename"] = newname
        service["preprocessed_fields"] = list(df3.columns)
        service.update(classes)
    metadata.save(path, data)

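# Hedged illustration of the contract apply() assumes from classify_series():
# a dict with (at least) "monotonic_fields" and "other_fields", later merged
# into the service entry via service.update(classes). The field names below
# are made up for illustration:
#
#   classes = {
#       "monotonic_fields": ["bytes_sent-total"],  # differenced as "<name>-diff"
#       "other_fields": ["cpu_usage"],             # kept as-is (minus first row)
#   }
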
def write_measurement(measurement, report):
    if measurement.endswith("/"):
        measurement = measurement[:-1]
    title = os.path.basename(measurement)
    data = metadata.load(measurement)
    metrics_count = 0
    metrics_set = set()
    filtered_count = 0
    for srv in data["services"]:
        metrics_count += len(srv["fields"])
        metrics_set.update(srv["fields"])
        filtered_count += len(srv["preprocessed_fields"])
        for i in range(1, cluster_number(srv) + 1):
            clusters = []
            for j in range(1, i + 1):
                name = "%s-cluster-%d_%d.png" % (srv["name"], i, j)
                url = "https://gitlab.com/micro-analytics/measurements2/raw/master/%s/%s" % (
                    title, name)
                clusters.append((name, j, url))
            args = dict(title=title, cluster_size=i, clusters=clusters)
            path = os.path.join(report,
                                "%s-%s-%d.md" % (title, srv["name"], i))
            write_template(CLUSTER, path, **args)
    return title, data["services"]

def run_rca(causality_graphs,
            updated_services="all",
            filter_edges=None,
            excluded_metrics=[]):
    # extract metrics & clusters from metadata
    start_time = time.time()
    for version in causality_graphs:
        causality_graphs[version]['graph'].extract_metadata(
            metadata.load(causality_graphs[version]['dir']),
            add_services=['nova_novncproxy'])
        # causality_graphs[version].print_graph(version, True, False)
        print("%s::run_rca() : extracted \"%s\" metrics & clusters in %s seconds"
              % (sys.argv[0], version, time.time() - start_time))

    # 1st phase of rca is individual metric differences in metadata
    metric_diffs_by_service, totals = causality_graphs['faulty'][
        'graph'].get_metric_diffs(causality_graphs['non-faulty']['graph'])
    # order the metric_diffs_by_service, according to 'total-change'
    metric_diffs_list = sorted(
        list(metric_diffs_by_service),
        key=lambda x: (len(metric_diffs_by_service[x]['new']) +
                       len(metric_diffs_by_service[x]['discarded'])),
        reverse=True)

    # # print a LaTeX formatted table, for paper purposes
    # rca_plots.to_latex_table(metric_diffs_by_service, metric_diffs_list,
    #                          ['new', 'discarded', 'unchanged'])

    print("\n#1 : individual metric differences:")
    table = PrettyTable(['service', 'new', 'discarded', 'unchanged'])
    for service_name in metric_diffs_list:
        # print("%s [NEW] -> %s" % (service_name, metric_diffs_by_service[service_name]['new']))
        # print("%s [DISCARDED] -> %s" % (service_name, metric_diffs_by_service[service_name]['discarded']))
        # print("%s [UNCHANGED] -> %s" % (service_name, metric_diffs_by_service[service_name]['unchanged']))
        table.add_row([
            service_name,
            len(metric_diffs_by_service[service_name]['new']),
            len(metric_diffs_by_service[service_name]['discarded']),
            len(metric_diffs_by_service[service_name]['unchanged'])
        ])
    table.add_row(
        ["TOTALS", totals['new'], totals['discarded'], totals['unchanged']])
    print(table)
    print("")

    # # plot indiv. metrics
    # rca_plots.plot_individual_metrics(metric_diffs_by_service)

    # 2nd phase : cluster differences
    # calculate cluster difference stats
    cluster_diffs = causality_graphs['faulty']['graph'].get_cluster_diffs(
        causality_graphs['non-faulty']['graph'])

    print("\n#2.1 : silhouette scores by service:")
    # print silhouette scores by service
    table = PrettyTable(
        ['service', 'silhouette score non-faulty', 'silhouette score faulty'])
    for service_name in cluster_diffs:
        table.add_row([
            service_name,
            cluster_diffs[service_name]['silhouette-score'][1],
            cluster_diffs[service_name]['silhouette-score'][0]
        ])
    print(table)
    print("")

    print("\n#2.2 : cluster similarity:")
    # print cluster similarity table
    similarities = []
    table = PrettyTable([
        'service', 'cluster non-faulty', 'cluster faulty', 'similarity score'
    ])
    for service_name in cluster_diffs:
        is_first = True
        for rep_metric in cluster_diffs[service_name]['similarity']['f-nf']:
            if not is_first:
                name = ""
            else:
                name = service_name
                is_first = False
            similarities.append(
                cluster_diffs[service_name]['similarity']['f-nf'][rep_metric][1])
            table.add_row([
                name,
                "N/A" if cluster_diffs[service_name]['similarity']['f-nf']
                [rep_metric][0] is None else cluster_diffs[service_name]
                ['similarity']['f-nf'][rep_metric][0].rep_metric,
                rep_metric,
                cluster_diffs[service_name]['similarity']['f-nf'][rep_metric][1]
            ])
    print(table)
    print("")

    print("\n#2.3 : cluster metric differences")
    # print cluster metric diffs table and gather data for a plot showing the
    # number of clusters w/ novelty vs. the total number of clusters
    cluster_novelty = []
    # keep total nr. of clusters for 'All' and 'Top' scopes
    # FIXME: this sounds like i'm doing something wrong, but anyway...
    total_changed = 0
    total_top = 0
    total_all = 0
    # the threshold for the top services
    top_threshold = len(metric_diffs_list)
    top_threshold_str = ('Top %d' % (top_threshold))
    table = PrettyTable(
        ['service', 'cluster', 'new', 'discarded', 'unchanged'])
    for service_name in causality_graphs['faulty']['graph'].clusters:
        is_first = True
        for rep_metric in causality_graphs['faulty']['graph'].clusters[
                service_name]['cluster-table']:
            cluster = causality_graphs['faulty']['graph'].clusters[
                service_name]['cluster-table'][rep_metric]
            if not is_first:
                name = ""
            else:
                name = service_name
                is_first = False
            columns = {}
            for column_name in ['new', 'discarded', 'unchanged']:
                columns[column_name] = (
                    0 if column_name not in cluster.metric_diffs else
                    len(cluster.metric_diffs[column_name]))
            # update the cluster novelty data (for plotting)
            if (columns['new'] > 0) and (columns['discarded'] > 0):
                cluster_novelty.append(('All', 'New\nand\nDiscarded', 1))
                total_changed += 1
                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'New\nand\nDiscarded', 1))
            elif columns['new'] > 0:
                cluster_novelty.append(('All', 'New', 1))
                total_changed += 1
                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'New', 1))
            elif columns['discarded'] > 0:
                cluster_novelty.append(('All', 'Discarded', 1))
                total_changed += 1
                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'Discarded', 1))
            total_all += 1
            if service_name in metric_diffs_list[0:top_threshold]:
                total_top += 1
            table.add_row([
                name, cluster.rep_metric, columns['new'],
                columns['discarded'], columns['unchanged']
            ])
    cluster_novelty.append(('All', 'Changed', total_changed))
    cluster_novelty.append(('All', 'Total', total_all))
    # cluster_novelty.append((top_threshold_str, 'Total', total_top))
    print(table)
    print("")

    print("\n#3.1 : edge differences")
    edge_diffs_stats = []
    cluster_reduction_stats = []
    column_titles = OrderedDict()
    column_titles['new'] = 'New'
    column_titles['discarded'] = 'Discarded'
    column_titles['lag-change'] = 'Lag change'
    # column_titles['changed'] = 'Changed (total)'
    column_titles['unchanged'] = 'Unchanged'
    for similarity_threshold in [0.01, 0.50, 0.60, 0.70]:
        edge_diffs = causality_graphs['faulty']['graph'].get_edge_diffs(
            causality_graphs['non-faulty']['graph'],
            cluster_diffs,
            metric_diffs_list[0:top_threshold],
            similarity_threshold=similarity_threshold)
        # print cluster metric diffs table
        table = PrettyTable(
            ['difference-type', 'new', 'discarded', 'unchanged', 'lag-change'])
        total_edge_diffs = 0
        total_metrics = 0
        included_services = set()
        visited_similarity_clusters = set()
        visited_similarity_services = set()
        metrics_per_service = defaultdict(int)
        for edge_diff_type in edge_diffs:
            columns = {}
            for column_name, column_str in column_titles.iteritems():
                if similarity_threshold == 0.01:
                    similarity_threshold = 0.00
                columns[column_name] = (
                    0 if column_name not in edge_diffs[edge_diff_type] else
                    len(edge_diffs[edge_diff_type][column_name]))
                # FIXME: ugly form of data collection
                if edge_diff_type == 'similarity':
                    # if column_name != 'discarded':
                    edge_diffs_stats.append(
                        (similarity_threshold, column_str,
                         columns[column_name]))
                    if column_name != 'unchanged':
                        total_edge_diffs += columns[column_name]
                if column_name == 'discarded':
                    graph = causality_graphs['non-faulty']['graph']
                else:
                    graph = causality_graphs['faulty']['graph']
                for edge in edge_diffs[edge_diff_type][column_name]:
                    for cluster in edge:
                        if edge_diff_type == 'similarity' or (
                                edge_diff_type == 'novelty'
                                and similarity_threshold == 0.00):
                            if edge[cluster] not in visited_similarity_clusters:
                                total_metrics += len(
                                    graph.clusters[edge[cluster][0]]
                                    ['cluster-table'][edge[cluster][1]].other_metrics)
                                metrics_per_service[edge[cluster][0]] += len(
                                    graph.clusters[edge[cluster][0]]
                                    ['cluster-table'][edge[cluster][1]].other_metrics)
                                visited_similarity_clusters.add(edge[cluster])
                                visited_similarity_services.add(edge[cluster][0])
                            included_services.add(edge[cluster][0])
            table.add_row([
                edge_diff_type, columns['new'], columns['discarded'],
                columns['unchanged'], columns['lag-change']
            ])
        print("included : %s (%d)" %
              (str(included_services), len(included_services)))
        # edge_diffs_stats.append((similarity_threshold, 'Changed (total)', total_edge_diffs))
        cluster_reduction_stats.append(
            (similarity_threshold, 'Services',
             len(visited_similarity_services)))
        # print(visited_similarity_clusters)
        cluster_reduction_stats.append(
            (similarity_threshold, 'Clusters',
             len(visited_similarity_clusters)))
        cluster_reduction_stats.append(
            (similarity_threshold, 'Metrics', total_metrics))
        print("\n\nEDGE DIFFS SUMMARY (SIMILARITY THRESHOLD : %f" %
              (similarity_threshold))
        print("\tMETRICS (%d) : %s" % (total_metrics, metrics_per_service))
        print("\tEDGES")
        for column_name in edge_diffs['similarity']:
            if column_name == 'discarded':
                graph = causality_graphs['non-faulty']['graph']
            else:
                graph = causality_graphs['faulty']['graph']
            for edge in edge_diffs[edge_diff_type][column_name]:
                print("\t(%s) %s" % (column_name, edge))
        print("")
        print(table)
        print("")

    print("\n#3.2 : edge differences (cluster compositions)")
    visited_clusters = []
    similarity_edges = []
    for edge_diff_type in edge_diffs:
        for column_name in edge_diffs[edge_diff_type]:
            if column_name == 'discarded':
                graph = causality_graphs['non-faulty']['graph']
            else:
                graph = causality_graphs['faulty']['graph']
            for edge in edge_diffs[edge_diff_type][column_name]:
                if edge_diff_type == 'similarity':
                    similarity_edges.append(edge)
                for cluster in edge:
                    if edge[cluster] in visited_clusters:
                        continue
                    service_name = edge[cluster][0]
                    p_metric = edge[cluster][1]
                    # print("%s -> %s\n"
                    #       % (str(edge[cluster]),
                    #          str(graph.clusters[service_name]['cluster-table'][p_metric].other_metrics)))
                    visited_clusters.append(edge[cluster])

    rca_plots.draw_edge_differences(edge_diffs['similarity'], 'similarity')
    rca_plots.draw_edge_differences(edge_diffs['novelty'], 'novelty')

    cluster_novelty = pd.DataFrame(cluster_novelty,
                                   columns=['Scope', 'Type', 'nr-clusters'])
    edge_diffs_stats = pd.DataFrame(
        edge_diffs_stats,
        columns=['Similarity threshold', 'Edge diff.', 'nr-edges'])
    cluster_reduction_stats = pd.DataFrame(
        cluster_reduction_stats,
        columns=['Similarity threshold', 'Type', 'nr'])
    print(cluster_reduction_stats)
    print(edge_diffs_stats)
    rca_plots.plot_clusters(cluster_novelty, edge_diffs_stats,
                            cluster_reduction_stats)
    print("")

import os
from flask import Flask, send_file, request
import metadata
import random
import csv

items = metadata.load()

app = Flask(__name__)

IMAGE_FILE_FORMAT = 'data/images_alpha/{:s}.png'

INDEX_HTML = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Dataset Labelling</title>
</head>
<body>
<div id="viewer">
<div id="image" draggable="false" ondragstart="return false;"></div>
<div class="hline" style="top:100px;"></div>
<div class="hline" style="top:150px;"></div>
<div class="hline" style="top:200px;"></div>
<div class="hline" style="top:250px;"></div>
<div class="hline" style="top:300px;"></div>
<div class="hline" style="top:350px;"></div>
<div class="hline" style="top:400px;"></div>
<div class="hline" style="top:450px;"></div>
<div class="hline" style="top:500px;"></div>

def draw(path):
    data = metadata.load(path)
    p_values_pearson = []
    p_values_shapiro = []
    norm_dist_path = os.path.join(path, "normtest_distribution.png")
    if os.path.exists(norm_dist_path):
        print("path exists %s, skip" % norm_dist_path)
        #return
    for srv in data["services"]:
        filename = os.path.join(path, srv["filename"])
        df = load_timeseries(filename, srv)
        columns = []
        for c in df.columns:
            if (not df[c].isnull().all()) and df[c].var() != 0:
                columns.append(c)
        df = df[columns]
        n = len(columns)
        if n == 0:
            continue
        fig, axis = plt.subplots(n, 2)
        fig.set_figheight(n * 4)
        fig.set_figwidth(30)
        for i, col in enumerate(df.columns):
            serie = df[col].dropna()
            sns.boxplot(x=serie, ax=axis[i, 0])
            statistic_1, p_value_1 = normaltest(serie)
            p_values_pearson.append(p_value_1)
            statistic_2, p_value_2 = shapiro(serie)
            p_values_shapiro.append(p_value_2)
            templ = """Pearson's normtest:
statistic: %f
p-value: %E -> %s

Shapiro-Wilk test for normality:
statistic: %f
p-value: %E -> %s
"""
            outcome_1 = "not normally distributed" if p_value_1 < 0.05 else "normally distributed"
            outcome_2 = "not normally distributed" if p_value_2 < 0.05 else "normally distributed"
            text = templ % (statistic_1, p_value_1, outcome_1, statistic_2,
                            p_value_2, outcome_2)
            axis[i, 1].axis('off')
            axis[i, 1].text(0.05, 0.05, text, fontsize=18)
        plot_path = os.path.join(path, "%s_normtest.png" % srv["name"])
        plt.savefig(plot_path)
        print(plot_path)
    fig, axis = plt.subplots(2)
    fig.set_figheight(8)
    measurement = os.path.dirname(os.path.join(path, ''))
    name = "Distribution of p-value for Pearson's normtest for %s" % measurement
    plot = sns.distplot(pd.Series(p_values_pearson, name=name),
                        rug=True,
                        kde=False,
                        norm_hist=False,
                        ax=axis[0])
    name = "Distribution of p-value for Shapiro-Wilk's normtest for %s" % measurement
    plot = sns.distplot(pd.Series(p_values_shapiro, name=name),
                        rug=True,
                        kde=False,
                        norm_hist=False,
                        ax=axis[1])
    fig.savefig(norm_dist_path)
    print(norm_dist_path)

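# normaltest() and shapiro() above are assumed to be scipy.stats.normaltest
# (the D'Agostino-Pearson test) and scipy.stats.shapiro; both return a
# (statistic, p-value) pair, which is why each p-value is compared against
# 0.05 before labelling the series.
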
# LyfeOnEdge 2020
# Written for Open Shop Channel Project
import io, json
import metadata

# load metadata json
metadata = metadata.Metadata()
metadata.load()


# python object to parse converted hbb list file
class hbbjsonparser(object):
    def __init__(self):
        self.init()

    def init(self):
        self.all = []
        self.demos = []
        self.emulators = []
        self.games = []
        self.media = []
        self.utilities = []
        self.map = {
            "demos": self.demos,
            "emulators": self.emulators,
            "games": self.games,
            "media": self.media,
            "utilities": self.utilities,
        }

def cluster_words(words, service_name, size):
    stopwords = [
        "GET", "POST", "total", "http-requests", service_name, "-", "_"
    ]
    cleaned_words = []
    for word in words:
        for stopword in stopwords:
            word = word.replace(stopword, "")
        cleaned_words.append(word)

    def distance(coord):
        i, j = coord
        return 1 - jaro_distance(cleaned_words[i], cleaned_words[j])

    indices = np.triu_indices(len(words), 1)
    distances = np.apply_along_axis(distance, 0, indices)
    return cluster_of_size(linkage(distances), size)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: %s measurement" % sys.argv[0])
        sys.exit(1)
    data = metadata.load(sys.argv[1])
    for srv in data["services"]:
        words = srv["preprocessed_fields"]
        print("### %s ###" % srv["name"])
        clusters = cluster_words(words, srv["name"], 10)
        for i, cluster in enumerate(clusters):
            print(i, [words[idx] for idx in cluster])

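# Note on cluster_words() above: np.triu_indices(len(words), 1) enumerates all
# i < j index pairs, so "distances" is a condensed distance vector of length
# n*(n-1)/2, the format scipy's linkage() accepts directly. jaro_distance is
# assumed to come from the jellyfish package (a similarity in [0, 1]), hence
# the "1 - jaro_distance(...)" dissimilarity.
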
def loadMetadata(path):
    try:
        return metadata.load(path)
    except IOError:
        print(u"Cannot load {}.".format(path))
        return None

import json
import math
import numpy as np
import metadata

butterflies_by_image_id = {i.image_id: i for i in metadata.load()}

strings = []
name_ids = {}


def get_name_id(value):
    if value not in name_ids:
        name_ids[value] = len(strings)
        strings.append(value)
    return name_ids[value]


def create_json_dict(item, x, y):
    result = {
        'x': x,
        'y': y,
        'occId': item.occurence_id,
        'image': item.image_id,
        'properties': [
            get_name_id(p) for p in (item.family, item.genus, item.species,
                                     item.subspecies,

"--initial-cluster-dir", help = """dir w/ clustered data from which to get prev. cluster assigments.""") parser.add_argument( "--callgraph", help = """path to callgraph .dot file. default is 'openstack-callgraph.dot' (on the local dir).""") args = parser.parse_args() # quit if a dir w/ measurement files hasn't been provided if not args.msr_dir: sys.stderr.write("""%s: [ERROR] please pass a dir w/ clustered data as '--msr-dir'\n""" % sys.argv[0]) parser.print_help() sys.exit(1) if args.initial_cluster_dir: prev_cluster_metadata = metadata.load(args.initial_cluster_dir) else: prev_cluster_metadata = None # choose the default .dot callgraph if one hasn't been provided if not args.callgraph: callgraph_file_path = DEFAULT_CALLGRAPH_FILE_PATH else: callgraph_file_path = args.callgraph find_causality(args.msr_dir, callgraph_file_path, prev_cluster_metadata)
    for i, col in enumerate(df.columns):
        serie = df[col].dropna()
        if pd.algos.is_monotonic_float64(serie.values, False)[0]:
            serie = serie.diff()[1:]
        p_value = adfuller(serie, autolag='AIC')[1]
        if math.isnan(p_value):
            continue
        nearest = 0.05 * round(p_value / 0.05)
        bins[nearest].append(serie)
    for bin, members in bins.items():
        series = [serie.name for serie in members]
        if len(members) <= 10:
            columns = series
        else:
            columns = random.sample(series, 10)
        subset = df[columns]
        name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
        print(name)
        axes = subset.plot(subplots=True)
        plt.savefig(os.path.join(path, name))
        plt.close("all")


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: %s measurement\n" % sys.argv[0])
        sys.exit(1)
    for path in sys.argv[1:]:
        services = metadata.load(path)["services"]
        for srv in services:
            draw(path, srv)

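# Worked example of the 0.05-wide binning above (numbers illustrative):
#   p_value = 0.037  ->  0.05 * round(0.037 / 0.05) = 0.05
#   p_value = 0.012  ->  0.05 * round(0.012 / 0.05) = 0.00
# i.e. each series is grouped by its ADF p-value rounded to the nearest 0.05.
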
"--initial-cluster-dir", help="""dir w/ clustered data from which to derive initial cluster assigments.""") args = parser.parse_args() # quit if a dir w/ causality files hasn't been provided if not args.msr_dir: sys.stderr.write( """%s: [ERROR] please supply 1 measurement data dir\n""" % sys.argv[0]) parser.print_help() sys.exit(1) if args.initial_cluster_dir: prev_metadata = metadata.load(args.initial_cluster_dir) else: prev_metadata = None last_cluster_size = defaultdict(int) start_time = datetime.utcnow() for n in range(2, 7): # to reduce clustering time, use paralellism pool = mp.Pool(mp.cpu_count()) # tasks to run in paralell tasks = [] for srv in metadata.load(args.msr_dir)["services"]: