Example #1
    def load(self, filename, metadir=None):
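        # Expand environment variables in the path and treat it either as a
        # single ROOT file or as a data directory, then look for the
        # variables/datasets/trees .yml metadata in metadir (if given) or in
        # the current directory, the data directory and $DATAROOT.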

        filename = os.path.expandvars(filename)
        if os.path.isdir(filename):
            self.__use_rootfs = False
        elif os.path.isfile(filename):
            self.__use_rootfs = True
        else:
            print "%s does not exist" % filename
            return False
        if self.__use_rootfs:
            if self.verbose: print "loading %s" % filename
            data = ropen(filename)
            if not data:
                print "Could not open %s" % filename
                return False
            if self.coreData:
                self.coreData.Close()
            self.coreData = data
            self.coreDataName = filename
        else:
            self.root = filename
            if self.coreData:
                self.coreData.Close()
        dataroot = os.path.dirname(filename)
        # get metadata
        for meta in ["variables", "datasets", "trees"]:
            metafile = "%s.yml" % meta
            if metadir:
                metafile_user = os.path.join(metadir, metafile)
                if os.path.isfile(metafile_user):
                    print "loading %s" % metafile_user
                    setattr(self, meta, metadata.load(metafile_user))
                    continue
            else:
                if os.path.isfile(metafile):
                    print "loading %s" % metafile
                    setattr(self, meta, metadata.load(metafile))
                    continue
                metafile_data = os.path.join(dataroot, metafile)
                if os.path.isfile(metafile_data):
                    print "loading %s" % metafile_data
                    setattr(self, meta, metadata.load(metafile_data))
                    continue
                if os.environ.has_key('DATAROOT'):
                    dataroot_central = os.environ['DATAROOT']
                    metafile_central = os.path.join(dataroot_central, metafile)
                    if os.path.isfile(metafile_central):
                        print "loading %s" % metafile_central
                        setattr(self, meta, metadata.load(metafile_central))
                        continue
            print "Could not find %s.yml in $DATAROOT, %s or current working directory" % (
                meta, dataroot)
            return False
        return True
def draw(path):
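    # Run the Augmented Dickey-Fuller test (via do_adfuller) for every service
    # in the measurement and plot a histogram of the p-values for the three
    # regression variants (constant, constant+trend, constant+quadratic trend).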
    data = metadata.load(path)
    adf_dist_path = os.path.join(path, "adf_distribution.png")
    if os.path.exists(adf_dist_path):
        print("path exists %s, skip" % adf_dist_path)
        #return
    p_values = {'c': [], 'ct': [], 'ctt': []}
    for srv in data["services"]:
        do_adfuller(path, srv, p_values)

    measurement = os.path.dirname(os.path.join(path, ''))
    ax = plt.subplots(1)[1]
    ax.yaxis.grid()
    labels = [
        "constant", "constant + trend",
        "constant, and linear and quadratic trend"
    ]
    ax.hist(p_values.values(),
            22,
            histtype='bar',
            align='mid',
            label=labels,
            alpha=0.4)
    ax.set_xlabel(
        "Distribution of p-value for Augmented Dickey-Fuller test for %s" %
        measurement)
    ax.legend()
    plt.savefig(adf_dist_path)
    print(adf_dist_path)
Example #5
def main(path):
    data = metadata.load(path)
    result = defaultdict(list)
    for srv in data["services"]:
        process_service(path, srv["name"], result)
    n = os.path.join(path, "scores.tsv")
    print(n)
    pd.DataFrame(result).to_csv(n)
def cluster_services(path):
    data = metadata.load(path)

    def _cluster_service(args):
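        # shipped to remote engines via lview (assumed to be an IPython
        # parallel load-balanced view defined elsewhere in the module);
        # importing inside the function makes the import resolve on the engine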
        import cluster
        return cluster.cluster_service(*args)

    ids = []
    for cluster_size in range(1, 8):
        for service in data["services"]:
            res = lview.apply_async(_cluster_service,
                                    (path, service, cluster_size))
            ids.extend(res.msg_ids)
    return ids
Example #7
 def render(self, view, label_points=True):
     return {
         "nrows": self.nrows,
         "ncols": self.ncols,
         "hasColor": bool(self.color),
         "color": self.color,
         "colorCoding": self.color_coding,
         "colorNominal": self.color_nominal,
         "colorOrdinal": self.color_ordinal,
         "cordering": self.cordering,
         "subplots": [s.render(label_points=label_points) for s in self.subplots.itervalues()],
         "caption": view.caption,
         "data": load(view)
     }
def find_causality(metadata_path, callgraph_file_path, prev_cluster_metadata):

    # extract the service pairs from the callgraph (.dot file)
    callgraph_pairs = extract_callgraph_pairs(callgraph_file_path)

    # load the metadata.json which summarizes the measurement dir info. extract 
    # the names of the services.
    data = metadata.load(metadata_path)
    services = {}
    for srv in data["services"]:
        services[srv["name"]] = srv

    # determine granger causality between services
    for srv_a, srv_b in callgraph_pairs.values():
        compare_services(services[srv_a], services[srv_b], metadata_path, prev_cluster_metadata)
def increase_cluster_size(path):
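    # For services whose best silhouette score was reached at the top of the
    # tried range (6 or 7 clusters), enqueue further clustering jobs with
    # larger cluster sizes on the Redis-backed job queue.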
    queue = Queue(connection=Redis("jobqueue.local"))
    data = metadata.load(path)
    for service in data["services"]:
        # reset per service so one service's best score cannot leak into the next
        best_score = -1
        best = -1
        for key, value in service.get("clusters", {}).items():
            score = value.get("silhouette_score", -1)
            if best_score < score:
                best = int(key)
                best_score = score
        if best in [6, 7]:
            for cluster_size in range(
                    8, min(len(service["preprocessed_fields"]), 15)):
                queue.enqueue_call(func=cluster_service,
                                   args=(path, service, cluster_size),
                                   timeout=3600 * 3)
def find_causality(callgraph_path, path):
    data = metadata.load(path)
    call_pairs = load_graph(callgraph_path)
    services = {}
    for srv in data["services"]:
        services[srv["name"]] = srv
    ids = []

    def _compare_services(args):
        from grangercausality import compare_services
        compare_services(*args)

    for srv_a, srv_b in call_pairs:
        res = lview.apply_async(_compare_services,
                                (services[srv_a], services[srv_b], path))
        ids.extend(res.msg_ids)
    return ids
def apply(path):
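    # For every service: interpolate gaps, difference monotonic fields, write
    # the preprocessed series to <name>-preprocessed.tsv.gz and record the new
    # filename and field list in the measurement metadata.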
    data = metadata.load(path)
    for service in data["services"]:
        filename = os.path.join(path, service["filename"])
        df = load_timeseries(filename, service)
        print(service)
        df2 = interpolate_missing(df[service["fields"]])
        classes = classify_series(df2)
        preprocessed_series = {}
        for k in classes["other_fields"]:
            # drop the first value so these series line up with the differenced ones below
            preprocessed_series[k] = df2[k][1:]
        for k in classes["monotonic_fields"]:
            preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
        newname = service["name"] + "-preprocessed.tsv.gz"
        df3 = pd.DataFrame(preprocessed_series)
        df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
        service["preprocessed_filename"] = newname
        service["preprocessed_fields"] = list(df3.columns)
        service.update(classes)
    metadata.save(path, data)
def write_measurement(measurement, report):
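    # Count raw and preprocessed metrics for one measurement and write a
    # markdown page per service and cluster size, linking the cluster plots in
    # the measurements2 repository.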
    if measurement.endswith("/"):
        measurement = measurement[:-1]
    title = os.path.basename(measurement)
    data = metadata.load(measurement)
    metrics_count = 0
    metrics_set = set()
    filtered_count = 0
    for srv in data["services"]:
        metrics_count += len(srv["fields"])
        metrics_set.update(srv["fields"])
        filtered_count += len(srv["preprocessed_fields"])
        for i in range(1, cluster_number(srv) + 1):
            clusters = []
            for j in range(1, i + 1):
                name = "%s-cluster-%d_%d.png" % (srv["name"], i, j)
                url = "https://gitlab.com/micro-analytics/measurements2/raw/master/%s/%s" % (
                    title, name)
                clusters.append((name, j, url))
            args = dict(title=title, cluster_size=i, clusters=clusters)
            path = os.path.join(report,
                                "%s-%s-%d.md" % (title, srv["name"], i))
            write_template(CLUSTER, path, **args)
    return title, data["services"]
Example #13
def run_rca(causality_graphs,
            updated_services="all",
            filter_edges=None,
            excluded_metrics=[]):
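    # causality_graphs is expected to map 'faulty' / 'non-faulty' to a dict
    # holding the causality graph under 'graph' and its measurement dir under 'dir'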

    # extract metrics & clusters from metadata
    start_time = time.time()

    for version in causality_graphs:
        causality_graphs[version]['graph'].extract_metadata(
            metadata.load(causality_graphs[version]['dir']),
            add_services=['nova_novncproxy'])
        # causality_graphs[version].print_graph(version, True, False)
        print(
            "%s::run_rca() : extracted \"%s\" metrics & clusters in %s seconds"
            % (sys.argv[0], version, time.time() - start_time))

    # 1st phase of rca is individual metric differences in metadata
    metric_diffs_by_service, totals = causality_graphs['faulty'][
        'graph'].get_metric_diffs(causality_graphs['non-faulty']['graph'])

    # order the metric_diffs_by_service, according to 'total-change'
    metric_diffs_list = sorted(list(metric_diffs_by_service),
                               key=lambda x:
                               (len(metric_diffs_by_service[x]['new']) + len(
                                   metric_diffs_by_service[x]['discarded'])),
                               reverse=True)

    # # print a LaTEX formatted table, for paper purposes
    # rca_plots.to_latex_table(metric_diffs_by_service, metric_diffs_list, ['new', 'discarded', 'unchanged'])

    print("\n#1 : individual metric differences:")
    table = PrettyTable(['service', 'new', 'discarded', 'unchanged'])
    for service_name in metric_diffs_list:

        # print("%s [NEW] -> %s" % (service_name, metric_diffs_by_service[service_name]['new']))
        # print("%s [DISCARDED] -> %s" % (service_name, metric_diffs_by_service[service_name]['discarded']))
        # print("%s [UNCHANGED] -> %s" % (service_name, metric_diffs_by_service[service_name]['unchanged']))

        table.add_row([
            service_name,
            len(metric_diffs_by_service[service_name]['new']),
            len(metric_diffs_by_service[service_name]['discarded']),
            len(metric_diffs_by_service[service_name]['unchanged'])
        ])
    table.add_row(
        ["TOTALS", totals['new'], totals['discarded'], totals['unchanged']])
    print(table)
    print("")

    # # plot indiv. metrics
    # rca_plots.plot_individual_metrics(metric_diffs_by_service)

    # 2nd phase : cluster differences

    # calculate cluster difference stats
    cluster_diffs = causality_graphs['faulty']['graph'].get_cluster_diffs(
        causality_graphs['non-faulty']['graph'])

    print("\n#2.1 : silhouette scores by service:")
    # print silhouette scores by service
    table = PrettyTable(
        ['service', 'silhouette score non-faulty', 'silhouette score faulty'])
    for service_name in cluster_diffs:
        table.add_row([
            service_name, cluster_diffs[service_name]['silhouette-score'][1],
            cluster_diffs[service_name]['silhouette-score'][0]
        ])

    print(table)
    print("")

    print("\n#2.2 : cluster similarity:")
    # print cluster similarity table
    similarities = []

    table = PrettyTable([
        'service', 'cluster non-faulty', 'cluster faulty', 'similarity score'
    ])
    for service_name in cluster_diffs:

        is_first = True
        for rep_metric in cluster_diffs[service_name]['similarity']['f-nf']:

            if not is_first:
                name = ""
            else:
                name = service_name
                is_first = False

            match = cluster_diffs[service_name]['similarity']['f-nf'][rep_metric]
            similarities.append(match[1])
            table.add_row([
                name,
                "N/A" if match[0] is None else match[0].rep_metric,
                rep_metric,
                match[1],
            ])

    print(table)
    print("")

    print("\n#2.3 : cluster metric differences")

    # print cluster metric diffs table and gather data for a plot showing the
    # number of clusters w/ novelty vs. the total number of clusters
    cluster_novelty = []

    # keep total nr. of clusters for 'All' and 'Top' scopes
    # FIXME: this sounds like i'm doing something wrong, but anyway...
    total_changed = 0
    total_top = 0
    total_all = 0

    # the threshold for the top services
    top_threshold = len(metric_diffs_list)
    top_threshold_str = ('Top %d' % (top_threshold))

    table = PrettyTable(
        ['service', 'cluster', 'new', 'discarded', 'unchanged'])
    for service_name in causality_graphs['faulty']['graph'].clusters:

        is_first = True
        for rep_metric in causality_graphs['faulty']['graph'].clusters[
                service_name]['cluster-table']:

            cluster = causality_graphs['faulty']['graph'].clusters[
                service_name]['cluster-table'][rep_metric]

            if not is_first:
                name = ""
            else:
                name = service_name
                is_first = False

            columns = {}

            for column_name in ['new', 'discarded', 'unchanged']:
                columns[column_name] = (0 if column_name not in cluster.metric_diffs
                                        else len(cluster.metric_diffs[column_name]))

            # update the cluster novelty data (for plotting)
            if (columns['new'] > 0) and (columns['discarded'] > 0):
                cluster_novelty.append(('All', 'New\nand\nDiscarded', 1))
                total_changed += 1

                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'New\nand\nDiscarded', 1))

            elif columns['new'] > 0:
                cluster_novelty.append(('All', 'New', 1))
                total_changed += 1

                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'New', 1))

            elif columns['discarded'] > 0:
                cluster_novelty.append(('All', 'Discarded', 1))
                total_changed += 1

                # if service_name in metric_diffs_list[0:top_threshold]:
                #     cluster_novelty.append((top_threshold_str, 'Discarded', 1))

            total_all += 1
            if service_name in metric_diffs_list[0:top_threshold]:
                total_top += 1

            table.add_row([
                name, cluster.rep_metric, columns['new'], columns['discarded'],
                columns['unchanged']
            ])

    cluster_novelty.append(('All', 'Changed', total_changed))
    cluster_novelty.append(('All', 'Total', total_all))
    # cluster_novelty.append((top_threshold_str, 'Total', total_top))

    print(table)
    print("")

    print("\n#3.1 : edge differences")

    edge_diffs_stats = []
    cluster_reduction_stats = []

    column_titles = OrderedDict()
    column_titles['new'] = 'New'
    column_titles['discarded'] = 'Discarded'
    column_titles['lag-change'] = 'Lag change'
    # column_titles['changed'] = 'Changed (total)'
    column_titles['unchanged'] = 'Unchanged'

    for similarity_threshold in [0.01, 0.50, 0.60, 0.70]:
        edge_diffs = causality_graphs['faulty']['graph'].get_edge_diffs(
            causality_graphs['non-faulty']['graph'],
            cluster_diffs,
            metric_diffs_list[0:top_threshold],
            similarity_threshold=similarity_threshold)

        # print cluster metric diffs table
        table = PrettyTable(
            ['difference-type', 'new', 'discarded', 'unchanged', 'lag-change'])

        total_edge_diffs = 0
        total_metrics = 0
        included_services = set()
        visited_similarity_clusters = set()
        visited_similarity_services = set()
        metrics_per_service = defaultdict(int)

        for edge_diff_type in edge_diffs:

            columns = {}

            for column_name, column_str in column_titles.iteritems():

                if similarity_threshold == 0.01:
                    similarity_threshold = 0.00

                columns[column_name] = (0 if column_name not in edge_diffs[edge_diff_type]
                                        else len(edge_diffs[edge_diff_type][column_name]))

                # FIXME: ugly form of data collection
                if edge_diff_type == 'similarity':

                    # if column_name != 'discarded':
                    edge_diffs_stats.append((similarity_threshold, column_str,
                                             columns[column_name]))
                    if column_name != 'unchanged':
                        total_edge_diffs += columns[column_name]

                if column_name == 'discarded':
                    graph = causality_graphs['non-faulty']['graph']
                else:
                    graph = causality_graphs['faulty']['graph']

                for edge in edge_diffs[edge_diff_type][column_name]:
                    for cluster in edge:
                        if edge_diff_type == 'similarity' or (
                                edge_diff_type == 'novelty'
                                and similarity_threshold == 0.00):

                            if edge[cluster] not in visited_similarity_clusters:
                                cluster_metrics = graph.clusters[edge[cluster][0]][
                                    'cluster-table'][edge[cluster][1]].other_metrics
                                total_metrics += len(cluster_metrics)
                                metrics_per_service[edge[cluster][0]] += len(cluster_metrics)

                            visited_similarity_clusters.add(edge[cluster])
                            visited_similarity_services.add(edge[cluster][0])

                            included_services.add(edge[cluster][0])

            table.add_row([
                edge_diff_type, columns['new'], columns['discarded'],
                columns['unchanged'], columns['lag-change']
            ])

        print("included : %s (%d)" %
              (str(included_services), len(included_services)))

        # edge_diffs_stats.append((similarity_threshold, 'Changed (total)', total_edge_diffs))
        cluster_reduction_stats.append((similarity_threshold, 'Services',
                                        len(visited_similarity_services)))
        # print(visited_similarity_clusters)
        cluster_reduction_stats.append((similarity_threshold, 'Clusters',
                                        len(visited_similarity_clusters)))
        cluster_reduction_stats.append(
            (similarity_threshold, 'Metrics', total_metrics))

        print("\n\nEDGE DIFFS SUMMARY (SIMILARITY THRESHOLD : %f" %
              (similarity_threshold))
        print("\tMETRICS (%d) : %s" % (total_metrics, metrics_per_service))
        print("\tEDGES")
        for column_name in edge_diffs['similarity']:

            if column_name == 'discarded':
                graph = causality_graphs['non-faulty']['graph']
            else:
                graph = causality_graphs['faulty']['graph']

            for edge in edge_diffs['similarity'][column_name]:
                print("\t(%s) %s" % (column_name, edge))

    print("")
    print(table)
    print("")

    print("\n#3.2 : edge differences (cluster compositions)")

    visited_clusters = []
    similarity_edges = []

    for edge_diff_type in edge_diffs:
        for column_name in edge_diffs[edge_diff_type]:

            if column_name == 'discarded':
                graph = causality_graphs['non-faulty']['graph']
            else:
                graph = causality_graphs['faulty']['graph']

            for edge in edge_diffs[edge_diff_type][column_name]:

                if edge_diff_type == 'similarity':
                    similarity_edges.append(edge)

                for cluster in edge:

                    if edge[cluster] in visited_clusters:
                        continue

                    service_name = edge[cluster][0]
                    p_metric = edge[cluster][1]
                    # print("%s -> %s\n"
                    #     % (str(edge[cluster]), str(graph.clusters[service_name]['cluster-table'][p_metric].other_metrics)))

                    visited_clusters.append(edge[cluster])

    rca_plots.draw_edge_differences(edge_diffs['similarity'], 'similarity')
    rca_plots.draw_edge_differences(edge_diffs['novelty'], 'novelty')

    cluster_novelty = pd.DataFrame(cluster_novelty,
                                   columns=['Scope', 'Type', 'nr-clusters'])
    edge_diffs_stats = pd.DataFrame(
        edge_diffs_stats,
        columns=['Similarity threshold', 'Edge diff.', 'nr-edges'])
    cluster_reduction_stats = pd.DataFrame(
        cluster_reduction_stats,
        columns=['Similarity threshold', 'Type', 'nr'])
    print(cluster_reduction_stats)
    print(edge_diffs_stats)
    rca_plots.plot_clusters(cluster_novelty, edge_diffs_stats,
                            cluster_reduction_stats)

    print("")
Example #14
import os
from flask import Flask, send_file, request
import metadata
import random
import csv

items = metadata.load()

app = Flask(__name__)

IMAGE_FILE_FORMAT = 'data/images_alpha/{:s}.png'

INDEX_HTML = '''<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="utf-8">
        <title>Dataset Labelling</title>
    </head>

    <body>
        <div id="viewer">
            <div id="image" draggable="false" ondragstart="return false;"></div>
            <div class="hline" style="top:100px;"></div>
            <div class="hline" style="top:150px;"></div>
            <div class="hline" style="top:200px;"></div>
            <div class="hline" style="top:250px;"></div>
            <div class="hline" style="top:300px;"></div>
            <div class="hline" style="top:350px;"></div>
            <div class="hline" style="top:400px;"></div>
            <div class="hline" style="top:450px;"></div>
            <div class="hline" style="top:500px;"></div>
Example #15
def draw(path):
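    # Box-plot every usable metric per service and run two normality tests
    # (normaltest and shapiro); collect all p-values and plot their
    # distributions for the whole measurement.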
    data = metadata.load(path)
    p_values_pearson = []
    p_values_shapiro = []
    norm_dist_path = os.path.join(path, "normtest_distribution.png")
    if os.path.exists(norm_dist_path):
        print("path exists %s, skip" % norm_dist_path)
        #return
    for srv in data["services"]:
        filename = os.path.join(path, srv["filename"])
        df = load_timeseries(filename, srv)
        columns = []
        for c in df.columns:
            if (not df[c].isnull().all()) and df[c].var() != 0:
                columns.append(c)
        df = df[columns]
        n = len(columns)
        if n == 0:
            continue
        fig, axis = plt.subplots(n, 2)
        fig.set_figheight(n * 4)
        fig.set_figwidth(30)

        for i, col in enumerate(df.columns):
            serie = df[col].dropna()
            sns.boxplot(x=serie, ax=axis[i, 0])
            statistic_1, p_value_1 = normaltest(serie)
            p_values_pearson.append(p_value_1)
            statistic_2, p_value_2 = shapiro(serie)
            p_values_shapiro.append(p_value_2)
            templ = """Pearson's normtest:
statistic: %f
p-value: %E
-> %s

Shapiro-Wilk test for normality:
statistic: %f
p-value: %E
-> %s
"""
            outcome_1 = "not normally distributed" if p_value_1 < 0.05 else "normally distributed"
            outcome_2 = "not normally distributed" if p_value_2 < 0.05 else "normally distributed"
            text = templ % (statistic_1, p_value_1, outcome_1, statistic_2,
                            p_value_2, outcome_2)
            axis[i, 1].axis('off')
            axis[i, 1].text(0.05, 0.05, text, fontsize=18)
        plot_path = os.path.join(path, "%s_normtest.png" % srv["name"])
        plt.savefig(plot_path)
        print(plot_path)

    fig, axis = plt.subplots(2)
    fig.set_figheight(8)
    measurement = os.path.dirname(os.path.join(path, ''))
    name = "Distribution of p-value for Pearson's normtest for %s" % measurement
    plot = sns.distplot(pd.Series(p_values_pearson, name=name),
                        rug=True,
                        kde=False,
                        norm_hist=False,
                        ax=axis[0])
    name = "Distribution of p-value for Shapiro-Wilk's normtest for %s" % measurement
    plot = sns.distplot(pd.Series(p_values_shapiro, name=name),
                        rug=True,
                        kde=False,
                        norm_hist=False,
                        ax=axis[1])
    fig.savefig(norm_dist_path)
    print(norm_dist_path)
Example #16
# LyfeOnEdge 2020
# Written for Open Shop Channel Project

import io, json
import metadata

# load metadata json
metadata = metadata.Metadata()
metadata.load()


# python object to parse converted hbb list file
class hbbjsonparser(object):
    def __init__(self):
        self.init()

    def init(self):
        self.all = []
        self.demos = []
        self.emulators = []
        self.games = []
        self.media = []
        self.utilities = []

        self.map = {
            "demos": self.demos,
            "emulators": self.emulators,
            "games": self.games,
            "media": self.media,
            "utilities": self.utilities,
        }
def cluster_words(words, service_name, size):
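    # Strip common tokens from the metric names, build the condensed matrix of
    # pairwise Jaro dissimilarities (1 - jaro_distance) and cut the
    # hierarchical linkage into `size` clusters.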
    stopwords = [
        "GET", "POST", "total", "http-requests", service_name, "-", "_"
    ]
    cleaned_words = []
    for word in words:
        for stopword in stopwords:
            word = word.replace(stopword, "")
        cleaned_words.append(word)

    def distance(coord):
        i, j = coord
        return 1 - jaro_distance(cleaned_words[i], cleaned_words[j])

    indices = np.triu_indices(len(words), 1)
    distances = np.apply_along_axis(distance, 0, indices)
    return cluster_of_size(linkage(distances), size)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: %s measurement" % sys.argv[0])
        sys.exit(1)
    data = metadata.load(sys.argv[1])
    for srv in data["services"]:
        words = srv["preprocessed_fields"]
        print("### %s ###" % srv["name"])
        clusters = cluster_words(words, srv["name"], 10)
        for i, cluster in enumerate(clusters):
            print(i, [words[idx] for idx in cluster])
Example #18
def loadMetadata(path):
    try:
        return metadata.load(path)
    except IOError:
        print(u"Cannot load {}.".format(path))
        return None
Example #19
import json
import math
import numpy as np
import metadata

butterflies_by_image_id = {i.image_id: i for i in metadata.load()}

strings = []
name_ids = {}


def get_name_id(value):
    if value not in name_ids:
        name_ids[value] = len(strings)
        strings.append(value)
    return name_ids[value]


def create_json_dict(item, x, y):
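    # Build a compact JSON record for one item; string-valued taxonomy fields
    # are interned through get_name_id so the output stores integer indices
    # into `strings` instead of repeated names.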
    result = {
        'x':
        x,
        'y':
        y,
        'occId':
        item.occurence_id,
        'image':
        item.image_id,
        'properties': [
            get_name_id(p)
            for p in (item.family, item.genus, item.species, item.subspecies,

    parser.add_argument(
        "--initial-cluster-dir",
         help = """dir w/ clustered data from which to get prev. cluster
                   assignments.""")

    parser.add_argument(
        "--callgraph", 
         help = """path to callgraph .dot file. default is 'openstack-callgraph.dot'
                (in the local dir).""")

    args = parser.parse_args()

    # quit if a dir w/ measurement files hasn't been provided
    if not args.msr_dir:
        sys.stderr.write("""%s: [ERROR] please pass a dir w/ clustered data as '--msr-dir'\n""" % sys.argv[0]) 
        parser.print_help()
        sys.exit(1)

    if args.initial_cluster_dir:
        prev_cluster_metadata = metadata.load(args.initial_cluster_dir)
    else:
        prev_cluster_metadata = None

    # choose the default .dot callgraph if one hasn't been provided
    if not args.callgraph:
        callgraph_file_path = DEFAULT_CALLGRAPH_FILE_PATH
    else:
        callgraph_file_path = args.callgraph

    find_causality(args.msr_dir, callgraph_file_path, prev_cluster_metadata)

Example #21
File: flow.py Project: ernesta/Parrot
def loadMetadata(path):
	try:
		return metadata.load(path)
	except IOError:
		print(u"Cannot load {}.".format(path))
		return None
     for i, col in enumerate(df.columns):
         serie = df[col].dropna()
         if pd.algos.is_monotonic_float64(serie.values, False)[0]:
             serie = serie.diff()[1:]
         p_value = adfuller(serie, autolag='AIC')[1]
         if math.isnan(p_value): continue
         nearest = 0.05 * round(p_value/0.05)
         bins[nearest].append(serie)
     for bin, members in bins.items():
         series = [serie.name for serie in members]
         if len(members) <= 10:
             columns = series
         else:
             columns = random.sample(series, 10)

         subset = df[columns]
         name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
         print(name)
         axes = subset.plot(subplots=True)
         plt.savefig(os.path.join(path, name))
         plt.close("all")

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: %s measurement\n" % sys.argv[0])
        sys.exit(1)
    for path in sys.argv[1:]:
        services = metadata.load(path)["services"]
        for srv in services:
            draw(path, srv)

    parser.add_argument(
        "--initial-cluster-dir",
        help="""dir w/ clustered data from which to derive initial cluster
                   assignments.""")

    args = parser.parse_args()

    # quit if a dir w/ causality files hasn't been provided
    if not args.msr_dir:
        sys.stderr.write(
            """%s: [ERROR] please supply 1 measurement data dir\n""" %
            sys.argv[0])
        parser.print_help()
        sys.exit(1)

    if args.initial_cluster_dir:
        prev_metadata = metadata.load(args.initial_cluster_dir)
    else:
        prev_metadata = None

    last_cluster_size = defaultdict(int)

    start_time = datetime.utcnow()
    for n in range(2, 7):

        # to reduce clustering time, use parallelism
        pool = mp.Pool(mp.cpu_count())

        # tasks to run in parallel
        tasks = []
        for srv in metadata.load(args.msr_dir)["services"]: