def print_names_per_mac(dt_start, dt_end, mac_node):
    """Write a CSV with every hop name seen per (server, mac).

    For each (server, mac, in_path) yielded by ``utils.iter_server_mac``
    the traceroute time series is loaded and the names of all hops of
    all non-empty traceroutes are collected after being passed through
    ``get_name``. One row per time series is written to
    ``<script_dir>/prints/<str_dt>/not_filtered/names_per_mac.csv`` and
    the file is then sorted by the "server" and "node" columns.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        mac_node: mapping queried with ``.get(mac)``; the result is
            written in the "node" column.
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)
    out_path = "{}/prints/{}/not_filtered/names_per_mac.csv".format(
        script_dir, str_dt)

    with open(out_path, "w") as out_file:
        out_file.write("server,node,mac,names\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts = TimeSeries(in_path=in_path, metric="traceroute",
                            dt_start=dt_start, dt_end=dt_end)

            seen_names = set()
            for traceroute in ts.y:
                # Empty/missing samples carry no hops.
                if not traceroute:
                    continue
                ip_name = get_ip_name(traceroute)
                for hop in traceroute:
                    for hop_name in hop["names"]:
                        seen_names.add(get_name(hop_name, ip_name))

            # The sorted list is quoted because its repr contains commas.
            out_file.write("{},{},{},\"{}\"\n".format(
                server, mac_node.get(mac), mac, sorted(seen_names)))

    utils.sort_csv_file(out_path, ["server", "node"])
def aggregate_first_hop_not_zero_indegree_vertex(first_hops, g, metric, server,
                                                 dt_start, dt_end,
                                                 traceroute_type):
    """Merge the per-first-hop problem-location CSVs of one server.

    For every first hop, the directory returned by ``get_path`` is
    searched for ``problem_location_first_hop_not_zero_indegree_vertex.csv``;
    the rows of every file found are concatenated into a file with the
    same name under
    ``<script_dir>/plots/paths/<str_dt>/<metric>/<traceroute_type>/<server>/``.
    The merged file is then copied to the matching ``plots/names``
    directory.

    Args:
        first_hops: iterable of first hops; each is converted with
            ``str`` before being handed to ``get_path``.
        g: graph forwarded to ``get_path``.
        metric: metric name used in the output paths.
        server: server name used in the output paths.
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        traceroute_type: traceroute type used in the paths.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    csv_name = "problem_location_first_hop_not_zero_indegree_vertex.csv"
    columns = ["cp_dt_start", "cp_dt_end", "cp_type", "fraction_of_clients",
               "cnt_clients", "clients", "problem_location"]
    # The last two columns hold list-like values and are therefore quoted.
    row_template = "{},{},{},{},{},\"{}\",\"{}\"\n"

    out_path = "{}/plots/paths/{}/{}/{}/{}/{}".format(
        script_dir, str_dt, metric, traceroute_type, server, csv_name)
    with open(out_path, "w") as out_file:
        out_file.write(",".join(columns) + "\n")
        for first_hop in first_hops:
            _, dir_path = get_path(g, str(first_hop), str_dt, traceroute_type,
                                   server)
            in_path = "{}/{}".format(dir_path, csv_name)
            # First hops without a result file are silently skipped.
            if not os.path.exists(in_path):
                continue
            for _, row in pd.read_csv(in_path).iterrows():
                values = [row[column] for column in columns]
                out_file.write(row_template.format(*values))

    out_path_name = "{}/plots/names/{}/{}/{}/{}".format(
        script_dir, str_dt, metric, traceroute_type, server)
    shutil.copy(out_path, out_path_name)
def print_macs_per_name(dt_start, dt_end, mac_node):
    """Write a CSV mapping each hop name to the clients that saw it.

    Every traceroute time series yielded by ``utils.iter_server_mac`` is
    scanned; each hop name (after ``get_name``) is associated with the
    tuple ``(server, mac_node.get(mac), mac)``. The result is written,
    sorted by name, to
    ``<script_dir>/prints/<str_dt>/not_filtered/macs_per_name.csv``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        mac_node: mapping queried with ``.get(mac)`` for the node.
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_macs = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            if not traceroute:
                continue
            ip_name = get_ip_name(traceroute)
            for hop in traceroute:
                for raw_name in hop["names"]:
                    resolved_name = get_name(raw_name, ip_name)
                    clients = name_macs.setdefault(resolved_name, set())
                    clients.add((server, mac_node.get(mac), mac))

    out_path = "{}/prints/{}/not_filtered/macs_per_name.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as out_file:
        out_file.write("name,macs\n")
        for name in sorted(name_macs):
            out_file.write(
                "{},\"{}\"\n".format(name, sorted(name_macs[name])))
def get_graph(dt_start,
              dt_end,
              valid_traceroute_field,
              traceroute_field,
              server=None):
    """Build the adjacency sets of the traceroute graph.

    Reads ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv``
    and, for every row whose ``valid_cnt_samples`` and
    ``valid_traceroute_field`` columns are truthy, adds an edge from each
    traceroute element to the element that follows it.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        valid_traceroute_field: name of the column flagging whether the
            traceroute of the row is valid.
        traceroute_field: name of the column holding the traceroute as a
            Python literal (parsed with ``ast.literal_eval``).
        server: when truthy, only rows of this server are considered.

    Returns:
        dict mapping every name found in a traceroute to the set of names
        that directly follow it (empty set for names with no successor).
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)

    df = pd.read_csv(in_path)
    if server:
        df = df[df["server"] == server]

    name_neigh = {}
    for _, row in df.iterrows():
        if not (row["valid_cnt_samples"] and row[valid_traceroute_field]):
            continue
        last_name = None
        for name in ast.literal_eval(row[traceroute_field]):
            # Every visited name gets an entry here, so no later pass is
            # needed to add names that have no outgoing edge.
            name_neigh.setdefault(name, set())
            if last_name:
                name_neigh[last_name].add(name)
            last_name = name

    return name_neigh
def aggregate_servers_correlations(dt_start, dt_end, metric, servers):
    """Concatenate the per-server zero-indegree correlation CSVs.

    For each server, the first traceroute type (in the order given by
    ``unsupervised_utils.iter_traceroute_types``) for which
    ``valid_graph`` is truthy is used: its
    ``problem_location_zero_indegree_vertexes_correlation.csv`` under
    ``plots/names`` is read and its rows are written, prefixed with the
    server, to the file of the same name under
    ``<script_dir>/prints/<str_dt>/filtered/<metric>/``. The output is
    finally sorted by descending ``cnt_vertexes_with_zero_indegree`` and
    ascending ``server``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        metric: metric name used in the paths.
        servers: iterable of server names.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    csv_name = "problem_location_zero_indegree_vertexes_correlation.csv"
    row_columns = ["traceroute_type", "cp_dt_start", "cp_dt_end", "cp_type",
                   "cnt_vertexes_with_zero_indegree", "suffix_match",
                   "vertexes_with_zero_indegree"]
    row_template = "{},{},{},{},{},{},\"{}\",\"{}\"\n"

    out_path = "{}/prints/{}/filtered/{}/{}".format(script_dir, str_dt,
                                                    metric, csv_name)
    with open(out_path, "w") as out_file:
        out_file.write("server," + ",".join(row_columns) + "\n")
        for server in servers:
            for traceroute_type in unsupervised_utils.iter_traceroute_types():
                if not valid_graph(dt_start, dt_end, server,
                                   traceroute_type):
                    continue
                in_path = "{}/plots/names/{}/{}/{}/{}/{}".format(
                    script_dir, str_dt, metric, traceroute_type, server,
                    csv_name)
                for _, row in pd.read_csv(in_path).iterrows():
                    values = [row[column] for column in row_columns]
                    out_file.write(row_template.format(server, *values))
                # Only the first valid traceroute type is aggregated.
                break

    utils.sort_csv_file(out_path,
                        ["cnt_vertexes_with_zero_indegree", "server"],
                        ascending=[False, True])
def print_name_ips(dt_start, dt_end):
    """Write a CSV mapping each hop name to the IPs observed for it.

    Every traceroute time series yielded by ``utils.iter_server_mac`` is
    scanned; the ``names`` and ``ips`` of each hop are paired positionally
    and accumulated. The result is written, sorted by name, to
    ``<script_dir>/prints/<str_dt>/not_filtered/name_ips.csv``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_ip = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path,
                        metric="traceroute",
                        dt_start=dt_start,
                        dt_end=dt_end)
        for traceroute in ts.y:
            if not traceroute:
                continue
            for hop in traceroute:
                for name, ip in izip(hop["names"], hop["ips"]):
                    name_ip.setdefault(name, set()).add(ip)

    out_path = "{}/prints/{}/not_filtered/name_ips.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("name,ips\n")
        for name in sorted(name_ip):
            # The list repr contains commas, so it must be quoted to stay
            # a single CSV field (same convention as the other writers
            # in this file).
            f.write("{},\"{}\"\n".format(name, sorted(name_ip[name])))
def aggregate_servers_first_hop_not_zero_indegree_vertex(dt_start, dt_end,
                                                         metric, servers):
    """Concatenate the per-server first-hop problem-location CSVs.

    For each server, the first traceroute type (in the order given by
    ``unsupervised_utils.iter_traceroute_types``) for which
    ``valid_graph`` is truthy is used: its
    ``problem_location_first_hop_not_zero_indegree_vertex.csv`` under
    ``plots/paths`` is read and its rows are written, prefixed with the
    server, to the file of the same name under
    ``<script_dir>/prints/<str_dt>/filtered/<metric>/``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        metric: metric name used in the paths.
        servers: iterable of server names.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    csv_name = "problem_location_first_hop_not_zero_indegree_vertex.csv"
    row_columns = ["cp_dt_start", "cp_dt_end", "cp_type",
                   "fraction_of_clients", "cnt_clients", "clients",
                   "problem_location"]
    row_template = "{},{},{},{},{},{},\"{}\",\"{}\"\n"

    out_path = "{}/prints/{}/filtered/{}/{}".format(script_dir, str_dt,
                                                    metric, csv_name)
    with open(out_path, "w") as out_file:
        out_file.write("server," + ",".join(row_columns) + "\n")

        for server in servers:
            for traceroute_type in unsupervised_utils.iter_traceroute_types():
                if not valid_graph(dt_start, dt_end, server,
                                   traceroute_type):
                    continue
                in_path = "{}/plots/paths/{}/{}/{}/{}/{}".format(
                    script_dir, str_dt, metric, traceroute_type, server,
                    csv_name)
                for _, row in pd.read_csv(in_path).iterrows():
                    values = [row[column] for column in row_columns]
                    out_file.write(row_template.format(server, *values))
                # Only the first valid traceroute type is aggregated.
                break
def process_graphs(dt_start, dt_end):
    """Build, write and check the traceroute graph of every server.

    The servers are the unique values of the "server" column of
    ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv``. For
    each traceroute type and server the graph returned by ``get_graph``
    is written to ``.../filtered/graph/<server>/<traceroute_field>_graph.gv``
    through ``write_graph`` and then handed to ``check_graph``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    graph_root = "{}/prints/{}/filtered/graph/".format(script_dir, str_dt)
    utils.create_dirs([graph_root])

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    servers = np.unique(pd.read_csv(in_path)["server"].values)

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        fields = cp_utils.get_traceroute_fields(traceroute_type)
        valid_traceroute_field, traceroute_field = fields

        for server in servers:
            server_dir = "{}/prints/{}/filtered/graph/{}".format(
                script_dir, str_dt, server)
            utils.create_dirs([graph_root, server_dir])

            out_path = "{}/{}_graph.gv".format(server_dir, traceroute_field)
            name_neigh = get_graph(dt_start, dt_end, valid_traceroute_field,
                                   traceroute_field, server)
            write_graph(out_path, name_neigh)
            check_graph(server_dir, name_neigh, traceroute_field)
def print_macs_per_name_filtered(dt_start, dt_end, mac_node):
    """Write a CSV mapping each filtered hop pair to its clients.

    Reads the ``traceroute_filtered`` column of
    ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv``. Each
    element of a filtered traceroute contributes the key
    ``(elem[0][0], elem[1][0])``, which is associated with the tuple
    ``(server, mac_node.get(mac), mac)`` of the row. The mapping is
    written, sorted by key, to ``.../filtered/macs_per_name.csv``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        mac_node: mapping queried with ``.get(mac)`` for the node.
    """
    # TODO: Probably deprecated

    str_dt = utils.get_str_dt(dt_start, dt_end)
    filtered_dir = "{}/prints/{}/filtered".format(script_dir, str_dt)

    name_macs = {}
    df = pd.read_csv("{}/traceroute_per_mac.csv".format(filtered_dir))
    for _, row in df.iterrows():
        for elem in ast.literal_eval(row["traceroute_filtered"]):
            pair = (elem[0][0], elem[1][0])
            clients = name_macs.setdefault(pair, set())
            clients.add(
                (row["server"], mac_node.get(row["mac"]), row["mac"]))

    with open("{}/macs_per_name.csv".format(filtered_dir), "w") as out_file:
        out_file.write("name,macs\n")
        for pair in sorted(name_macs):
            out_file.write(
                "\"{}\",\"{}\"\n".format(pair, sorted(name_macs[pair])))
def valid_graph(dt_start, dt_end, server, traceroute_type):
    """Tell whether the stats file flags the filtered graph as valid.

    Scans
    ``<script_dir>/prints/<str_dt>/filtered/graph/<server>/<traceroute_type>_filter_graph_stats.txt``
    for the first line containing ``valid_graph=``.

    Returns:
        True when the text after the last "=" of that line is exactly
        "True", False when it is anything else, and None when no line
        contains the marker.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)
    stats_path = "{}/prints/{}/filtered/graph/{}/{}_filter_graph_stats.txt"
    in_path = stats_path.format(script_dir, str_dt, server, traceroute_type)

    with open(in_path) as stats_file:
        for line in stats_file:
            if "valid_graph=" not in line:
                continue
            value = line.split("=")[-1].rstrip("\n")
            return value == "True"
    return None
def write_csvs(dt_dir, dt_start, dt_end, cursor, collection):
    """Append one CSV row per valid document to its per-client file.

    Each document accepted by ``valid_doc`` is written to
    ``<script_dir>/<dt_dir>/<server>/<mac>.csv``, where ``server`` is
    ``doc["host"]`` and ``mac`` is ``doc["_id"]["mac"]``. The header line
    is written the first time a file is created; afterwards rows are
    appended. A progress line is printed for every document.

    Args:
        dt_dir: directory (relative to ``script_dir``) for the period.
        dt_start: start of the period (only used in the progress line).
        dt_end: end of the period (only used in the progress line).
        cursor: iterable of documents.
        collection: not used by this function.
    """
    header = ("dt,uf,server_ip,loss,latency,throughput_up,"
              "throughput_down,nominal_up,nominal_down,"
              "loss_cross_traffic_up,loss_cross_traffic_down,"
              "latency_cross_traffic_up,latency_cross_traffic_down,"
              "throughput_up_cross_traffic_up,"
              "throughput_up_cross_traffic_down,"
              "throughput_down_cross_traffic_up,"
              "throughput_down_cross_traffic_down,"
              "traceroute\n")
    # 17 plain fields followed by the quoted traceroute field.
    row_template = "{}" + ",{}" * 16 + ",\"{}\"\n"

    for cnt, doc in enumerate(cursor):
        # A parenthesised single-argument print gives the same output as
        # the Python 2 statement form and is also valid Python 3.
        print("{}, {}".format(cnt, utils.get_str_dt(dt_start, dt_end)))
        if not valid_doc(doc):
            continue

        mac = doc["_id"]["mac"]
        server = doc["host"]

        utils.create_dirs([
            "{}/{}".format(script_dir, dt_dir),
            "{}/{}/{}/".format(script_dir, dt_dir, server)
        ])
        out_path = "{}/{}/{}/{}.csv".format(script_dir, dt_dir, server, mac)

        uf = get_uf(doc)
        dt = dt_procedures.from_utc_to_sp(doc["_id"]["date"])
        server_ip = get_server_ip(doc)

        (loss, loss_cross_traffic_up, loss_cross_traffic_down) = \
            get_loss(doc)

        (latency, latency_cross_traffic_up,
         latency_cross_traffic_down) = get_latency(doc)

        (throughput_up, nominal_up, throughput_up_cross_traffic_up,
         throughput_up_cross_traffic_down) = get_throughput_up(doc)

        (throughput_down, nominal_down, throughput_down_cross_traffic_up,
         throughput_down_cross_traffic_down) = get_throughput_down(doc)

        traceroute = get_traceroute(doc)

        row = row_template.format(
            dt, uf, server_ip, loss, latency, throughput_up,
            throughput_down, nominal_up, nominal_down,
            loss_cross_traffic_up, loss_cross_traffic_down,
            latency_cross_traffic_up, latency_cross_traffic_down,
            throughput_up_cross_traffic_up,
            throughput_up_cross_traffic_down,
            throughput_down_cross_traffic_up,
            throughput_down_cross_traffic_down, traceroute)

        # Open the file once: emit the header only when the file is new,
        # then append the row.
        is_new_file = not os.path.exists(out_path)
        with open(out_path, "a") as f:
            if is_new_file:
                f.write(header)
            f.write(row)
# Example #12
# 0
def voting(dt_start, dt_end, metric, in_dir, eps_hours):
    """
    Match change points across clients and write match_cps.csv files.

    Every directory below <script_dir>/plots/<in_dir>/<str_dt>/<metric>
    that contains a cps_per_mac.csv gets a match_cps.csv. For each change
    point type, the change points of that type of all clients are sorted
    by datetime and handed to unsupervised_utils.multiple_inexact_voting;
    one row is written per returned vote.

    By now I assume that the cps from a single time series are more than
    eps_hours apart
    """

    str_dt = utils.get_str_dt(dt_start, dt_end)

    root_dir = "{}/plots/{}/{}/{}".format(script_dir, in_dir, str_dt, metric)
    for dir_path, _, file_names in os.walk(root_dir):
        if "cps_per_mac.csv" not in file_names:
            continue

        with open("{}/match_cps.csv".format(dir_path), "w") as out_file:
            out_file.write("cp_dt_start,cp_dt_end,cp_type,"
                           "fraction_of_clients,cnt_clients,clients\n")

            df = pd.read_csv("{}/cps_per_mac.csv".format(dir_path))
            cnt_clients = df.shape[0]

            for cp_type in cp_utils.iter_cp_types():
                events = []
                for _, row in df.iterrows():
                    row_cp_dts = [
                        dt_procedures.from_strdt_to_dt(str_cp_dt)
                        for str_cp_dt in ast.literal_eval(row["cp_dts"])
                    ]
                    row_cp_types = ast.literal_eval(row["type_cps"])
                    # Keep only the change points of the current type.
                    for cp_dt, row_cp_type in izip(row_cp_dts,
                                                   row_cp_types):
                        if row_cp_type == cp_type:
                            events.append({
                                "dt": cp_dt,
                                "mac": row["mac"],
                                "server": row["server"]
                            })
                events.sort(key=itemgetter("dt"))

                votes = unsupervised_utils.multiple_inexact_voting(
                    events, eps_hours)
                for vote in votes:
                    clients = [
                        {
                            "mac": event["mac"],
                            "server": event["server"]
                        } for event in vote["interval"]
                    ]
                    fraction_of_clients = float(len(clients)) / cnt_clients

                    out_file.write("{},{},{},{},{},\"{}\"\n".format(
                        vote["l_dt"], vote["r_dt"], cp_type,
                        fraction_of_clients, len(clients), clients))
def plot_latencies_traceroute(dt_start, dt_end, preprocess_args):
    """Plot the preprocessed latency series of every hop of every client.

    For each row of
    ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv`` with a
    truthy ``valid_cnt_samples`` and for each traceroute type whose
    validity column is truthy, one PNG per hop (all hops but the last
    element of the traceroute) is written below
    ``<path dir>/traceroute_latencies/<mac>/``, where the path dir is
    built from the reversed traceroute.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        preprocess_args: forwarded to ``cp_utils.preprocess``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    csv_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    for _, row in pd.read_csv(csv_path).iterrows():
        if not row["valid_cnt_samples"]:
            continue

        in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                    dt_end)
        ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                   dt_start=dt_start, dt_end=dt_end)

        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            valid_traceroute_field, traceroute_field = \
                cp_utils.get_traceroute_fields(traceroute_type)
            if not row[valid_traceroute_field]:
                continue

            traceroute = ast.literal_eval(row[traceroute_field])
            name_ts = get_ts_per_name(traceroute_type, ts_traceroute,
                                      dt_start, dt_end)

            # One directory level per hop, in reversed traceroute order.
            hops_path = "/".join(str(hop) for hop in reversed(traceroute))
            dir_path = "{}/plots/paths/{}/{}/{}/{}/{}".format(
                script_dir, str_dt, "latency", traceroute_type,
                row["server"], hops_path)

            mac_dir = "{}/traceroute_latencies/{}".format(dir_path,
                                                          row["mac"])
            utils.create_dirs(
                ["{}/traceroute_latencies/".format(dir_path), mac_dir])

            # The last element of the traceroute is not plotted.
            for i, hop in enumerate(traceroute[:-1]):
                name = hop[0][0]
                out_path = "{}/hop{}_{}.png".format(mac_dir,
                                                    str(i).zfill(2), name)

                ts_preprocessed = name_ts[name].copy()
                cp_utils.preprocess(ts_preprocessed, preprocess_args)
                ts_preprocessed.metric = "latency"
                plot_procedures.plot_ts(ts_preprocessed, out_path,
                                        title="median filtered")
def analyse_first_hop(g, u, is_zero_indegree, metric, server, dt_start, dt_end,
                      traceroute_type, eps_hours, min_fraction_of_clients):
    """Write the problem-location CSV for the path that starts at ``u``.

    Every row of ``match_cps.csv`` in the directory returned by
    ``get_path`` is copied to the output with an extra
    ``problem_location`` column:

    * rows whose ``fraction_of_clients`` is not at least
      ``min_fraction_of_clients`` get ``["before"]``;
    * otherwise, for a zero-indegree vertex the locations returned by
      ``analyse_path`` (each parsed with ``ast.literal_eval``) are used,
      and for any other vertex a fixed marker string is written.

    The output is ``problem_location.csv`` for zero-indegree vertexes and
    ``problem_location_first_hop_not_zero_indegree_vertex.csv`` otherwise;
    it is also copied under ``plots/names/.../<server>/<path[0]>``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    path, dir_path = get_path(g, u, str_dt, traceroute_type, server)

    out_name = ("problem_location.csv" if is_zero_indegree else
                "problem_location_first_hop_not_zero_indegree_vertex.csv")
    out_path = "{}/{}".format(dir_path, out_name)
    row_template = "{},{},{},{},{},\"{}\",\"{}\"\n"

    with open(out_path, "w") as out_file:
        out_file.write("cp_dt_start,cp_dt_end,cp_type,fraction_of_clients,"
                       "cnt_clients,clients,problem_location\n")

        df = pd.read_csv("{}/match_cps.csv".format(dir_path))
        for _, row in df.iterrows():
            if row["fraction_of_clients"] >= min_fraction_of_clients:
                cp_dt_start = dt_procedures.from_strdt_to_dt(
                    row["cp_dt_start"])
                cp_dt_end = dt_procedures.from_strdt_to_dt(row["cp_dt_end"])

                if is_zero_indegree:
                    locations = analyse_path(path, cp_dt_start, cp_dt_end,
                                             row["cp_type"], str_dt, metric,
                                             traceroute_type, server,
                                             eps_hours,
                                             min_fraction_of_clients)
                    problem_location = [ast.literal_eval(location)
                                        for location in locations]
                else:
                    problem_location = ("already_analysed_during_zero_"
                                        "indegree_vertexes_analysis")
            else:
                problem_location = ["before"]

            out_file.write(row_template.format(
                row["cp_dt_start"], row["cp_dt_end"], row["cp_type"],
                row["fraction_of_clients"], row["cnt_clients"],
                row["clients"], problem_location))

    out_path_name = "{}/plots/names/{}/{}/{}/{}/{}".format(
        script_dir, str_dt, metric, traceroute_type, server, path[0])
    shutil.copy(out_path, out_path_name)
def read_graph(dt_start, dt_end, server, traceroute_type):
    """Load the filtered graph of a server as an adjacency mapping.

    Reads
    ``<script_dir>/prints/<str_dt>/filtered/graph/<server>/<traceroute_type>_filter_graph.gv``,
    ignoring its first and last lines. Every remaining line is expected
    to contain ``u -> v``; surrounding double quotes are removed from
    both ends of each vertex.

    Returns:
        defaultdict(list) mapping each source vertex to the list of its
        destination vertexes, in file order.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)
    in_path = "{}/prints/{}/filtered/graph/{}/{}_filter_graph.gv".format(
        script_dir, str_dt, server, traceroute_type)

    g = defaultdict(list)
    with open(in_path) as gv_file:
        edge_lines = gv_file.readlines()[1:-1]

    for line in edge_lines:
        endpoints = line.split(" -> ")
        u = endpoints[0].lstrip(" ").strip("\"")
        v = endpoints[1].rstrip("\n").strip("\"")
        g[u].append(v)
    return g
def print_cps(dt_start, dt_end, dir_model, metric, preprocess_args):
    """Write the change points of every client to ``cps_per_mac.csv``.

    Every file whose name contains ".csv" in
    ``<base_dir>/change_point/models/<dir_model>/plots/unsupervised/<str_dt>/<metric>``
    is read; its ``dt`` and ``dt_id`` columns give the change-point
    datetimes and indexes of one client. The server and mac are parsed
    from the file name. The preprocessed time series of the client is
    split at the change-point indexes, the mean of every segment is
    computed and ``update_type_cps`` is called for each pair of
    consecutive segment means.

    One row per client is written to
    ``<script_dir>/prints/<str_dt>/filtered/<metric>/cps_per_mac.csv``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        dir_model: model directory name used in the input path.
        metric: metric name used in the paths and for the time series.
        preprocess_args: forwarded to ``cp_utils.preprocess``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/prints/".format(script_dir),
                       "{}/prints/{}".format(script_dir, str_dt),
                       "{}/prints/{}/filtered".format(script_dir, str_dt),
                       "{}/prints/{}/filtered/{}".format(script_dir, str_dt,
                                                         metric)])

    out_path = "{}/prints/{}/filtered/{}/cps_per_mac.csv".format(script_dir,
                                                                 str_dt,
                                                                 metric)
    with open(out_path, "w") as f:
        f.write("server,mac,cp_dts,type_cps,seg_means\n")
        in_path_dir = ("{}/change_point/models/{}/plots/unsupervised/{}/{}".
                       format(base_dir, dir_model, str_dt, metric))

        cnt = 0
        for file_name in os.listdir(in_path_dir):
            if ".csv" not in file_name:
                continue
            cnt += 1
            # A parenthesised single-argument print gives the same output
            # as the Python 2 statement form and is valid Python 3.
            print("cnt={}".format(cnt))

            server = file_name.split("server")[1].split("_")[0]
            mac = file_name.split("mac")[1].split("_")[0]

            dt_cps = []
            id_cps = []
            df = pd.read_csv("{}/{}".format(in_path_dir, file_name))
            for _, row in df.iterrows():
                dt_cps.append(row["dt"])
                id_cps.append(row["dt_id"])

            in_path = utils.get_in_path(server, mac, dt_start, dt_end)
            ts = TimeSeries(in_path, metric, dt_start, dt_end)
            cp_utils.preprocess(ts, preprocess_args)

            seg_means = []
            type_cps = []
            if id_cps:
                mean1 = np.mean(ts.y[0:id_cps[0]])
                seg_means.append(mean1)
                for i in range(1, len(id_cps)):
                    mean2 = np.mean(ts.y[id_cps[i - 1]:id_cps[i]])
                    seg_means.append(mean2)
                    update_type_cps(type_cps, mean1, mean2, metric)
                    mean1 = mean2
                # The last segment runs to the end of the series; slicing
                # with ":-1" would drop the final sample (and yield an
                # empty segment when the last change point is the last
                # index).
                mean2 = np.mean(ts.y[id_cps[-1]:])
                seg_means.append(mean2)
                update_type_cps(type_cps, mean1, mean2, metric)

            f.write("{},{},\"{}\",\"{}\",\"{}\"\n".format(server, mac,
                                                          dt_cps, type_cps,
                                                          seg_means))
def print_all(dt_start, dt_end, mac_node):
    """Create the prints directories and write the traceroute CSVs.

    Creates ``<script_dir>/prints/<str_dt>/{filtered,not_filtered}`` and
    then runs ``print_traceroute_per_mac`` followed by
    ``print_traceroute_per_mac_filtered``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        mac_node: only needed by the disabled calls below.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    prints_dir = "{}/prints".format(script_dir)
    period_dir = "{}/{}".format(prints_dir, str_dt)
    utils.create_dirs([
        prints_dir,
        period_dir,
        "{}/filtered".format(period_dir),
        "{}/not_filtered".format(period_dir)
    ])

    # Disabled reports:
    # print_macs_per_name(dt_start, dt_end, mac_node)
    # print_names_per_mac(dt_start, dt_end, mac_node)
    # print_name_ips(dt_start, dt_end)
    print_traceroute_per_mac(dt_start, dt_end)

    print_traceroute_per_mac_filtered(dt_start, dt_end)
# Example #18
# 0
def get_client_traceroute(dt_start, dt_end, traceroute_type):
    """Map every client with a valid traceroute to that traceroute.

    Reads
    ``<base_dir>/change_point/unsupervised/prints/<str_dt>/filtered/traceroute_per_mac.csv``
    and keeps the rows whose ``valid_cnt_samples`` column and validity
    column for ``traceroute_type`` are both truthy.

    Returns:
        dict keyed by ``utils.get_client(server, mac)`` whose values are
        the traceroutes parsed with ``ast.literal_eval``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    fields = get_traceroute_fields(traceroute_type)
    valid_traceroute_field, traceroute_field = fields

    csv_template = ("{}/change_point/unsupervised/prints/{}/filtered/"
                    "traceroute_per_mac.csv")
    df = pd.read_csv(csv_template.format(base_dir, str_dt))

    client_traceroute = {}
    for _, row in df.iterrows():
        if not (row["valid_cnt_samples"] and row[valid_traceroute_field]):
            continue
        client = utils.get_client(row["server"], row["mac"])
        client_traceroute[client] = ast.literal_eval(row[traceroute_field])
    return client_traceroute
def get_first_hops(dt_start, dt_end, server, traceroute_type):
    """Collect the first element of every valid traceroute of a server.

    Reads ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv``,
    keeps the rows of ``server`` whose ``valid_cnt_samples`` column and
    validity column for ``traceroute_type`` are both truthy, and takes
    element 0 of each parsed traceroute.

    Returns:
        set with the distinct first elements.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    fields = cp_utils.get_traceroute_fields(traceroute_type)
    valid_traceroute_field, traceroute_field = fields

    csv_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    df = pd.read_csv(csv_path)
    server_rows = df[df["server"] == server]

    return {
        ast.literal_eval(row[traceroute_field])[0]
        for _, row in server_rows.iterrows()
        if row["valid_cnt_samples"] and row[valid_traceroute_field]
    }
def print_traceroute_per_mac(dt_start, dt_end):
    """Write the four traceroute variants of every client to a CSV.

    For each (server, mac, in_path) yielded by ``utils.iter_server_mac``
    the traceroute time series is loaded and ``get_traceroute`` is called
    with four flag combinations; each call yields a (valid, traceroute)
    pair that fills two consecutive columns. Rows go to
    ``<script_dir>/prints/<str_dt>/not_filtered/traceroute_per_mac.csv``,
    which is finally sorted by server and mac.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    # Positional flags passed to get_traceroute, in output-column order.
    variant_flags = [
        (True, True, True),     # *_compress_embratel
        (True, True, False),    # *_compress_embratel_without_last_hop_...
        (False, False, False),  # *_without_embratel
        (True, False, False),   # valid_traceroute / traceroute
    ]
    row_template = "{},{}" + ",{},\"{}\"" * 4 + "\n"

    out_path = "{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as out_file:
        out_file.write(
            "server,mac,"
            "valid_traceroute_compress_embratel,"
            "traceroute_compress_embratel,"
            "valid_traceroute_compress_embratel_without_last_hop_embratel,"
            "traceroute_compress_embratel_without_last_hop_embratel,"
            "valid_traceroute_without_embratel,"
            "traceroute_without_embratel,"
            "valid_traceroute,"
            "traceroute\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                       dt_start=dt_start, dt_end=dt_end)

            fields = [server, mac]
            for flags in variant_flags:
                valid, traceroute = get_traceroute(ts_traceroute, *flags)
                fields.extend([valid, traceroute])
            out_file.write(row_template.format(*fields))

    utils.sort_csv_file(out_path, ["server", "mac"])
def print_per_path(dt_start, dt_end, metric, file_name):
    """Replicate each row of a per-client CSV into its path directories.

    The input is ``<script_dir>/prints/<str_dt>/filtered/<metric>/<file_name>``.
    For every traceroute type and every row whose client has a traceroute
    in ``cp_utils.get_client_traceroute``, the traceroute is walked in
    reverse; each element (except those whose ``name[0][0]`` starts with
    "192.") adds one directory level below
    ``<base_dir>/change_point/unsupervised//plots/paths/<str_dt>/<metric>/<traceroute_type>/<server>``
    and the row is appended to ``<file_name>`` inside that directory. The
    file is (re)created with the input header the first time a directory
    is reached for the current traceroute type.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        metric: metric name used in the paths.
        file_name: name of the CSV to read and to write per directory.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_dir = "{}/change_point/unsupervised/".format(base_dir)
    utils.create_dirs([
        "{}/plots".format(out_dir),
        "{}/plots/paths".format(out_dir),
        "{}/plots/paths/{}".format(out_dir, str_dt),
        "{}/plots/paths/{}/{}".format(out_dir, str_dt, metric)
    ])

    in_path = "{}/prints/{}/filtered/{}/{}".format(script_dir, str_dt, metric,
                                                   file_name)

    for traceroute_type in iter_traceroute_types():
        # The returned field names are not used below; the call is kept.
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        client_traceroute = cp_utils.get_client_traceroute(
            dt_start, dt_end, traceroute_type)

        # Directories whose CSV already got its header for this type.
        initialised_dirs = set()

        df = pd.read_csv(in_path)
        for _, row in df.iterrows():
            client = utils.get_client(row["server"], row["mac"])
            if client not in client_traceroute:
                continue

            dir_path = "{}/plots/paths/{}/{}/{}/{}".format(
                out_dir, str_dt, metric, traceroute_type, row["server"])
            utils.create_dirs([dir_path])

            for name in reversed(client_traceroute[client]):
                if name[0][0].split(".")[0] == "192":
                    continue

                dir_path = "{}/{}".format(dir_path, name)
                utils.create_dirs([dir_path])

                out_path = "{}/{}".format(dir_path, file_name)
                if dir_path not in initialised_dirs:
                    create_csv_with_same_header(out_path, df)
                    initialised_dirs.add(dir_path)
                single_row = pd.DataFrame(row).T
                single_row.to_csv(out_path, mode="a", header=False,
                                  index=False)
def plot(dt_start, dt_end, metric):
    """
    Plot the median filtered time series of every (server, mac) pair.

    One png per client is written to {script_dir}/{str_dt}/{metric}/.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
        metric: metric to be read from each client input file.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_dirs = [
        "{}/{}".format(script_dir, str_dt),
        "{}/{}/{}".format(script_dir, str_dt, metric),
    ]
    utils.create_dirs(out_dirs)

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        png_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        out_path = "{}/{}/{}/{}.png".format(script_dir, str_dt, metric,
                                            png_name)

        # the raw series is only used to obtain the default ylabel; the
        # plotted series is a second copy smoothed by a median filter
        raw_ts = TimeSeries(in_path, metric, dt_start, dt_end)
        filtered_ts = TimeSeries(in_path, metric, dt_start, dt_end)
        filtered_ts.percentile_filter(win_len=5, p=0.5)

        # NOTE: earlier experiments (stl decomposition of the filtered
        # series, raw vs filtered shared-x plot and a cross traffic
        # comparison) used to live here as commented-out code

        plot_procedures.plot_ts(
            filtered_ts,
            out_path,
            ylabel=plot_procedures.get_default_ylabel(raw_ts),
            compress=False,
            title="median filtered")
示例#23
0
def plot_clients_per_zero_indegree_vertex_distribution(dt_start, dt_end):
    """
    Plot the distribution of the number of clients per zero indegree vertex.

    For every server found in {script_dir}/prints/{str_dt}/filtered/graph,
    the first traceroute type with a valid graph is used: for each vertex of
    that graph with indegree 0, the number of rows of its latency
    cps_per_mac.csv is collected. The total is printed and a histogram is
    saved to
    {script_dir}/plots/cnt_clients_zero_indegree_vertex_distribution.png.

    If no zero indegree vertex is found, the total (0) is printed and no
    plot is produced.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
    """
    cnt_clients_zero_indegree_vertex = []

    str_dt = utils.get_str_dt(dt_start, dt_end)
    for server in os.listdir("{}/prints/{}/filtered/graph".format(
            script_dir, str_dt)):
        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            if not spatial_time_correlation.valid_graph(
                    dt_start, dt_end, server, traceroute_type):
                continue

            g = spatial_time_correlation.read_graph(
                dt_start, dt_end, server, traceroute_type)
            u_indegree = spatial_time_correlation.get_indegree(g)

            for u in g:
                if u_indegree[u] == 0:
                    in_path = ("{}/plots/names/{}/latency/{}/{}/{}/"
                               "cps_per_mac.csv".format(
                                   script_dir, str_dt, traceroute_type,
                                   server, u))
                    df = pd.read_csv(in_path)
                    cnt_clients_zero_indegree_vertex.append(df.shape[0])
            # only the first valid traceroute type of each server is used
            break

    # function-call form: same output on python 2, valid syntax on python 3
    print(sum(cnt_clients_zero_indegree_vertex))

    if not cnt_clients_zero_indegree_vertex:
        # max() of an empty list raises ValueError: there is nothing to plot
        return

    out_path = ("{}/plots/cnt_clients_zero_indegree_vertex_distribution.png".
                format(script_dir))
    plt.clf()
    matplotlib.rcParams.update({"font.size": 27})
    plt.gcf().set_size_inches(16, 11)
    # one unit-width bin per possible count, starting at 1
    bins = range(1, max(cnt_clients_zero_indegree_vertex) + 2)
    weights = (np.asarray([1.0] * len(cnt_clients_zero_indegree_vertex)) /
               len(cnt_clients_zero_indegree_vertex))
    plt.ylabel("frequency")
    plt.xlabel("number of clients in a zero indegree user-group")
    plt.xticks(bins[:-1], rotation=45)
    # NOTE(review): `normed` was removed in matplotlib 3.1 (replaced by
    # `density`); kept as is to remain compatible with the matplotlib
    # versions this python 2 era code targets
    plt.hist(cnt_clients_zero_indegree_vertex,
             bins=bins,
             normed=True,
             weights=weights)
    plt.savefig(out_path)
def localize_events(dt_start, dt_end, metric, eps_hours,
                    min_fraction_of_clients):
    """
    Localize events for every server present in the filtered
    traceroute_per_mac.csv of the interval.

    For each server, the first traceroute type with a valid graph is used:
      1. every zero indegree vertex of the graph is analysed;
      2. the zero indegree vertexes are correlated;
      3. every first hop that is not a zero indegree vertex is analysed and
         the results are aggregated.
    At the end, the per server results are aggregated.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
        metric: metric name forwarded to the analysis functions.
        eps_hours: forwarded to analyse_first_hop and
            correlate_zero_indegree_vertexes.
        min_fraction_of_clients: forwarded to analyse_first_hop.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(script_dir,
                                                                    str_dt)
    servers = np.unique(pd.read_csv(in_path)["server"].values)
    for server in servers:
        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            if not valid_graph(dt_start, dt_end, server, traceroute_type):
                continue

            g = read_graph(dt_start, dt_end, server, traceroute_type)
            u_indegree = get_indegree(g)

            for u in g:
                if u_indegree[u] == 0:
                    analyse_first_hop(g, u, True, metric, server, dt_start,
                                      dt_end, traceroute_type, eps_hours,
                                      min_fraction_of_clients)

            correlate_zero_indegree_vertexes(g, u_indegree, server,
                                             dt_start, dt_end, metric,
                                             traceroute_type, eps_hours)

            first_hops = get_first_hops(dt_start, dt_end, server,
                                        traceroute_type)
            for first_hop in first_hops:
                if u_indegree[first_hop] != 0:
                    # analyse the first hop itself; the previous code passed
                    # `u`, the leftover variable of the loop over g above, so
                    # the same (last) vertex was analysed for every first hop
                    analyse_first_hop(g, first_hop, False, metric, server,
                                      dt_start, dt_end, traceroute_type,
                                      eps_hours, min_fraction_of_clients)
            aggregate_first_hop_not_zero_indegree_vertex(first_hops, g,
                                                         metric, server,
                                                         dt_start, dt_end,
                                                         traceroute_type)

            # only the first valid traceroute type of each server is used
            break

    aggregate_servers_correlations(dt_start, dt_end, metric, servers)
    aggregate_servers_first_hop_not_zero_indegree_vertex(dt_start, dt_end,
                                                         metric, servers)
示例#25
0
def plot_per_node(dt_start, dt_end, metric, only_unique_traceroute):
    """
    Plot, grouped by node, the raw and the median filtered time series of
    every client whose node is valid.

    Plots are written to
    {script_dir}/plots/nodes/{str_dt}/{metric}/{node}/.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
        metric: metric to be read from each client input file.
        only_unique_traceroute: if true, clients whose mac is not returned by
            read_input.get_macs_traceroute_filter are skipped.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    base_dirs = [
        "{}/plots/".format(script_dir),
        "{}/plots/nodes".format(script_dir),
        "{}/plots/nodes/{}".format(script_dir, str_dt),
        "{}/plots/nodes/{}/{}".format(script_dir, str_dt, metric),
    ]
    utils.create_dirs(base_dirs)

    valid_nodes = read_input.get_valid_nodes()
    mac_node = read_input.get_mac_node()

    macs_unique_traceroute = read_input.get_macs_traceroute_filter(
        dt_start, dt_end, "filtered")

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        if only_unique_traceroute and (mac not in macs_unique_traceroute):
            continue

        node = mac_node[mac]
        if node not in valid_nodes:
            continue

        utils.create_dirs([
            "{}/plots/nodes/{}/{}/{}".format(script_dir, str_dt, metric, node)
        ])
        png_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        out_path = ("{}/plots/nodes/{}/{}/{}/{}.png".format(
            script_dir, str_dt, metric, node, png_name))

        # raw series and a median filtered copy share the x axis
        raw_ts = TimeSeries(in_path, metric, dt_start, dt_end)
        filtered_ts = TimeSeries(in_path, metric, dt_start, dt_end)
        filtered_ts.percentile_filter(win_len=13, p=0.5)
        plot_procedures.plot_ts_share_x(raw_ts,
                                        filtered_ts,
                                        out_path,
                                        compress=False,
                                        plot_type2="scatter")
def create_dataset_unsupervised(dt_start, dt_end):
    """
    Write the unsupervised dataset csv of the interval [dt_start, dt_end).

    The whole interval must be in the same month and both datetimes must
    represent days.

    One line is written for every row of the filtered traceroute_per_mac.csv
    accepted by include_in_dataset. The "email" column is filled with str_dt
    and both change point columns are left empty.
    """

    str_dt = utils.get_str_dt(dt_start, dt_end)
    # NOTE(review): directories are created under base_dir while the dataset
    # is written under script_dir -- presumably both point to the same place,
    # confirm
    dataset_dirs = [
        "{}/change_point/input/unsupervised/".format(base_dir),
        "{}/change_point/input/unsupervised/{}".format(base_dir, str_dt),
    ]
    utils.create_dirs(dataset_dirs)

    header = ("email,mac,server,dt_start,dt_end,change_points,"
              "change_points_ids\n")
    line_fmt = "{},{},{},{},{},\"\",\"\"\n"

    out_path = "{}/unsupervised/{}/dataset.csv".format(script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write(header)

        in_path = ("{}/change_point/unsupervised/prints/{}/filtered/"
                   "traceroute_per_mac.csv".format(base_dir, str_dt))
        df = pd.read_csv(in_path)
        for _, row in df.iterrows():
            if not include_in_dataset(row):
                continue
            f.write(line_fmt.format(str_dt, row["mac"], row["server"],
                                    dt_start, dt_end))
def print_traceroute_per_mac_filtered(dt_start,
                                      dt_end,
                                      min_fraction_of_samples=0.7):
    """
    Write the filtered version of traceroute_per_mac.csv.

    Reads {script_dir}/prints/{str_dt}/not_filtered/traceroute_per_mac.csv
    and, for every row, writes to the "filtered" counterpart:
      - server and mac;
      - valid_cnt_samples: whether the client has at least
        `min_fraction_of_samples` of the expected number of measurements;
      - for each of the four traceroute variants, its validity flag (copied
        from the input) and the traceroute returned by
        get_traceroute_filtered.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
        min_fraction_of_samples: minimum fraction of samples a client must
            have to be flagged as valid.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    # (validity column, traceroute column) of each variant, in output order
    traceroute_fields = (
        ("valid_traceroute_compress_embratel",
         "traceroute_compress_embratel"),
        ("valid_traceroute_compress_embratel_without_last_hop_embratel",
         "traceroute_compress_embratel_without_last_hop_embratel"),
        ("valid_traceroute_without_embratel",
         "traceroute_without_embratel"),
        ("valid_traceroute",
         "traceroute"),
    )

    header = ("server,mac,valid_cnt_samples,"
              "valid_traceroute_compress_embratel,"
              "traceroute_compress_embratel_filter,"
              "valid_traceroute_compress_embratel_without_last_hop_embratel,"
              "traceroute_compress_embratel_without_last_hop_embratel_filter,"
              "valid_traceroute_without_embratel,"
              "traceroute_without_embratel_filter,"
              "valid_traceroute,"
              "traceroute_filter\n")
    line_fmt = "{},{},{}" + ",{},\"{}\"" * 4 + "\n"

    out_path = ("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        f.write(header)

        in_path = ("{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))
        df = pd.read_csv(in_path)
        for cnt, (_, row) in enumerate(df.iterrows()):
            print(
                "print_traceroute_per_mac_filtered, str_dt={}, cnt={}".format(
                    str_dt, cnt))

            filtered_traceroutes = [
                get_traceroute_filtered(row[valid_field], row[field],
                                        row["server"])
                for valid_field, field in traceroute_fields
            ]

            # check if client has the minimum number of samples. Since the
            # metric is not specified at the moment, only the presence of the
            # measurement timestamp is checked
            client_in_path = utils.get_in_path(row["server"], row["mac"],
                                               dt_start, dt_end)
            ts = TimeSeries(in_path=client_in_path,
                            metric="dt",
                            dt_start=dt_start,
                            dt_end=dt_end)
            delta_days = (dt_end - dt_start).days
            # presumably two measurements are expected per hour -- confirm
            fraction_of_samples = float(len(ts.y)) / (delta_days * 24.0 * 2.0)
            valid_cnt_samples = not (
                fraction_of_samples < min_fraction_of_samples)

            values = [row["server"], row["mac"], valid_cnt_samples]
            for (valid_field, _), filtered in zip(traceroute_fields,
                                                  filtered_traceroutes):
                values.append(row[valid_field])
                values.append(filtered)
            f.write(line_fmt.format(*values))
def run_single(dt_start, dt_end, cmp_class_args, preprocess_args, param,
               metric, run):
    """
    Run `run` sequentially on the single unsupervised dataset of the
    interval [dt_start, dt_end), i.e. "unsupervised/{str_dt}".

    All remaining arguments are forwarded untouched to
    cp_utils.run_sequential.
    """
    dataset = "unsupervised/{}".format(utils.get_str_dt(dt_start, dt_end))
    cp_utils.run_sequential([dataset], run, cmp_class_args, preprocess_args,
                            param, metric)
def correlate_zero_indegree_vertexes(g, u_indegree, server, dt_start, dt_end,
                                     metric, traceroute_type, eps_hours):
    """
    Correlate, in time, the problem locations of the zero indegree vertexes
    of `g` and write the result to
    problem_location_zero_indegree_vertexes_correlation.csv.

    For each change point type, the problem_location.csv of every zero
    indegree vertex is read:
      - rows whose problem location is exactly ["before"] are written
        immediately, as a group with a single vertex;
      - the remaining rows are sorted by the middle of their change point
        interval, grouped by unsupervised_utils.multiple_inexact_voting and
        each group returned by suffix_match is written as one line.
    The resulting csv is also copied to the "names" directory of the server.

    Args:
        g: graph; iterating it yields its vertexes.
        u_indegree: mapping vertex -> indegree.
        server, metric, traceroute_type: used to build the file paths.
        dt_start, dt_end: bounds of the analysed interval.
        eps_hours: forwarded to multiple_inexact_voting.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    line_fmt = "{},{},{},{},{},\"{}\",\"{}\"\n"

    out_path = ("{}/plots/paths/{}/{}/{}/{}/"
                "problem_location_zero_indegree_vertexes_correlation.csv".
                format(script_dir, str_dt, metric, traceroute_type, server))
    with open(out_path, "w") as f:
        f.write("cp_dt_start,cp_dt_end,cp_type,traceroute_type,"
                "cnt_vertexes_with_zero_indegree,suffix_match,"
                "vertexes_with_zero_indegree\n")

        for cp_type in cp_utils.iter_cp_types():
            # change points still to be correlated with other vertexes
            pending = []
            for u in g:
                if u_indegree[u] != 0:
                    continue

                in_path = ("{}/plots/names/{}/{}/{}/{}/{}/"
                           "problem_location.csv".format(script_dir,
                                                         str_dt,
                                                         metric,
                                                         traceroute_type,
                                                         server, u))
                df = pd.read_csv(in_path)
                for _, row in df[df["cp_type"] == cp_type].iterrows():
                    cp_dt_start = dt_procedures.from_strdt_to_dt(
                        row["cp_dt_start"])
                    cp_dt_end = dt_procedures.from_strdt_to_dt(
                        row["cp_dt_end"])
                    # middle of the change point interval
                    cp_dt = cp_dt_start + (cp_dt_end - cp_dt_start) / 2

                    problem_locations = ast.literal_eval(
                        row["problem_location"])

                    dic = {"dt": cp_dt, "name": ast.literal_eval(u),
                           "problem_locations": problem_locations,
                           "fraction_of_clients":
                           row["fraction_of_clients"],
                           "cnt_clients": row["cnt_clients"]}

                    if problem_locations != ["before"]:
                        pending.append(dic)
                        continue

                    # a ["before"] location is reported on its own
                    dic["dt"] = str(dic["dt"])
                    f.write(line_fmt.format(row["cp_dt_start"],
                                            row["cp_dt_end"],
                                            cp_type,
                                            traceroute_type,
                                            1,
                                            problem_locations,
                                            [dic]))

            pending.sort(key=itemgetter("dt"))
            events = unsupervised_utils.multiple_inexact_voting(pending,
                                                                eps_hours)
            for event in events:
                for problem_location, matched in suffix_match(event):
                    # datetimes are stringified before being written
                    for dic in matched:
                        dic["dt"] = str(dic["dt"])
                    f.write(line_fmt.format(event["l_dt"],
                                            event["r_dt"],
                                            cp_type,
                                            traceroute_type,
                                            len(matched),
                                            problem_location,
                                            matched))

    out_path_name = ("{}/plots/names/{}/{}/{}/{}".
                     format(script_dir, str_dt, metric, traceroute_type,
                            server))
    shutil.copy(out_path, out_path_name)
def plot_per_name(dt_start, dt_end, metric, preprocess_args, plot_cps=True):
    """
    Plot the time series of every valid client inside the directory of each
    name that appears in its filtered traceroute.

    For each traceroute type, the filtered traceroute_per_mac.csv is read and
    the rows with both "valid_cnt_samples" and the validity field of the
    traceroute type set are considered. Plots are written to
    {script_dir}/plots/names/{str_dt}/{metric}/{traceroute_type}/{server}/
    {name}/. A client is plotted only once: the following occurrences (other
    names and other traceroute types) are copies of the first png.

    Args:
        dt_start, dt_end: bounds of the analysed interval.
        metric: metric to be read from each client input file.
        preprocess_args: forwarded to cp_utils.preprocess.
        plot_cps: forwarded to unsupervised_utils.get_client_cps.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/plots/".format(script_dir), "{}/plots/names".format(script_dir),
        "{}/plots/names/{}".format(script_dir, str_dt),
        "{}/plots/names/{}/{}".format(script_dir, str_dt, metric)
    ])

    # change point datetimes per client, drawn as vertical lines in the plots
    client_cps = unsupervised_utils.get_client_cps(plot_cps, str_dt, metric)

    # avoid replotting: client -> path of the first png generated for it
    client_plotPath = {}

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        utils.create_dirs([
            "{}/plots/names/{}/{}/{}".format(script_dir, str_dt, metric,
                                             traceroute_type)
        ])

        df = pd.read_csv("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))
        # number of valid clients processed so far (progress output only)
        cnt = 0
        for idx, row in df.iterrows():
            if row["valid_cnt_samples"] and row[valid_traceroute_field]:
                print("cnt={}, traceroute_type={}, str_dt={}".format(
                    cnt, traceroute_type, str_dt))
                cnt += 1

                client = utils.get_client(row["server"], row["mac"])

                # the traceroute is stored in the csv as a python literal
                for name in cp_utils.iter_names_traceroute_filtered(
                        ast.literal_eval(row[traceroute_field])):

                    utils.create_dirs([
                        "{}/plots/names/{}/{}/{}/{}".format(
                            script_dir, str_dt, metric, traceroute_type,
                            row["server"]),
                        "{}/plots/names/{}/{}/{}/{}/{}".format(
                            script_dir, str_dt, metric, traceroute_type,
                            row["server"], name)
                    ])

                    out_file_name = utils.get_out_file_name(
                        row["server"], row["mac"], dt_start, dt_end)
                    out_path = ("{}/plots/names/{}/{}/{}/{}/{}/{}.png".format(
                        script_dir, str_dt, metric, traceroute_type,
                        row["server"], name, out_file_name))

                    # avoid replotting: reuse the png already generated
                    if client in client_plotPath:
                        shutil.copyfile(client_plotPath[client], out_path)
                    else:
                        client_plotPath[client] = out_path
                        # NOTE(review): assumes every valid client is a key
                        # of client_cps -- confirm against get_client_cps
                        cp_dts = client_cps[client]

                        in_path = "{}/input/{}/{}/{}.csv".format(
                            base_dir, dt_dir, row["server"], row["mac"])

                        ts = TimeSeries(in_path, metric, dt_start, dt_end)
                        cp_utils.preprocess(ts, preprocess_args)
                        # NOTE(review): the title is fixed, whatever
                        # preprocess_args contains
                        plot_procedures.plot_ts(ts,
                                                out_path,
                                                dt_axvline=cp_dts,
                                                title="median filtered")