예제 #1
0
    def sample(self, n_samples):
        """
        draw n_samples points from self.model and plot them as a time series

        the first element returned by self.model.sample is used as the y
        values and the positional index as the x values; the figure is
        written to <script_dir>/plots/<class name>/sample.png
        """
        sampled_ts = TimeSeries()
        sampled_ts.y = self.model.sample(n_samples)[0]
        sampled_ts.x = range(len(sampled_ts.y))

        class_name = self.__class__.__name__
        png_path = "{}/plots/{}/sample.png".format(script_dir, class_name)
        plot_procedures.plot_ts(sampled_ts, png_path, compress=True)
def get_ts_per_name(traceroute_type, ts_traceroute, dt_start, dt_end):
    """
    build one latency time series per hop name seen in ts_traceroute

    every element of ts_traceroute.y is iterated as a list of hops, each hop
    being a dict with "names" and "times" entries. Returns a dict that maps
    the value returned by get_name for the first name of a hop to a
    TimeSeries whose points are (ts_traceroute.x[i], mean of the int entries
    of hop["times"] divided by 1000.0)
    """
    # first ask get_ip_name for a mapping over the whole series; if that
    # yields nothing, the mapping is recomputed for each traceroute
    # individually inside the loop
    update_ip_name = False
    ip_name = traceroute_exploratory_prints.get_ip_name(ts_traceroute.y)
    if not ip_name:
        update_ip_name = True

    name_ts = {}
    for i in xrange(len(ts_traceroute.y)):
        traceroute = ts_traceroute.y[i]

        if update_ip_name:
            ip_name = traceroute_exploratory_prints.get_ip_name([traceroute])

        # consistency pass: decide whether this traceroute must be skipped.
        # The flag is never reset inside this loop, so one offending hop
        # discards the whole traceroute
        ignore_traceroute = False
        for hop in traceroute:
            # curr_name ends up as the last name of the hop that is not
            # u"##" (None when every name is u"##")
            # presumably u"##" marks a probe without a resolved name --
            # TODO confirm
            curr_name = None
            for name in hop["names"]:
                if name != u"##":
                    curr_name = name
            if curr_name:
                # a hop mixing u"##" with real names is inconsistent
                for name in hop["names"]:
                    if name == u"##":
                        ignore_traceroute = True
                if not ignore_traceroute:
                    # all names of the hop must map to the same get_name
                    # value
                    name = get_name(hop["names"][0], ip_name, traceroute_type)
                    for j in xrange(1, len(hop["names"])):
                        curr_name = get_name(hop["names"][j], ip_name,
                                             traceroute_type)
                        if name != curr_name:
                            ignore_traceroute = True
        if ignore_traceroute:
            continue

        # accumulation pass: one point per hop whose first name is not "##"
        for hop in traceroute:
            name = hop["names"][0]
            if name != "##":
                name = get_name(name, ip_name, traceroute_type)

                if name not in name_ts:
                    name_ts[name] = TimeSeries(dt_start=dt_start,
                                               dt_end=dt_end)
                # only entries whose type is exactly int are averaged; any
                # other entry in hop["times"] is ignored
                sum_latency = 0
                cnt_latency = 0
                for latency in hop["times"]:
                    if type(latency) == int:
                        sum_latency += latency / 1000.0
                        cnt_latency += 1
                if cnt_latency > 0:
                    mean_latency = sum_latency / cnt_latency
                else:
                    # no int latency in this hop: no point is added for it
                    continue
                # NOTE(review): this overwrites the dt_start/dt_end given to
                # the constructor above with the ones of ts_traceroute --
                # confirm that the function arguments are meant to be
                # superseded
                name_ts[name].dt_start = ts_traceroute.dt_start
                name_ts[name].dt_end = ts_traceroute.dt_end
                name_ts[name].x.append(ts_traceroute.x[i])
                name_ts[name].y.append(mean_latency)
    return name_ts
예제 #3
0
def simulate():
    """
    return a compressed TimeSeries made of two consecutive normal segments

    the first 1000 samples are drawn with np.random.normal(1, 0.2) and the
    next 1000 with np.random.normal(5, 0.2); x goes from 1 to the number of
    samples
    """
    seg_len = 1000
    first_seg = np.random.normal(1, 0.2, seg_len)
    second_seg = np.random.normal(5, 0.2, seg_len)
    samples = np.concatenate((first_seg, second_seg))

    toy_ts = TimeSeries(compressed=True)
    toy_ts.x = range(1, len(samples) + 1)
    toy_ts.y = samples
    return toy_ts
예제 #4
0
def mean_per_hour(in_path, server, mac):
    """
    plot the "hourly" loss time series read from in_path

    dt_start and dt_end are taken from the enclosing (module) scope; the
    figure is written to ./plots/ts/ts_<server>_<mac>.png
    """
    hourly_ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end,
                           ts_type="hourly")
    hourly_ts.compress()

    png_path = "./plots/ts/ts_{}_{}.png".format(server, mac)
    plot_kwargs = {
        "ylim": [-0.05, 1.05],
        "ylabel": "Loss Fraction",
        "xlabel": "day/month",
    }
    plot_procedures.plot_ts(hourly_ts, png_path, **plot_kwargs)
def print_name_ips(dt_start, dt_end):
    """
    write, for every hop name seen in the traceroutes of the period, the
    sorted list of ips that appeared together with that name

    the output goes to
    <script_dir>/prints/<str_dt>/not_filtered/name_ips.csv with the columns
    name and ips. The ips column holds the repr of a list and is wrapped in
    double quotes, so that the commas inside the list do not split the csv
    row into extra fields
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_ip = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path,
                        metric="traceroute",
                        dt_start=dt_start,
                        dt_end=dt_end)
        for traceroute in ts.y:
            # falsy entries (e.g. None or an empty list) carry no hops
            if not traceroute:
                continue
            for hop in traceroute:
                for name, ip in zip(hop["names"], hop["ips"]):
                    name_ip.setdefault(name, set()).add(ip)

    out_path = "{}/prints/{}/not_filtered/name_ips.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("name,ips\n")
        for name in sorted(name_ip):
            # quote the list like the other per-name/per-mac csv writers do
            f.write("{},\"{}\"\n".format(name, sorted(name_ip[name])))
def print_names_per_mac(dt_start, dt_end, mac_node):
    """
    write one csv row per (server, mac) with the sorted list of names, as
    returned by get_name, found in the hops of its traceroutes

    the output goes to
    <script_dir>/prints/<str_dt>/not_filtered/names_per_mac.csv and is
    sorted by server and node once it has been written
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = "{}/prints/{}/not_filtered/names_per_mac.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("server,node,mac,names\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts = TimeSeries(in_path=in_path, metric="traceroute",
                            dt_start=dt_start, dt_end=dt_end)

            seen_names = set()
            for traceroute in ts.y:
                if not traceroute:
                    continue
                ip_name = get_ip_name(traceroute)
                for hop in traceroute:
                    for raw_name in hop["names"]:
                        seen_names.add(get_name(raw_name, ip_name))

            csv_row = "{},{},{},\"{}\"\n".format(server, mac_node.get(mac),
                                                 mac, sorted(seen_names))
            f.write(csv_row)

    utils.sort_csv_file(out_path, ["server", "node"])
def print_macs_per_name(dt_start, dt_end, mac_node):
    """
    write one csv row per name, as returned by get_name, with the sorted
    list of (server, node, mac) tuples whose traceroutes contain that name

    the output goes to
    <script_dir>/prints/<str_dt>/not_filtered/macs_per_name.csv
    """
    # TODO: Probably deprecated

    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_macs = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            if not traceroute:
                continue
            ip_name = get_ip_name(traceroute)
            for hop in traceroute:
                for raw_name in hop["names"]:
                    resolved = get_name(raw_name, ip_name)
                    client = (server, mac_node.get(mac), mac)
                    name_macs.setdefault(resolved, set()).add(client)

    out_path = "{}/prints/{}/not_filtered/macs_per_name.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("name,macs\n")
        for name in sorted(name_macs):
            f.write("{},\"{}\"\n".format(name, sorted(name_macs[name])))
예제 #8
0
 def unpack_row(self, row):
     """
     return (server, mac, dt_start, dt_end, ts) for a row

     row["dt_start"] and row["dt_end"] are parsed with the "%Y-%m-%d"
     format and ts is the "loss" TimeSeries of the client in that period
     """
     date_format = "%Y-%m-%d"
     parse = datetime.datetime.strptime
     dt_start = parse(row["dt_start"], date_format)
     dt_end = parse(row["dt_end"], date_format)

     server, mac = row["server"], row["mac"]
     in_path = utils.get_in_path(server, mac, dt_start, dt_end)
     loss_ts = TimeSeries(in_path, "loss", dt_start, dt_end)
     return server, mac, dt_start, dt_end, loss_ts
    def plot_single(self, row, cmp_class_args, dataset, out_path):
        """
        predict the change points of the time series described by row,
        print them next to the annotated ones and the confusion matrix,
        and write a plot and a csv listing the predicted change points

        the plot goes to <out_path>.png and the csv to <out_path>.png.csv
        """
        ts_preprocessed = cp_utils.get_ts(row, self.preprocess_args,
                                          self.metric)
        predicted = self.predict(ts_preprocessed)
        annotated = cp_utils.from_str_to_int_list(row["change_points_ids"])
        predicted.sort()
        annotated.sort()

        conf = cmp_class.conf_mat(annotated, predicted, ts_preprocessed,
                                  cmp_class.match_id, **cmp_class_args)
        print("pred={}".format(predicted))
        print("correct={}".format(annotated))
        print("conf={}".format(conf))

        # the csv always lists the predicted change points; for an
        # unsupervised dataset the two lists trade places in the call to
        # self.plot, so that the predicted ones are drawn in the time series
        if "unsupervised" in dataset:
            plot_correct, plot_pred = predicted, annotated
        else:
            plot_correct, plot_pred = annotated, predicted

        in_path, dt_start, dt_end = cp_utils.unpack_pandas_row(row)
        ts_raw = TimeSeries(in_path, self.metric, dt_start, dt_end)

        png_path = "{}.png".format(out_path)
        self.plot(ts_preprocessed, ts_raw, plot_correct, plot_pred, conf,
                  png_path)

        # NOTE(review): the csv name is derived from the png path, giving
        # "<out_path>.png.csv" -- kept as is, confirm whether intended
        csv_path = "{}.csv".format(png_path)
        self.print_cp(ts_preprocessed, predicted, csv_path)
예제 #10
0
def mean_per_hour_in_a_day(in_path, server, mac):
    """
    plot, for each hour of the day, the mean of the loss samples measured
    in that hour, with a band of +/- the variance of those samples

    dt_start and dt_end are taken from the enclosing (module) scope. Hours
    without any sample are left out of the plot. The figure is written to
    ./plots/mean_per_hour_in_a_day/mean_per_hour_in_a_day_<server>_<mac>.png
    """
    ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end)

    # bucket the samples by the hour of their measure datetime
    hour_samples = [[] for _ in range(24)]
    for dt, sample in zip(ts.x, ts.y):
        hour_samples[dt.hour].append(sample)
    hour_cnt = [len(samples) for samples in hour_samples]

    hour_list, mean_list, var_list = [], [], []
    for hour, samples in enumerate(hour_samples):
        # the check must look at the bucket of this hour: the outer list
        # always has 24 entries, and np.mean/np.var of an empty bucket
        # would put nan in the plot
        if not samples:
            continue
        hour_list.append(hour)
        mean_list.append(np.mean(samples))
        var_list.append(np.var(samples))
    hour_list = np.asarray(hour_list)
    mean_list = np.asarray(mean_list)
    var_list = np.asarray(var_list)

    print("hour_cnt={}".format(hour_cnt))
    plt.clf()
    matplotlib.rcParams.update({'font.size': 30})
    plt.gcf().set_size_inches(16, 15)
    plt.grid()
    plt.ylabel("Mean Loss Fraction")
    plt.xlabel("Hour")
    plt.xticks(range(0, 24, 2), rotation=45)
    plt.plot(hour_list, mean_list, marker="o")
    plt.fill_between(hour_list,
                     mean_list - var_list,
                     mean_list + var_list,
                     alpha=0.3)
    plt.savefig("./plots/mean_per_hour_in_a_day/mean_per_hour_in_a_day_{}_{}."
                "png".format(server, mac))
예제 #11
0
def add_cp_ids():
    """
    add the column change_points_ids to <script_dir>/data_web_system.csv:
    the ids of the change points, i.e. their indexes when the points of the
    time series are sorted by measure datetime

    nothing is done when the column is already present. Rows whose time
    series is empty, or whose change_points field is '', get an empty string
    """
    csv_path = "{}/data_web_system.csv".format(script_dir)
    df = pd.read_csv(csv_path)
    if "change_points_ids" in df:
        return

    cp_ids = []
    for _, row in df.iterrows():
        dt_start = dt_procedures.from_strdt_to_dt(row["dt_start"])
        dt_end = dt_procedures.from_strdt_to_dt(row["dt_end"])
        dt_dir = utils.get_dt_dir(dt_start, dt_end)
        ts_path = "{}/input/{}/{}/{}.csv".format(base_dir, dt_dir,
                                                 row["server"], row["mac"])

        ts = TimeSeries(ts_path, "loss", dt_start=dt_start, dt_end=dt_end)

        str_ids = ""
        if ts.x and str(row["change_points"]) != "''":
            l_dt = [dt_procedures.from_js_strdt_to_dt(strdt)
                    for strdt in row["change_points"].split(",")]
            l_id = from_dt_to_id(ts_path, "loss", dt_start, dt_end, l_dt)
            str_ids = ",".join(str(cp_id) for cp_id in l_id)
        cp_ids.append(str_ids)

    df["change_points_ids"] = cp_ids
    df.to_csv(csv_path, index=False)
예제 #12
0
def acf(in_path, server, mac):
    """
    plot the values returned by get_acf(84) for the "hourly" loss time
    series read from in_path

    dt_start and dt_end are taken from the enclosing (module) scope; the
    figure is written to ./plots/acf/acf_<server>_<mac>.png
    """
    max_lag = 84
    hourly_ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end,
                           ts_type="hourly")
    lags, acf_values, cnt_pairs = hourly_ts.get_acf(max_lag)
    print("cnt_pairs={}".format(cnt_pairs))

    plt.clf()
    matplotlib.rcParams.update({'font.size': 30})
    figure = plt.gcf()
    figure.set_size_inches(16, 15)
    plt.grid()
    plt.xticks(range(0, max_lag + 1, 12), rotation=45)
    plt.xlabel("Lag (hours)")
    plt.ylabel("ACF")
    plt.plot(lags, acf_values, marker="o")

    png_path = "./plots/acf/acf_{}_{}.png".format(server, mac)
    plt.savefig(png_path)
예제 #13
0
def plot(dt_start, dt_end, metric):
    """
    plot the median filtered time series of metric for every (server, mac)
    yielded by utils.iter_server_mac

    the figures are written to
    <script_dir>/<str_dt>/<metric>/<out_file_name>.png
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)
    period_dir = "{}/{}".format(script_dir, str_dt)
    metric_dir = "{}/{}/{}".format(script_dir, str_dt, metric)
    utils.create_dirs([period_dir, metric_dir])

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        png_path = "{}/{}/{}/{}.png".format(script_dir, str_dt, metric,
                                            file_name)

        # the unfiltered series is only used to derive the default y label
        ts_raw = TimeSeries(in_path, metric, dt_start, dt_end)

        ts_median = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median.percentile_filter(win_len=5, p=0.5)

        # NOTE: other views used to be produced here and are disabled: an
        # stl decomposition of the filtered series (when it had more than
        # 100 points), a shared x plot of raw vs. filtered, and a comparison
        # with/without cross traffic (cross_traffic_thresh=0, win_len=13)

        default_ylabel = plot_procedures.get_default_ylabel(ts_raw)
        plot_procedures.plot_ts(ts_median, png_path, ylabel=default_ylabel,
                                compress=False, title="median filtered")
예제 #14
0
    def plot(self, server, mac, dt_start, dt_end, ts, hidden_state_path):
        """
        plot ts and hidden_state_path in two panels that share the x axis

        hidden_state_path is drawn as a scatter over the x values of ts,
        with one y tick per component of self.model; the figure is written
        to <script_dir>/plots/<class name>/<out_file_name>.png
        """
        # deep copies keep the plotted series independent of the arguments
        ts_states = TimeSeries()
        ts_states.x = copy.deepcopy(ts.x)
        ts_states.y = copy.deepcopy(hidden_state_path)

        file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        class_name = self.__class__.__name__
        png_path = "{}/plots/{}/{}.png".format(script_dir, class_name,
                                               file_name)

        panel_kwargs = {
            "compress": True,
            "title1": "raw time series",
            "title2": "best hidden state path",
            "plot_type2": "scatter",
            "yticks2": range(self.model.n_components),
            "ylim2": [-0.5, self.model.n_components - 0.5],
        }
        plot_procedures.plot_ts_share_x(ts, ts_states, png_path,
                                        **panel_kwargs)
def plot_latencies_traceroute(dt_start, dt_end, preprocess_args):
    """
    plot the preprocessed latency time series of every hop, except the last
    one, of the traceroutes listed in
    <script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv

    only rows with a truthy valid_cnt_samples are considered and, for each
    traceroute type, only rows whose valid field of that type is truthy
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    csv_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    df = pd.read_csv(csv_path)
    for _, row in df.iterrows():
        if not row["valid_cnt_samples"]:
            continue

        in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                    dt_end)
        ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                   dt_start=dt_start, dt_end=dt_end)

        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            valid_traceroute_field, traceroute_field = \
                cp_utils.get_traceroute_fields(traceroute_type)
            if not row[valid_traceroute_field]:
                continue

            traceroute = ast.literal_eval(row[traceroute_field])
            name_ts = get_ts_per_name(traceroute_type, ts_traceroute,
                                      dt_start, dt_end)

            # the directory ends with the hops of the traceroute, reversed,
            # one path component per hop
            base_path = "{}/plots/paths/{}/{}/{}/{}".format(
                script_dir, str_dt, "latency", traceroute_type,
                row["server"])
            hops_path = "/".join(str(hop) for hop in reversed(traceroute))
            dir_path = "{}/{}".format(base_path, hops_path)

            utils.create_dirs([
                "{}/traceroute_latencies/".format(dir_path),
                "{}/traceroute_latencies/{}".format(dir_path, row["mac"])
            ])

            # the last hop of the traceroute is not plotted
            for hop_idx, hop in enumerate(traceroute[:-1]):
                name = hop[0][0]
                hop_file = "hop{}_{}".format(str(hop_idx).zfill(2), name)
                out_path = "{}/traceroute_latencies/{}/{}.png".format(
                    dir_path, row["mac"], hop_file)

                ts_preprocessed = name_ts[name].copy()
                cp_utils.preprocess(ts_preprocessed, preprocess_args)
                ts_preprocessed.metric = "latency"
                plot_procedures.plot_ts(ts_preprocessed, out_path,
                                        title="median filtered")
예제 #16
0
def print_cps(dt_start, dt_end, dir_model, metric, preprocess_args):
    """
    write <script_dir>/prints/<str_dt>/filtered/<metric>/cps_per_mac.csv
    with one row per change point csv found in the unsupervised plots
    directory of dir_model

    each row holds the server and mac parsed from the file name, the change
    point datetimes ("dt" column of the csv), the change point types
    accumulated by update_type_cps, and the mean of every segment delimited
    by the change point ids ("dt_id" column) in the preprocessed time
    series. The last segment goes from the last change point up to and
    including the last sample
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/prints/".format(script_dir),
                       "{}/prints/{}".format(script_dir, str_dt),
                       "{}/prints/{}/filtered".format(script_dir, str_dt),
                       "{}/prints/{}/filtered/{}".format(script_dir, str_dt,
                                                         metric)])

    out_path = "{}/prints/{}/filtered/{}/cps_per_mac.csv".format(script_dir,
                                                                 str_dt,
                                                                 metric)
    with open(out_path, "w") as f:
        f.write("server,mac,cp_dts,type_cps,seg_means\n")
        in_path_dir = ("{}/change_point/models/{}/plots/unsupervised/{}/{}".
                       format(base_dir, dir_model, str_dt, metric))

        cnt = 0
        for file_name in os.listdir(in_path_dir):
            if ".csv" not in file_name:
                continue
            cnt += 1
            print("cnt={}".format(cnt))

            # the client is encoded in the file name as
            # "...server<server>_..." and "...mac<mac>_..."
            server = file_name.split("server")[1].split("_")[0]
            mac = file_name.split("mac")[1].split("_")[0]

            dt_cps = []
            id_cps = []
            df = pd.read_csv("{}/{}".format(in_path_dir, file_name))
            for _, row in df.iterrows():
                dt_cps.append(row["dt"])
                id_cps.append(row["dt_id"])

            in_path = utils.get_in_path(server, mac, dt_start, dt_end)
            ts = TimeSeries(in_path, metric, dt_start, dt_end)
            cp_utils.preprocess(ts, preprocess_args)

            seg_means = []
            type_cps = []
            if id_cps:
                # segments are [0, cp_0), [cp_0, cp_1), ..., [cp_last, end];
                # every change point is classified from the means of the
                # segments on its two sides
                mean1 = np.mean(ts.y[0:id_cps[0]])
                seg_means.append(mean1)
                for i in range(1, len(id_cps)):
                    mean2 = np.mean(ts.y[id_cps[i - 1]:id_cps[i]])
                    seg_means.append(mean2)
                    update_type_cps(type_cps, mean1, mean2, metric)
                    mean1 = mean2
                # open ended slice: a stop of -1 would leave the last
                # sample out of the mean (and give an empty slice when the
                # last change point is the last sample)
                mean2 = np.mean(ts.y[id_cps[-1]:])
                seg_means.append(mean2)
                update_type_cps(type_cps, mean1, mean2, metric)

            f.write("{},{},\"{}\",\"{}\",\"{}\"\n".format(server, mac,
                                                          dt_cps, type_cps,
                                                          seg_means))
예제 #17
0
def write_all_samples_to_file(targets, in_dir):
    """
    write the loss samples of every target to ./plots/distribution/samples.csv

    targets is iterated as a sequence whose items hold the server at index 0
    and the mac at index 1; the time series of each target is read from
    <in_dir>/<server>/<mac>.csv. dt_start and dt_end are taken from the
    enclosing (module) scope. The file has the header "samples" followed by
    one sample per line, in the order of targets
    """
    samples = []
    for target in targets:
        server, mac = target[0], target[1]
        in_path = "{}/{}/{}.csv".format(in_dir, server, mac)
        ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end)
        # extend in place: rebuilding the list with + on every target
        # copies everything collected so far each time
        samples.extend(ts.y)
        print("server={}, mac={}".format(server, mac))

    with open("./plots/distribution/samples.csv", "w") as f:
        f.write("samples\n")
        f.writelines("{}\n".format(x) for x in samples)
예제 #18
0
def plot_per_node(dt_start, dt_end, metric, only_unique_traceroute):
    """
    plot, grouped by node, the raw time series of metric next to its median
    filtered version for every client whose node is a valid one

    when only_unique_traceroute is truthy, only the macs returned by
    read_input.get_macs_traceroute_filter(dt_start, dt_end, "filtered") are
    plotted. The figures are written to
    <script_dir>/plots/nodes/<str_dt>/<metric>/<node>/<out_file_name>.png
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/plots/".format(script_dir),
        "{}/plots/nodes".format(script_dir),
        "{}/plots/nodes/{}".format(script_dir, str_dt),
        "{}/plots/nodes/{}/{}".format(script_dir, str_dt, metric)
    ])

    valid_nodes = read_input.get_valid_nodes()
    mac_node = read_input.get_mac_node()
    macs_unique_traceroute = read_input.get_macs_traceroute_filter(
        dt_start, dt_end, "filtered")

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        if only_unique_traceroute and mac not in macs_unique_traceroute:
            continue

        node = mac_node[mac]
        if node not in valid_nodes:
            continue

        node_dir = "{}/plots/nodes/{}/{}/{}".format(script_dir, str_dt,
                                                    metric, node)
        utils.create_dirs([node_dir])

        file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        png_path = "{}/plots/nodes/{}/{}/{}/{}.png".format(
            script_dir, str_dt, metric, node, file_name)

        ts_raw = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median.percentile_filter(win_len=13, p=0.5)
        plot_procedures.plot_ts_share_x(ts_raw, ts_median, png_path,
                                        compress=False,
                                        plot_type2="scatter")
예제 #19
0
def from_dt_to_id(in_path, metric, dt_start, dt_end, l_dt):
    """
    return, for every datetime in l_dt, its index in the x values of the
    time series read from in_path

    when a datetime appears more than once in the time series, the index of
    its last occurrence is used; a datetime of l_dt that is not in the time
    series raises KeyError
    """
    ts = TimeSeries(in_path, metric, dt_start=dt_start, dt_end=dt_end)
    dt_id = {measure_dt: idx for idx, measure_dt in enumerate(ts.x)}
    return [dt_id[target_dt] for target_dt in l_dt]
def print_traceroute_per_mac(dt_start, dt_end):
    """
    write one csv row per (server, mac) with the result of get_traceroute
    for four combinations of its three flags

    every combination contributes a valid_* column and a quoted traceroute
    column. The output goes to
    <script_dir>/prints/<str_dt>/not_filtered/traceroute_per_mac.csv and is
    sorted by server and mac once it has been written
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    # flags passed to get_traceroute, in the order of the csv columns:
    # compress_embratel, compress_embratel_without_last_hop_embratel,
    # without_embratel and the plain traceroute
    flag_sets = [
        (True, True, True),
        (True, True, False),
        (False, False, False),
        (True, False, False),
    ]
    row_format = "{},{}" + ",{},\"{}\"" * len(flag_sets) + "\n"

    out_path = "{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("server,mac,"
                "valid_traceroute_compress_embratel,"
                "traceroute_compress_embratel,"
                "valid_traceroute_compress_embratel_without_last_hop_embratel,"
                "traceroute_compress_embratel_without_last_hop_embratel,"
                "valid_traceroute_without_embratel,"
                "traceroute_without_embratel,"
                "valid_traceroute,"
                "traceroute\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                       dt_start=dt_start, dt_end=dt_end)

            fields = [server, mac]
            for flags in flag_sets:
                valid, traceroute = get_traceroute(ts_traceroute, *flags)
                fields.append(valid)
                fields.append(traceroute)
            f.write(row_format.format(*fields))
    utils.sort_csv_file(out_path, ["server", "mac"])
예제 #21
0
def sliding_window(ts):
    """
    plot ts together with the hellinger distance between the 100 samples
    before and the 100 samples from each position on

    the distance is computed for every position that has a full window on
    both sides; the figure is written to ./sliding_window_toy_example.png
    """
    win_len = 100
    first_pos = win_len
    end_pos = len(ts.y) - win_len + 1

    ts_dist = TimeSeries(compressed=True)
    for pos in xrange(first_pos, end_pos):
        left_win = ts.y[pos - win_len:pos]
        right_win = ts.y[pos:pos + win_len]
        dist = cmp_win.hellinger_dist(left_win, right_win,
                                      bins=np.arange(0.02, 10.02, 0.02))
        ts_dist.x.append(ts.x[pos])
        ts_dist.y.append(dist)

    share_x_kwargs = {
        "compress": True,
        "plot_type1": "plot",
        "ylim1": [0, max(ts.y)],
        "ylabel1": "$y_{i}$",
        "ylabel2": "$H_{i}$",
        "xlabel": "$i$",
    }
    plot_procedures.plot_ts_share_x(ts, ts_dist,
                                    "./sliding_window_toy_example.png",
                                    **share_x_kwargs)
예제 #22
0
def plot_per_name(dt_start, dt_end, metric, preprocess_args, plot_cps=True):
    """
    plot the preprocessed time series of metric of every valid client under
    the directory of each name of its filtered traceroute, for every
    traceroute type

    a client is plotted only once: the first figure produced for it is
    copied to every other directory where it is needed. The figures are
    written to <script_dir>/plots/names/<str_dt>/<metric>/
    <traceroute_type>/<server>/<name>/<out_file_name>.png
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/plots/".format(script_dir),
        "{}/plots/names".format(script_dir),
        "{}/plots/names/{}".format(script_dir, str_dt),
        "{}/plots/names/{}/{}".format(script_dir, str_dt, metric)
    ])

    client_cps = unsupervised_utils.get_client_cps(plot_cps, str_dt, metric)

    # client -> path of the figure already plotted for it
    client_plotPath = {}

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        utils.create_dirs([
            "{}/plots/names/{}/{}/{}".format(script_dir, str_dt, metric,
                                             traceroute_type)
        ])

        df = pd.read_csv("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))
        cnt = 0
        for _, row in df.iterrows():
            if not (row["valid_cnt_samples"] and row[valid_traceroute_field]):
                continue

            print("cnt={}, traceroute_type={}, str_dt={}".format(
                cnt, traceroute_type, str_dt))
            cnt += 1

            server, mac = row["server"], row["mac"]
            client = utils.get_client(server, mac)

            traceroute = ast.literal_eval(row[traceroute_field])
            for name in cp_utils.iter_names_traceroute_filtered(traceroute):
                utils.create_dirs([
                    "{}/plots/names/{}/{}/{}/{}".format(
                        script_dir, str_dt, metric, traceroute_type, server),
                    "{}/plots/names/{}/{}/{}/{}/{}".format(
                        script_dir, str_dt, metric, traceroute_type, server,
                        name)
                ])

                file_name = utils.get_out_file_name(server, mac, dt_start,
                                                    dt_end)
                out_path = "{}/plots/names/{}/{}/{}/{}/{}/{}.png".format(
                    script_dir, str_dt, metric, traceroute_type, server,
                    name, file_name)

                # reuse the figure of a client that was already plotted
                if client in client_plotPath:
                    shutil.copyfile(client_plotPath[client], out_path)
                    continue

                client_plotPath[client] = out_path
                cp_dts = client_cps[client]

                in_path = "{}/input/{}/{}/{}.csv".format(base_dir, dt_dir,
                                                         server, mac)
                ts = TimeSeries(in_path, metric, dt_start, dt_end)
                cp_utils.preprocess(ts, preprocess_args)
                plot_procedures.plot_ts(ts, out_path, dt_axvline=cp_dts,
                                        title="median filtered")
def process():
    """
    walk the rows of ./classifications.csv whose email is target_email,
    accumulating change point statistics and plotting each time series

    for every such row the "loss" time series of the period is read, the
    change point datetimes are matched against the x values of the time
    series, and segment lengths, absolute mean differences and hellinger
    distances between consecutive segments are appended to local lists

    NOTE(review): the accumulators filled here (cnt_points, the *_samples
    lists and the *_consecutive_segs lists) are neither returned nor
    written anywhere in the code visible here -- confirm whether their
    consumer was dropped
    """
    create_dirs(target_email)

    cnt_points = 0
    cps_per_ts_samples = []
    middle_seg_len_samples = []
    first_seg_len_samples = []
    last_seg_len_samples = []
    abs_mean_diff_consecutive_segs = []
    hellinger_dist_consecutive_segs = []
    cnt_rows = 0

    df = pd.read_csv("./classifications.csv", sep=";")
    for idx, row in df.iterrows():
        if row["email"] == target_email:
            cnt_rows += 1
            print "cnt_rows={}".format(cnt_rows)
            # print "row={}".format(row)

            # get change points list
            # (the literal string '' in the column means no change points)
            dt_cp_list = []
            if row["change_points"] != "''":
                for strdt in row["change_points"].split(","):
                    dt = dt_procedures.from_js_strdt_to_dt(strdt)
                    dt_cp_list.append(dt)
            dt_cp_list.sort()

            dt_start = dt_procedures.from_js_strdate_to_dt(row["date_start"])
            dt_end = dt_procedures.from_js_strdate_to_dt(row["date_end"])
            # the input directory is named after the year and the zero
            # padded month of dt_start
            in_path = ("../../../input/{}_{}/{}/{}.csv"
                       "".format(dt_start.year,
                                 str(dt_start.month).zfill(2), row["server"],
                                 row["mac"]))

            ts = TimeSeries(in_path=in_path,
                            metric="loss",
                            dt_start=dt_start,
                            dt_end=dt_end)

            cnt_points += len(ts.y)
            cps_per_ts_samples.append(len(dt_cp_list))

            # get id_cp_list, first_seg_len, middle_seg_len, last_seg_len
            # ts.x and dt_cp_list are scanned together: i walks ts.x and j
            # only advances when ts.x[i] equals the next change point. This
            # relies on dt_cp_list being sorted (done above) and presumably
            # on ts.x being in increasing order -- TODO confirm
            id_cp_list = []
            i, j, last_cp_i = 0, 0, -1
            while (i < len(ts.x)) and (j < len(dt_cp_list)):
                if ts.x[i] == dt_cp_list[j]:
                    id_cp_list.append(i)
                    if j == 0:
                        first_seg_len_samples.append(i + 1)
                    if j == len(dt_cp_list) - 1:
                        last_seg_len_samples.append(len(ts.x) - i - 1)
                    if last_cp_i != -1:
                        middle_seg_len_samples.append(i - last_cp_i)
                    last_cp_i = i
                    j += 1
                i += 1
            # some change point datetime was not found in ts.x
            if (j != len(dt_cp_list)):
                print "ERROR"

            # get abs_mean_diff_consecutive_segs and
            # hellinger_dist_consecutive_segments
            # segs_list holds [first id, last id] pairs, both inclusive (see
            # the + 1 in the slices below); change points located at the
            # first or at the last index of the time series are skipped
            segs_list = []
            last_id_cp = -1
            for i in xrange(len(id_cp_list)):
                if (id_cp_list[i] == 0) or (id_cp_list[i] == len(ts.y) - 1):
                    continue
                segs_list.append([last_id_cp + 1, id_cp_list[i]])
                # after the last change point, add the trailing segment
                if i == len(id_cp_list) - 1:
                    segs_list.append([id_cp_list[i] + 1, len(ts.y) - 1])
                last_id_cp = id_cp_list[i]
            # compare every segment with the one that precedes it
            for i in xrange(1, len(segs_list)):
                l1 = ts.y[segs_list[i - 1][0]:segs_list[i - 1][1] + 1]
                l2 = ts.y[segs_list[i][0]:segs_list[i][1] + 1]

                mean1 = np.mean(l1)
                mean2 = np.mean(l2)
                abs_mean_diff_consecutive_segs.append(abs(mean1 - mean2))

                bins = np.arange(0.0, 1.02, 0.02)
                hellinger_dist_consecutive_segs.append(
                    hellinger_dist(l1, l2, bins))

            plot(row, dt_cp_list, ts, dt_start, dt_end)
def print_empty_segs(dt_start,
                     dt_end,
                     metric,
                     min_seg_len,
                     filtered,
                     plot=False):
    """Write the empty segments of every target mac's time series to a CSV.

    For each (server, mac) yielded by ``utils.iter_server_mac`` whose mac is
    in the set returned by ``read_input.get_macs_traceroute_filter``, the
    ``metric`` time series is loaded and every interval flagged by
    ``is_empty_seg`` is collected. Three kinds of interval are checked:
    ``dt_start`` to the first sample, each pair of consecutive samples, and
    the last sample to ``dt_end``. Series with fewer than two samples get an
    empty list.

    Output goes to
    ``<script_dir>/prints/<str_dt>/<filtered>/<metric>/empty_segs_per_mac.csv``
    with the columns ``server,mac,empty_segs``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        metric: metric name passed to ``TimeSeries``; also used in the
            output paths.
        min_seg_len: threshold forwarded unchanged to ``is_empty_seg``.
        filtered: filter name passed to
            ``read_input.get_macs_traceroute_filter``; also used as an
            output directory name.
        plot: when true, additionally plot each series with a vertical line
            at every sample that borders an empty segment.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/prints/".format(script_dir),
        "{}/prints/{}".format(script_dir, str_dt),
        "{}/prints/{}/{}".format(script_dir, str_dt, filtered),
        "{}/prints/{}/{}/{}".format(script_dir, str_dt, filtered, metric)
    ])

    out_path = "{}/prints/{}/{}/{}/empty_segs_per_mac.csv".format(
        script_dir, str_dt, filtered, metric)
    with open(out_path, "w") as f:
        f.write("server,mac,empty_segs\n")

        target_macs = read_input.get_macs_traceroute_filter(
            dt_start, dt_end, filtered)
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            if mac not in target_macs:
                continue

            ts = TimeSeries(in_path=in_path,
                            metric=metric,
                            dt_start=dt_start,
                            dt_end=dt_end)

            axvline_dts = []
            empty_segs = []
            if len(ts.x) >= 2:
                first_dt = ts.x[0]
                last_dt = ts.x[-1]

                # gap between the start of the period and the first sample
                if is_empty_seg(dt_start, first_dt, min_seg_len):
                    axvline_dts.append(first_dt)
                    empty_segs.append([str(dt_start), str(first_dt)])

                # gaps between consecutive samples
                for i in xrange(1, len(ts.x)):
                    prev_dt = ts.x[i - 1]
                    curr_dt = ts.x[i]
                    if is_empty_seg(prev_dt, curr_dt, min_seg_len):
                        axvline_dts.append(prev_dt)
                        axvline_dts.append(curr_dt)
                        empty_segs.append([str(prev_dt), str(curr_dt)])

                # gap between the last sample and the end of the period.
                # The marker must be the last sample itself: the previous
                # code used ts.x[i - 1] with the leaked loop index, which is
                # the second-to-last sample.
                if is_empty_seg(last_dt, dt_end, min_seg_len):
                    axvline_dts.append(last_dt)
                    empty_segs.append([str(last_dt), str(dt_end)])

            f.write("{},{},\"{}\"\n".format(server, mac, empty_segs))

            if plot:
                utils.create_dirs([
                    "{}/plots/".format(script_dir),
                    "{}/plots/empty_segs".format(script_dir),
                    "{}/plots/empty_segs/{}".format(script_dir, str_dt),
                    "{}/plots/empty_segs/{}/{}".format(script_dir, str_dt,
                                                       metric)
                ])

                out_file_name = utils.get_out_file_name(
                    server, mac, dt_start, dt_end)
                # separate name so the CSV path above is not shadowed
                plot_path = ("{}/plots/empty_segs/{}/{}/{}.png".format(
                    script_dir, str_dt, metric, out_file_name))
                plot_procedures.plot_ts(ts, plot_path, dt_axvline=axvline_dts)
def print_traceroute_per_mac_filtered(dt_start,
                                      dt_end,
                                      min_fraction_of_samples=0.7):
    """Derive the filtered traceroute_per_mac.csv from the not_filtered one.

    Reads ``<script_dir>/prints/<str_dt>/not_filtered/traceroute_per_mac.csv``
    and, for every row, runs each traceroute variant through
    ``get_traceroute_filtered``. Each output row carries the original
    validity flags, the filtered traceroutes and a ``valid_cnt_samples``
    flag, and is written to
    ``<script_dir>/prints/<str_dt>/filtered/traceroute_per_mac.csv``.

    Args:
        dt_start: start of the analysed period.
        dt_end: end of the analysed period.
        min_fraction_of_samples: minimum ratio between the number of
            samples found and ``48 * whole days in the period`` for a client
            to be flagged with ``valid_cnt_samples = True``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    filtered_csv = ("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt))
    not_filtered_csv = (
        "{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))

    header = ("server,mac,valid_cnt_samples,"
              "valid_traceroute_compress_embratel,"
              "traceroute_compress_embratel_filter,"
              "valid_traceroute_compress_embratel_without_last_hop_embratel,"
              "traceroute_compress_embratel_without_last_hop_embratel_filter,"
              "valid_traceroute_without_embratel,"
              "traceroute_without_embratel_filter,"
              "valid_traceroute,"
              "traceroute_filter\n")

    # (validity column, traceroute column) for each traceroute variant, in
    # the order in which they appear in the output row
    variant_columns = (
        ("valid_traceroute_compress_embratel",
         "traceroute_compress_embratel"),
        ("valid_traceroute_compress_embratel_"
         "without_last_hop_embratel",
         "traceroute_compress_embratel_"
         "without_last_hop_embratel"),
        ("valid_traceroute_without_embratel",
         "traceroute_without_embratel"),
        ("valid_traceroute",
         "traceroute"),
    )

    # server, mac, valid_cnt_samples, then one (flag, "traceroute") pair per
    # variant
    row_template = "{},{},{}" + ",{},\"{}\"" * 4 + "\n"

    with open(filtered_csv, "w") as f:
        f.write(header)

        df = pd.read_csv(not_filtered_csv)
        for cnt, (_, row) in enumerate(df.iterrows()):
            print(
                "print_traceroute_per_mac_filtered, str_dt={}, cnt={}".format(
                    str_dt, cnt))

            variant_fields = []
            for valid_col, traceroute_col in variant_columns:
                traceroute_filtered = get_traceroute_filtered(
                    row[valid_col], row[traceroute_col], row["server"])
                variant_fields.append(row[valid_col])
                variant_fields.append(traceroute_filtered)

            # check if client has the minimum number of samples. Since the
            # metric is not specified at the moment, only check the presence
            # of the measurement timestamp
            client_path = utils.get_in_path(row["server"], row["mac"],
                                            dt_start, dt_end)
            ts = TimeSeries(in_path=client_path,
                            metric="dt",
                            dt_start=dt_start,
                            dt_end=dt_end)
            delta_days = (dt_end - dt_start).days
            # 48 expected samples per day
            # (presumably one measurement every 30 minutes -- TODO confirm)
            expected_samples = delta_days * 24.0 * 2.0
            fraction_of_samples = float(len(ts.y)) / expected_samples
            valid_cnt_samples = not (
                fraction_of_samples < min_fraction_of_samples)

            f.write(row_template.format(row["server"], row["mac"],
                                        valid_cnt_samples, *variant_fields))
예제 #26
0
def get_ts(row, preprocess_args, metric):
    """Load the ``metric`` time series described by ``row`` and preprocess it.

    Args:
        row: pandas row from which ``unpack_pandas_row`` extracts the input
            path and the start/end datetimes.
        preprocess_args: arguments forwarded unchanged to ``preprocess``.
        metric: metric name passed to ``TimeSeries``.

    Returns:
        The ``TimeSeries`` object after ``preprocess`` was applied to it
        (the return value of ``preprocess`` is ignored, so it presumably
        works in place -- TODO confirm).
    """
    series_path, period_start, period_end = unpack_pandas_row(row)
    series = TimeSeries(series_path, metric, period_start, period_end)
    preprocess(series, preprocess_args)
    return series