def sample(self, n_samples):
    """Draw ``n_samples`` points from ``self.model`` and plot them.

    The first element returned by ``self.model.sample(n_samples)`` becomes
    the y values of a fresh ``TimeSeries``; x is the positional index of
    each value. The figure is written to
    ``<script_dir>/plots/<class name>/sample.png``.
    """
    sampled = TimeSeries()
    sampled.y = self.model.sample(n_samples)[0]
    sampled.x = range(len(sampled.y))

    class_name = self.__class__.__name__
    out_path = "{}/plots/{}/sample.png".format(script_dir, class_name)
    plot_procedures.plot_ts(sampled, out_path, compress=True)
def get_ts_per_name(traceroute_type, ts_traceroute, dt_start, dt_end):
    """Build one latency time series per hop name from a traceroute series.

    Args:
        traceroute_type: forwarded unchanged to ``get_name``.
        ts_traceroute: series whose ``y`` entries are traceroutes (iterables
            of hop dicts with ``"names"`` and ``"times"`` keys) and whose
            ``x`` entries are the matching measurement instants.
        dt_start, dt_end: passed to the ``TimeSeries`` created for each name.

    Returns:
        dict mapping hop name -> ``TimeSeries`` holding, for every accepted
        traceroute, the mean of the int-typed entries of ``hop["times"]``
        divided by 1000.0.
    """
    # Try to resolve the ip -> name mapping once over the whole series. If
    # that yields nothing, fall back to resolving it per traceroute below.
    update_ip_name = False
    ip_name = traceroute_exploratory_prints.get_ip_name(ts_traceroute.y)
    if not ip_name:
        update_ip_name = True

    name_ts = {}
    for i in xrange(len(ts_traceroute.y)):
        traceroute = ts_traceroute.y[i]

        if update_ip_name:
            ip_name = traceroute_exploratory_prints.get_ip_name([traceroute])

        # First pass: decide whether this traceroute must be discarded.
        ignore_traceroute = False
        for hop in traceroute:
            # curr_name ends up as the last entry of the hop that is not the
            # u"##" placeholder (None when every entry is u"##").
            curr_name = None
            for name in hop["names"]:
                if name != u"##":
                    curr_name = name
            # A hop mixing a (truthy) real name with u"##" entries
            # invalidates the whole traceroute.
            if curr_name:
                for name in hop["names"]:
                    if name == u"##":
                        ignore_traceroute = True

            # All entries of a hop must map to the same name via get_name.
            # Once the flag is set this check is skipped for later hops.
            if not ignore_traceroute:
                name = get_name(hop["names"][0], ip_name, traceroute_type)
                for j in xrange(1, len(hop["names"])):
                    curr_name = get_name(hop["names"][j], ip_name,
                                         traceroute_type)
                    if name != curr_name:
                        ignore_traceroute = True

        if ignore_traceroute:
            continue

        # Second pass: append one mean-latency point per hop.
        for hop in traceroute:
            # The "##" placeholder is used as the key as-is; any other name
            # is translated through get_name.
            name = hop["names"][0]
            if name != "##":
                name = get_name(name, ip_name, traceroute_type)

            # Note: the series is created even if no point ends up being
            # appended for this hop.
            if name not in name_ts:
                name_ts[name] = TimeSeries(dt_start=dt_start, dt_end=dt_end)

            # Only entries whose type is exactly int are averaged.
            # NOTE(review): the / 1000.0 presumably converts units (e.g. us
            # to ms) -- confirm against the data source.
            sum_latency = 0
            cnt_latency = 0
            for latency in hop["times"]:
                if type(latency) == int:
                    sum_latency += latency / 1000.0
                    cnt_latency += 1
            if cnt_latency > 0:
                mean_latency = sum_latency / cnt_latency
            else:
                # no usable latency for this hop: skip the point
                continue

            # The bounds given at construction are overwritten with the
            # bounds of the input series.
            name_ts[name].dt_start = ts_traceroute.dt_start
            name_ts[name].dt_end = ts_traceroute.dt_end
            name_ts[name].x.append(ts_traceroute.x[i])
            name_ts[name].y.append(mean_latency)

    return name_ts
def simulate():
    """Return a synthetic series with a single level shift.

    The series is 1000 draws from N(1, 0.2) followed by 1000 draws from
    N(5, 0.2). x runs from 1 to the total number of points; the result is a
    ``TimeSeries`` created with ``compressed=True``.
    """
    seg_len = 1000
    low_level = np.random.normal(1, 0.2, seg_len)
    high_level = np.random.normal(5, 0.2, seg_len)
    values = np.append(low_level, high_level)

    ts = TimeSeries(compressed=True)
    ts.x = range(1, len(values) + 1)
    ts.y = values
    return ts
def mean_per_hour(in_path, server, mac):
    """Plot the hourly loss series of one client.

    ``dt_start`` and ``dt_end`` are not parameters; they are read from the
    enclosing scope. The figure is saved as
    ``./plots/ts/ts_<server>_<mac>.png``.
    """
    hourly = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end,
                        ts_type="hourly")
    hourly.compress()

    out_path = "./plots/ts/ts_{}_{}.png".format(server, mac)
    plot_procedures.plot_ts(hourly, out_path,
                            ylim=[-0.05, 1.05],
                            ylabel="Loss Fraction",
                            xlabel="day/month")
def print_name_ips(dt_start, dt_end):
    """Write, for every hop name seen in the traceroutes, the IPs it used.

    Iterates over every client yielded by ``utils.iter_server_mac``, loads
    its traceroute series and collects the set of IPs paired with each hop
    name. The result goes to
    ``<script_dir>/prints/<str_dt>/not_filtered/name_ips.csv`` with columns
    ``name,ips``.

    The ``ips`` field is the repr of a sorted list, so it contains commas
    whenever a name has more than one IP. It is therefore written inside
    double quotes (as the other CSV writers of this file do for list
    fields); otherwise such rows would not match the two-column header.
    """
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_ip = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            # falsy entries (e.g. missing measurements) carry no hops
            if not traceroute:
                continue
            for hop in traceroute:
                for name, ip in izip(hop["names"], hop["ips"]):
                    name_ip.setdefault(name, set()).add(ip)

    out_path = "{}/prints/{}/not_filtered/name_ips.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("name,ips\n")
        for name in sorted(name_ip):
            # quote the list so its inner commas stay inside one CSV field
            f.write("{},\"{}\"\n".format(name, sorted(name_ip[name])))
def print_names_per_mac(dt_start, dt_end, mac_node):
    """Write the sorted set of hop names observed by each client.

    One row per (server, mac) yielded by ``utils.iter_server_mac`` is written
    to ``<script_dir>/prints/<str_dt>/not_filtered/names_per_mac.csv`` with
    columns ``server,node,mac,names``; ``node`` is ``mac_node.get(mac)``.
    The file is then sorted by ``server`` and ``node``.
    """
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)
    out_path = "{}/prints/{}/not_filtered/names_per_mac.csv".format(
        script_dir, str_dt)

    with open(out_path, "w") as f:
        f.write("server,node,mac,names\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts = TimeSeries(in_path=in_path, metric="traceroute",
                            dt_start=dt_start, dt_end=dt_end)

            seen_names = set()
            for traceroute in ts.y:
                if not traceroute:
                    continue
                ip_name = get_ip_name(traceroute)
                for hop in traceroute:
                    for raw_name in hop["names"]:
                        seen_names.add(get_name(raw_name, ip_name))

            node = mac_node.get(mac)
            line = "{},{},{},\"{}\"\n".format(server, node, mac,
                                              sorted(seen_names))
            f.write(line)

    utils.sort_csv_file(out_path, ["server", "node"])
def print_macs_per_name(dt_start, dt_end, mac_node):
    """Write, for each hop name, the clients whose traceroutes contain it.

    A client is recorded as the tuple ``(server, mac_node.get(mac), mac)``.
    Output goes to
    ``<script_dir>/prints/<str_dt>/not_filtered/macs_per_name.csv`` with
    columns ``name,macs``, one row per name in sorted order.
    """
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_macs = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            if not traceroute:
                continue
            ip_name = get_ip_name(traceroute)
            for hop in traceroute:
                for raw_name in hop["names"]:
                    resolved = get_name(raw_name, ip_name)
                    clients = name_macs.setdefault(resolved, set())
                    clients.add((server, mac_node.get(mac), mac))

    out_path = "{}/prints/{}/not_filtered/macs_per_name.csv".format(
        script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("name,macs\n")
        for name in sorted(name_macs):
            f.write("{},\"{}\"\n".format(name, sorted(name_macs[name])))
def unpack_row(self, row):
    """Turn a dataset row into its fields plus the loaded loss series.

    ``row["dt_start"]`` and ``row["dt_end"]`` are parsed as ``%Y-%m-%d``.

    Returns:
        tuple ``(server, mac, dt_start, dt_end, ts)`` where ``ts`` is the
        ``"loss"`` ``TimeSeries`` read from the path given by
        ``utils.get_in_path``.
    """
    date_fmt = "%Y-%m-%d"
    dt_start = datetime.datetime.strptime(row["dt_start"], date_fmt)
    dt_end = datetime.datetime.strptime(row["dt_end"], date_fmt)

    server, mac = row["server"], row["mac"]
    in_path = utils.get_in_path(server, mac, dt_start, dt_end)
    ts = TimeSeries(in_path, "loss", dt_start, dt_end)
    return server, mac, dt_start, dt_end, ts
def plot_single(self, row, cmp_class_args, dataset, out_path):
    """Predict change points for one row, report them and plot the result.

    Prints the sorted predictions, the sorted reference change points
    (``row["change_points_ids"]``) and the confusion matrix computed by
    ``cmp_class.conf_mat``. The figure is saved to ``<out_path>.png`` and
    the predicted change points are written through ``self.print_cp`` to
    ``<out_path>.png.csv`` (the CSV name is derived from the PNG path).
    """
    ts_preprocessed = cp_utils.get_ts(row, self.preprocess_args, self.metric)

    pred = self.predict(ts_preprocessed)
    correct = cp_utils.from_str_to_int_list(row["change_points_ids"])
    pred.sort()
    correct.sort()

    conf = cmp_class.conf_mat(correct, pred, ts_preprocessed,
                              cmp_class.match_id, **cmp_class_args)
    print("pred={}".format(pred))
    print("correct={}".format(correct))
    print("conf={}".format(conf))

    # The CSV always receives the model's predictions, regardless of the
    # swap below.
    cp_to_print = pred
    # For an unsupervised dataset the two lists trade places before being
    # handed to self.plot, so the predicted cps are the ones plotted in the
    # position normally used for the reference ones.
    if "unsupervised" in dataset:
        pred, correct = correct, pred

    in_path, dt_start, dt_end = cp_utils.unpack_pandas_row(row)
    png_path = "{}.png".format(out_path)
    ts_raw = TimeSeries(in_path, self.metric, dt_start, dt_end)
    self.plot(ts_preprocessed, ts_raw, correct, pred, conf, png_path)

    csv_path = "{}.csv".format(png_path)
    self.print_cp(ts_preprocessed, cp_to_print, csv_path)
def mean_per_hour_in_a_day(in_path, server, mac):
    """Plot the mean loss fraction for each hour of the day.

    Samples of the ``"loss"`` series are grouped by ``ts.x[i].hour``. For
    every hour that has at least one sample, the mean and the variance are
    computed; the mean is plotted with a band of +/- the variance. Hours
    without samples are left out of the plot (they would otherwise yield
    NaN statistics). The number of samples per hour is printed.

    ``dt_start`` and ``dt_end`` are not parameters; they are read from the
    enclosing scope. The figure is saved under
    ``./plots/mean_per_hour_in_a_day/``.
    """
    ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end)

    # bucket the samples by hour of day
    hour_samples = [[] for _ in range(24)]
    for i in range(len(ts.x)):
        hour_samples[ts.x[i].hour].append(ts.y[i])
    hour_cnt = [len(samples) for samples in hour_samples]

    hour_list, mean_list, var_list = [], [], []
    for hour, samples in enumerate(hour_samples):
        # Check this hour's bucket, not the 24-slot outer list (which is
        # never empty): np.mean / np.var of an empty bucket is NaN.
        if len(samples) > 0:
            hour_list.append(hour)
            mean_list.append(np.mean(samples))
            var_list.append(np.var(samples))

    hour_list = np.asarray(hour_list)
    mean_list = np.asarray(mean_list)
    var_list = np.asarray(var_list)

    print("hour_cnt={}".format(hour_cnt))

    plt.clf()
    matplotlib.rcParams.update({'font.size': 30})
    plt.gcf().set_size_inches(16, 15)
    plt.grid()
    plt.ylabel("Mean Loss Fraction")
    plt.xlabel("Hour")
    plt.xticks(range(0, 24, 2), rotation=45)
    plt.plot(hour_list, mean_list, marker="o")
    plt.fill_between(hour_list, mean_list - var_list, mean_list + var_list,
                     alpha=0.3)
    plt.savefig("./plots/mean_per_hour_in_a_day/mean_per_hour_in_a_day_{}_{}."
                "png".format(server, mac))
def add_cp_ids():
    """Add a ``change_points_ids`` column to ``data_web_system.csv``.

    For every row, the change point datetimes in ``row["change_points"]``
    are converted to their positions in the row's ``"loss"`` series (via
    ``from_dt_to_id``) and stored as a comma separated string. Rows whose
    series is empty, or whose ``change_points`` field is ``''``, get an
    empty string. Nothing is done when the column already exists.
    """
    csv_path = "{}/data_web_system.csv".format(script_dir)
    df = pd.read_csv(csv_path)
    if "change_points_ids" in df:
        return

    cp_ids = []
    for _, row in df.iterrows():
        dt_start = dt_procedures.from_strdt_to_dt(row["dt_start"])
        dt_end = dt_procedures.from_strdt_to_dt(row["dt_end"])
        dt_dir = utils.get_dt_dir(dt_start, dt_end)
        in_path = "{}/input/{}/{}/{}.csv".format(base_dir, dt_dir,
                                                 row["server"], row["mac"])
        ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end)

        # no samples, or no change points annotated for this row
        if (not ts.x) or str(row["change_points"]) == "\'\'":
            cp_ids.append("")
            continue

        l_dt = [dt_procedures.from_js_strdt_to_dt(strdt)
                for strdt in row["change_points"].split(",")]
        l_id = from_dt_to_id(in_path, "loss", dt_start, dt_end, l_dt)
        cp_ids.append(",".join([str(cp_id) for cp_id in l_id]))

    df["change_points_ids"] = cp_ids
    df.to_csv("{}/data_web_system.csv".format(script_dir), index=False)
def acf(in_path, server, mac):
    """Plot the autocorrelation of a client's hourly loss series.

    ``ts.get_acf(84)`` supplies the lags, the ACF values and the pair
    counts; the counts are printed. ``dt_start`` and ``dt_end`` are read
    from the enclosing scope. The figure is saved as
    ``./plots/acf/acf_<server>_<mac>.png``.
    """
    hourly = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end,
                        ts_type="hourly")
    lags, acf_values, cnt_pairs = hourly.get_acf(84)
    print("cnt_pairs={}".format(cnt_pairs))

    plt.clf()
    matplotlib.rcParams.update({'font.size': 30})
    figure = plt.gcf()
    figure.set_size_inches(16, 15)
    plt.grid()
    plt.xticks(range(0, 85, 12), rotation=45)
    plt.xlabel("Lag (hours)")
    plt.ylabel("ACF")
    plt.plot(lags, acf_values, marker="o")

    out_path = "./plots/acf/acf_{}_{}.png".format(server, mac)
    plt.savefig(out_path)
def plot(dt_start, dt_end, metric):
    """Plot the median-filtered series of every client for one metric.

    For each (server, mac, in_path) yielded by ``utils.iter_server_mac`` the
    series is loaded twice: one copy stays unfiltered and only provides the
    default y label, the other goes through
    ``percentile_filter(win_len=5, p=0.5)`` and is plotted to
    ``<script_dir>/<str_dt>/<metric>/<out_file_name>.png``.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/{}".format(script_dir, str_dt),
        "{}/{}/{}".format(script_dir, str_dt, metric),
    ])

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        out_file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        out_path = "{}/{}/{}/{}.png".format(script_dir, str_dt, metric,
                                            out_file_name)

        # comparison between not filtered and filtered
        ts_raw = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median.percentile_filter(win_len=5, p=0.5)

        # NOTE: two disabled experiments lived here as commented-out code:
        # an STL decomposition of the filtered series (only when it has more
        # than 100 points), and a shared-x comparison of series filtered
        # with win_len=13, with and without cross_traffic_thresh=0.

        ylabel = plot_procedures.get_default_ylabel(ts_raw)
        plot_procedures.plot_ts(ts_median, out_path,
                                ylabel=ylabel,
                                compress=False,
                                title="median filtered")
def plot(self, server, mac, dt_start, dt_end, ts, hidden_state_path):
    """Plot a series together with its hidden state path.

    The state path is drawn as a scatter plot sharing the x axis with
    ``ts``; its y ticks are the state indices
    ``0 .. self.model.n_components - 1``. The figure is saved to
    ``<script_dir>/plots/<class name>/<out_file_name>.png``.
    """
    # deep copies keep the plotted series independent from the inputs
    ts_states = TimeSeries()
    ts_states.x = copy.deepcopy(ts.x)
    ts_states.y = copy.deepcopy(hidden_state_path)

    out_file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
    out_path = "{}/plots/{}/{}.png".format(script_dir,
                                           self.__class__.__name__,
                                           out_file_name)

    n_states = self.model.n_components
    plot_procedures.plot_ts_share_x(
        ts,
        ts_states,
        out_path,
        compress=True,
        title1="raw time series",
        title2="best hidden state path",
        plot_type2="scatter",
        yticks2=range(n_states),
        ylim2=[-0.5, n_states - 0.5])
def plot_latencies_traceroute(dt_start, dt_end, preprocess_args):
    """Plot the preprocessed per-hop latency of every valid traceroute.

    Rows of ``prints/<str_dt>/filtered/traceroute_per_mac.csv`` with a truthy
    ``valid_cnt_samples`` are processed. For each traceroute type whose
    validity field is truthy, the per-name latency series are built with
    ``get_ts_per_name`` and, for every hop but the last one, the series is
    copied, preprocessed and plotted under
    ``plots/paths/<str_dt>/latency/<type>/<server>/<path>/traceroute_latencies/<mac>/``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)
    csv_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        if not row["valid_cnt_samples"]:
            continue

        in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                    dt_end)
        ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                   dt_start=dt_start, dt_end=dt_end)

        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            valid_traceroute_field, traceroute_field = \
                cp_utils.get_traceroute_fields(traceroute_type)
            if not row[valid_traceroute_field]:
                continue

            traceroute = ast.literal_eval(row[traceroute_field])
            name_ts = get_ts_per_name(traceroute_type, ts_traceroute,
                                      dt_start, dt_end)

            # the directory tree mirrors the reversed hop sequence
            base_dir_path = "{}/plots/paths/{}/{}/{}/{}".format(
                script_dir, str_dt, "latency", traceroute_type,
                row["server"])
            hops_path = "/".join([str(hop) for hop in reversed(traceroute)])
            dir_path = "{}/{}".format(base_dir_path, hops_path)

            utils.create_dirs([
                "{}/traceroute_latencies/".format(dir_path),
                "{}/traceroute_latencies/{}".format(dir_path, row["mac"]),
            ])

            # every hop except the last one
            for i, hop in enumerate(traceroute[:-1]):
                name = hop[0][0]
                file_stem = "hop{}_{}".format(str(i).zfill(2), name)
                out_path = "{}/traceroute_latencies/{}/{}.png".format(
                    dir_path, row["mac"], file_stem)

                ts_preprocessed = name_ts[name].copy()
                cp_utils.preprocess(ts_preprocessed, preprocess_args)

                # NOTE: a raw-vs-filtered plot_ts_share_x call used to be
                # kept here as commented-out code.
                ts_preprocessed.metric = "latency"
                plot_procedures.plot_ts(ts_preprocessed, out_path,
                                        title="median filtered")
def print_cps(dt_start, dt_end, dir_model, metric, preprocess_args):
    """Summarise the detected change points of every client in one CSV.

    Every file whose name contains ``.csv`` in
    ``<base_dir>/change_point/models/<dir_model>/plots/unsupervised/<str_dt>/<metric>``
    is read (columns ``dt`` and ``dt_id``); server and mac are parsed from
    the file name. The client's series is loaded and preprocessed, the mean
    of each segment delimited by the change point ids is computed, and each
    pair of consecutive segment means is classified by ``update_type_cps``.

    One row ``server,mac,cp_dts,type_cps,seg_means`` per file is written to
    ``<script_dir>/prints/<str_dt>/filtered/<metric>/cps_per_mac.csv``.

    The last segment runs from the last change point to the end of the
    series, final sample included.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/prints/".format(script_dir),
                       "{}/prints/{}".format(script_dir, str_dt),
                       "{}/prints/{}/filtered".format(script_dir, str_dt),
                       "{}/prints/{}/filtered/{}".format(script_dir, str_dt,
                                                         metric)])

    out_path = "{}/prints/{}/filtered/{}/cps_per_mac.csv".format(script_dir,
                                                                 str_dt,
                                                                 metric)
    with open(out_path, "w") as f:
        f.write("server,mac,cp_dts,type_cps,seg_means\n")
        in_path_dir = ("{}/change_point/models/{}/plots/unsupervised/{}/{}".
                       format(base_dir, dir_model, str_dt, metric))

        cnt = 0
        for file_name in os.listdir(in_path_dir):
            if ".csv" not in file_name:
                continue

            cnt += 1
            print("cnt={}".format(cnt))

            # file names embed "server<...>_" and "mac<...>_" tokens
            server = file_name.split("server")[1].split("_")[0]
            mac = file_name.split("mac")[1].split("_")[0]

            dt_cps = []
            id_cps = []
            df = pd.read_csv("{}/{}".format(in_path_dir, file_name))
            for idx, row in df.iterrows():
                dt_cps.append(row["dt"])
                id_cps.append(row["dt_id"])

            in_path = utils.get_in_path(server, mac, dt_start, dt_end)
            ts = TimeSeries(in_path, metric, dt_start, dt_end)
            cp_utils.preprocess(ts, preprocess_args)

            seg_means = []
            type_cps = []
            if id_cps:
                # Segment boundaries: start of series, each change point,
                # end of series. Using len(ts.y) as the final bound keeps
                # the last sample in the last segment.
                bounds = [0] + id_cps + [len(ts.y)]
                for begin, end in zip(bounds[:-1], bounds[1:]):
                    seg_means.append(np.mean(ts.y[begin:end]))

                # one change point type per pair of neighbouring segments
                for mean1, mean2 in zip(seg_means[:-1], seg_means[1:]):
                    update_type_cps(type_cps, mean1, mean2, metric)

            f.write("{},{},\"{}\",\"{}\",\"{}\"\n".format(server, mac, dt_cps,
                                                          type_cps,
                                                          seg_means))
def write_all_samples_to_file(targets, in_dir):
    """Dump the loss samples of all targets into a single CSV column.

    ``targets`` is an iterable whose items hold the server at index 0 and the
    mac at index 1. Each client's series is read from
    ``<in_dir>/<server>/<mac>.csv`` (``dt_start`` / ``dt_end`` come from the
    enclosing scope) and its y values are concatenated, in order, into
    ``./plots/distribution/samples.csv`` under the header ``samples``.
    """
    samples = []
    for target in targets:
        server, mac = target[0], target[1]
        in_path = "{}/{}/{}.csv".format(in_dir, server, mac)
        ts = TimeSeries(in_path, "loss", dt_start=dt_start, dt_end=dt_end)
        samples.extend(ts.y)
        print("server={}, mac={}".format(server, mac))

    with open("./plots/distribution/samples.csv", "w") as f:
        f.write("samples\n")
        for sample in samples:
            f.write("{}\n".format(sample))
def plot_per_node(dt_start, dt_end, metric, only_unique_traceroute):
    """Plot raw vs. median-filtered series, grouped by node.

    Clients whose node (``mac_node[mac]``) is not in the valid node list are
    skipped; when ``only_unique_traceroute`` is truthy, so are clients whose
    mac is not returned by ``read_input.get_macs_traceroute_filter``. Each
    remaining client gets a shared-x figure (raw on top, series filtered
    with ``percentile_filter(win_len=13, p=0.5)`` as scatter) under
    ``<script_dir>/plots/nodes/<str_dt>/<metric>/<node>/``.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/plots/".format(script_dir),
        "{}/plots/nodes".format(script_dir),
        "{}/plots/nodes/{}".format(script_dir, str_dt),
        "{}/plots/nodes/{}/{}".format(script_dir, str_dt, metric),
    ])

    valid_nodes = read_input.get_valid_nodes()
    mac_node = read_input.get_mac_node()
    macs_unique_traceroute = read_input.get_macs_traceroute_filter(
        dt_start, dt_end, "filtered")

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        if only_unique_traceroute and (mac not in macs_unique_traceroute):
            continue
        if mac_node[mac] not in valid_nodes:
            continue

        node = mac_node[mac]
        utils.create_dirs([
            "{}/plots/nodes/{}/{}/{}".format(script_dir, str_dt, metric,
                                             node)
        ])
        out_file_name = utils.get_out_file_name(server, mac, dt_start,
                                                dt_end)
        out_path = "{}/plots/nodes/{}/{}/{}/{}.png".format(
            script_dir, str_dt, metric, node, out_file_name)

        ts_raw = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_median.percentile_filter(win_len=13, p=0.5)

        plot_procedures.plot_ts_share_x(ts_raw, ts_median, out_path,
                                        compress=False,
                                        plot_type2="scatter")
def from_dt_to_id(in_path, metric, dt_start, dt_end, l_dt):
    """Map each datetime in ``l_dt`` to its position in the series.

    The series is loaded from ``in_path`` for ``metric`` between ``dt_start``
    and ``dt_end``; positions are indices into its ``x`` list. When a
    datetime occurs more than once in ``x``, the last position wins.

    Raises:
        KeyError: if a datetime of ``l_dt`` is not present in the series.
    """
    ts = TimeSeries(in_path, metric, dt_start=dt_start, dt_end=dt_end)
    position_of = {dt: pos for pos, dt in enumerate(ts.x)}
    return [position_of[dt] for dt in l_dt]
def print_traceroute_per_mac(dt_start, dt_end):
    """Write four traceroute variants for every client to a CSV.

    For each (server, mac) yielded by ``utils.iter_server_mac`` the
    traceroute series is loaded and ``get_traceroute`` is called with four
    flag combinations; each call yields a validity flag and a traceroute.
    Rows go to
    ``<script_dir>/prints/<str_dt>/not_filtered/traceroute_per_mac.csv`` and
    the file is finally sorted by ``server`` and ``mac``.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)
    out_path = "{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)

    # get_traceroute flag triples, in the same order as the CSV columns:
    # compress_embratel, compress_embratel_without_last_hop_embratel,
    # without_embratel, plain.
    variant_flags = [
        (True, True, True),
        (True, True, False),
        (False, False, False),
        (True, False, False),
    ]
    row_fmt = "{},{}" + ",{},\"{}\"" * 4 + "\n"

    with open(out_path, "w") as f:
        f.write("server,mac,"
                "valid_traceroute_compress_embratel,"
                "traceroute_compress_embratel,"
                "valid_traceroute_compress_embratel_without_last_hop_embratel,"
                "traceroute_compress_embratel_without_last_hop_embratel,"
                "valid_traceroute_without_embratel,"
                "traceroute_without_embratel,"
                "valid_traceroute,"
                "traceroute\n")

        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                       dt_start=dt_start, dt_end=dt_end)

            fields = [server, mac]
            for flags in variant_flags:
                valid, traceroute = get_traceroute(ts_traceroute, *flags)
                fields.append(valid)
                fields.append(traceroute)

            f.write(row_fmt.format(*fields))

    utils.sort_csv_file(out_path, ["server", "mac"])
def sliding_window(ts):
    """Plot a series next to its sliding-window Hellinger distance.

    For every index ``i`` that has 100 samples on each side, the distance
    between ``ts.y[i - 100:i]`` and ``ts.y[i:i + 100]`` is computed with
    ``cmp_win.hellinger_dist`` over bins ``np.arange(0.02, 10.02, 0.02)``.
    Both series are drawn with a shared x axis into
    ``./sliding_window_toy_example.png``.
    """
    win_len = 100
    bins = np.arange(0.02, 10.02, 0.02)

    ts_dist = TimeSeries(compressed=True)
    for i in range(win_len, len(ts.y) - win_len + 1):
        left_win = ts.y[i - win_len:i]
        right_win = ts.y[i:i + win_len]
        ts_dist.x.append(ts.x[i])
        ts_dist.y.append(cmp_win.hellinger_dist(left_win, right_win,
                                                bins=bins))

    plot_procedures.plot_ts_share_x(ts, ts_dist,
                                    "./sliding_window_toy_example.png",
                                    compress=True,
                                    plot_type1="plot",
                                    ylim1=[0, max(ts.y)],
                                    ylabel1="$y_{i}$",
                                    ylabel2="$H_{i}$",
                                    xlabel="$i$")
def plot_per_name(dt_start, dt_end, metric, preprocess_args, plot_cps=True):
    """Plot each client's preprocessed series under every hop name it uses.

    For every traceroute type, rows of
    ``prints/<str_dt>/filtered/traceroute_per_mac.csv`` with truthy
    ``valid_cnt_samples`` and validity field are expanded into hop names
    (``cp_utils.iter_names_traceroute_filtered``). One figure per
    (type, server, name, client) is produced under ``plots/names/...``. A
    client is rendered only once: later occurrences copy the first figure.
    The change points from ``unsupervised_utils.get_client_cps`` are passed
    as vertical lines.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/plots/".format(script_dir),
        "{}/plots/names".format(script_dir),
        "{}/plots/names/{}".format(script_dir, str_dt),
        "{}/plots/names/{}/{}".format(script_dir, str_dt, metric),
    ])

    client_cps = unsupervised_utils.get_client_cps(plot_cps, str_dt, metric)

    # client -> path of the figure already rendered for it (avoids
    # replotting)
    client_plotPath = {}

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        utils.create_dirs([
            "{}/plots/names/{}/{}/{}".format(script_dir, str_dt, metric,
                                             traceroute_type)
        ])

        df = pd.read_csv("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))

        cnt = 0
        for _, row in df.iterrows():
            if not (row["valid_cnt_samples"] and row[valid_traceroute_field]):
                continue

            print("cnt={}, traceroute_type={}, str_dt={}".format(
                cnt, traceroute_type, str_dt))
            cnt += 1

            client = utils.get_client(row["server"], row["mac"])
            hops = ast.literal_eval(row[traceroute_field])

            for name in cp_utils.iter_names_traceroute_filtered(hops):
                utils.create_dirs([
                    "{}/plots/names/{}/{}/{}/{}".format(
                        script_dir, str_dt, metric, traceroute_type,
                        row["server"]),
                    "{}/plots/names/{}/{}/{}/{}/{}".format(
                        script_dir, str_dt, metric, traceroute_type,
                        row["server"], name),
                ])

                out_file_name = utils.get_out_file_name(
                    row["server"], row["mac"], dt_start, dt_end)
                out_path = "{}/plots/names/{}/{}/{}/{}/{}/{}.png".format(
                    script_dir, str_dt, metric, traceroute_type,
                    row["server"], name, out_file_name)

                # already rendered for this client: reuse the image
                if client in client_plotPath:
                    shutil.copyfile(client_plotPath[client], out_path)
                    continue

                client_plotPath[client] = out_path
                cp_dts = client_cps[client]
                in_path = "{}/input/{}/{}/{}.csv".format(
                    base_dir, dt_dir, row["server"], row["mac"])

                ts = TimeSeries(in_path, metric, dt_start, dt_end)
                cp_utils.preprocess(ts, preprocess_args)
                plot_procedures.plot_ts(ts, out_path,
                                        dt_axvline=cp_dts,
                                        title="median filtered")
def process():
    """Gather change point statistics for the rows classified by one user.

    Reads ``./classifications.csv`` (``;`` separated) and, for every row
    whose ``email`` equals ``target_email`` (a name from the enclosing
    scope), loads the row's loss series, locates the annotated change points
    in it, accumulates segment statistics and plots the row.

    NOTE(review): none of the accumulator lists / counters filled below is
    read or returned inside this function as shown -- confirm whether a
    reporting step is missing.
    """
    create_dirs(target_email)

    # accumulators over all processed rows
    cnt_points = 0
    cps_per_ts_samples = []
    middle_seg_len_samples = []
    first_seg_len_samples = []
    last_seg_len_samples = []
    abs_mean_diff_consecutive_segs = []
    hellinger_dist_consecutive_segs = []

    cnt_rows = 0
    df = pd.read_csv("./classifications.csv", sep=";")
    for idx, row in df.iterrows():
        if row["email"] == target_email:
            cnt_rows += 1
            print "cnt_rows={}".format(cnt_rows)
            # print "row={}".format(row)

            # get change points list
            # ("''" is the marker of a row without change points)
            dt_cp_list = []
            if row["change_points"] != "''":
                for strdt in row["change_points"].split(","):
                    dt = dt_procedures.from_js_strdt_to_dt(strdt)
                    dt_cp_list.append(dt)
            dt_cp_list.sort()

            dt_start = dt_procedures.from_js_strdate_to_dt(row["date_start"])
            dt_end = dt_procedures.from_js_strdate_to_dt(row["date_end"])
            # input files live in a <year>_<zero padded month> directory
            in_path = ("../../../input/{}_{}/{}/{}.csv"
                       "".format(dt_start.year, str(dt_start.month).zfill(2),
                                 row["server"], row["mac"]))

            ts = TimeSeries(in_path=in_path, metric="loss", dt_start=dt_start,
                            dt_end=dt_end)
            cnt_points += len(ts.y)
            cps_per_ts_samples.append(len(dt_cp_list))

            # get id_cp_list, first_seg_len, middle_seg_len, last_seg_len
            # Single forward scan: i walks the series, j walks the sorted
            # change points; j only advances when ts.x[i] matches it.
            id_cp_list = []
            i, j, last_cp_i = 0, 0, -1
            while (i < len(ts.x)) and (j < len(dt_cp_list)):
                if ts.x[i] == dt_cp_list[j]:
                    id_cp_list.append(i)
                    if j == 0:
                        # points up to and including the first change point
                        first_seg_len_samples.append(i + 1)
                    if j == len(dt_cp_list) - 1:
                        # points strictly after the last change point
                        last_seg_len_samples.append(len(ts.x) - i - 1)
                    if last_cp_i != -1:
                        # index distance to the previous change point
                        middle_seg_len_samples.append(i - last_cp_i)
                    last_cp_i = i
                    j += 1
                i += 1
            # some change point was not found among the series timestamps
            if (j != len(dt_cp_list)):
                print "ERROR"

            # get abs_mean_diff_consecutive_segs and
            # hellinger_dist_consecutive_segments
            # segs_list holds inclusive [first_index, last_index] pairs.
            # Change points sitting on the first or last sample are skipped.
            # NOTE(review): the trailing segment is only appended while
            # handling the final change point, so it is dropped when that
            # change point is skipped -- confirm this is intended.
            segs_list = []
            last_id_cp = -1
            for i in xrange(len(id_cp_list)):
                if (id_cp_list[i] == 0) or (id_cp_list[i] == len(ts.y) - 1):
                    continue
                segs_list.append([last_id_cp + 1, id_cp_list[i]])
                if i == len(id_cp_list) - 1:
                    segs_list.append([id_cp_list[i] + 1, len(ts.y) - 1])
                last_id_cp = id_cp_list[i]

            # compare every pair of neighbouring segments
            for i in xrange(1, len(segs_list)):
                l1 = ts.y[segs_list[i - 1][0]:segs_list[i - 1][1] + 1]
                l2 = ts.y[segs_list[i][0]:segs_list[i][1] + 1]
                mean1 = np.mean(l1)
                mean2 = np.mean(l2)
                abs_mean_diff_consecutive_segs.append(abs(mean1 - mean2))
                bins = np.arange(0.0, 1.02, 0.02)
                hellinger_dist_consecutive_segs.append(
                    hellinger_dist(l1, l2, bins))

            plot(row, dt_cp_list, ts, dt_start, dt_end)
def print_empty_segs(dt_start, dt_end, metric, min_seg_len, filtered,
                     plot=False):
    """List, per client, the time gaps that ``is_empty_seg`` flags as empty.

    Only macs returned by ``read_input.get_macs_traceroute_filter`` are
    processed. For series with at least two points, three kinds of gaps are
    tested with ``is_empty_seg(begin, end, min_seg_len)``: from ``dt_start``
    to the first point, between consecutive points, and from the last point
    to ``dt_end``. Each flagged gap is recorded as ``[str(begin), str(end)]``.

    One row ``server,mac,empty_segs`` per client is written to
    ``<script_dir>/prints/<str_dt>/<filtered>/<metric>/empty_segs_per_mac.csv``.
    When ``plot`` is truthy the series is also plotted with vertical lines
    at the measurement points that delimit the flagged gaps.
    """
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs([
        "{}/prints/".format(script_dir),
        "{}/prints/{}".format(script_dir, str_dt),
        "{}/prints/{}/{}".format(script_dir, str_dt, filtered),
        "{}/prints/{}/{}/{}".format(script_dir, str_dt, filtered, metric)
    ])

    out_path = "{}/prints/{}/{}/{}/empty_segs_per_mac.csv".format(
        script_dir, str_dt, filtered, metric)
    with open(out_path, "w") as f:
        f.write("server,mac,empty_segs\n")
        target_macs = read_input.get_macs_traceroute_filter(
            dt_start, dt_end, filtered)

        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            if mac not in target_macs:
                continue

            ts = TimeSeries(in_path=in_path, metric=metric,
                            dt_start=dt_start, dt_end=dt_end)

            axvline_dts = []
            empty_segs = []
            if len(ts.x) >= 2:
                # gap before the first measurement
                if is_empty_seg(dt_start, ts.x[0], min_seg_len):
                    axvline_dts.append(ts.x[0])
                    empty_segs.append([str(dt_start), str(ts.x[0])])

                # gaps between consecutive measurements
                for i in range(1, len(ts.x)):
                    if is_empty_seg(ts.x[i - 1], ts.x[i], min_seg_len):
                        axvline_dts.append(ts.x[i - 1])
                        axvline_dts.append(ts.x[i])
                        empty_segs.append([str(ts.x[i - 1]), str(ts.x[i])])

                # gap after the last measurement: mark the last point itself,
                # which is where this gap starts
                if is_empty_seg(ts.x[-1], dt_end, min_seg_len):
                    axvline_dts.append(ts.x[-1])
                    empty_segs.append([str(ts.x[-1]), str(dt_end)])

            f.write("{},{},\"{}\"\n".format(server, mac, empty_segs))

            if plot:
                utils.create_dirs([
                    "{}/plots/".format(script_dir),
                    "{}/plots/empty_segs".format(script_dir),
                    "{}/plots/empty_segs/{}".format(script_dir, str_dt),
                    "{}/plots/empty_segs/{}/{}".format(script_dir, str_dt,
                                                       metric)
                ])
                out_file_name = utils.get_out_file_name(
                    server, mac, dt_start, dt_end)
                # separate name: out_path is the CSV being written
                plot_path = "{}/plots/empty_segs/{}/{}/{}.png".format(
                    script_dir, str_dt, metric, out_file_name)
                plot_procedures.plot_ts(ts, plot_path,
                                        dt_axvline=axvline_dts)
def print_traceroute_per_mac_filtered(dt_start, dt_end,
                                      min_fraction_of_samples=0.7):
    """Write the filtered version of ``traceroute_per_mac.csv``.

    Every row of ``prints/<str_dt>/not_filtered/traceroute_per_mac.csv`` has
    its four traceroute variants passed through ``get_traceroute_filtered``.
    A ``valid_cnt_samples`` flag is added: it is False when the number of
    timestamps of the client divided by ``days * 24 * 2`` is below
    ``min_fraction_of_samples``. The result is written to
    ``prints/<str_dt>/filtered/traceroute_per_mac.csv``.
    """
    str_dt = utils.get_str_dt(dt_start, dt_end)
    out_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)

    # (validity column, traceroute column) of the input, in output order
    variant_columns = [
        ("valid_traceroute_compress_embratel",
         "traceroute_compress_embratel"),
        ("valid_traceroute_compress_embratel_without_last_hop_embratel",
         "traceroute_compress_embratel_without_last_hop_embratel"),
        ("valid_traceroute_without_embratel",
         "traceroute_without_embratel"),
        ("valid_traceroute",
         "traceroute"),
    ]
    row_fmt = "{},{},{}" + ",{},\"{}\"" * 4 + "\n"

    with open(out_path, "w") as f:
        header = ("server,mac,valid_cnt_samples,"
                  "valid_traceroute_compress_embratel,"
                  "traceroute_compress_embratel_filter,"
                  "valid_traceroute_compress_embratel_without_last_hop_embratel,"
                  "traceroute_compress_embratel_without_last_hop_embratel_filter,"
                  "valid_traceroute_without_embratel,"
                  "traceroute_without_embratel_filter,"
                  "valid_traceroute,"
                  "traceroute_filter\n")
        f.write(header)

        in_path = "{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt)
        df = pd.read_csv(in_path)

        for cnt, (idx, row) in enumerate(df.iterrows()):
            print(
                "print_traceroute_per_mac_filtered, str_dt={}, cnt={}".format(
                    str_dt, cnt))

            filtered_traceroutes = []
            for valid_col, traceroute_col in variant_columns:
                filtered_traceroutes.append(
                    get_traceroute_filtered(row[valid_col],
                                            row[traceroute_col],
                                            row["server"]))

            # Check if the client has the minimum number of samples. Since
            # the metric is not specified at this point, only the presence
            # of the measurement timestamp is checked.
            in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                        dt_end)
            ts = TimeSeries(in_path=in_path, metric="dt", dt_start=dt_start,
                            dt_end=dt_end)
            delta_days = (dt_end - dt_start).days
            # denominator: two samples per hour over the whole period
            fraction_of_samples = (float(len(ts.y)) /
                                   (delta_days * 24.0 * 2.0))
            valid_cnt_samples = not (
                fraction_of_samples < min_fraction_of_samples)

            fields = [row["server"], row["mac"], valid_cnt_samples]
            for (valid_col, _), filtered in zip(variant_columns,
                                                filtered_traceroutes):
                fields.append(row[valid_col])
                fields.append(filtered)

            f.write(row_fmt.format(*fields))
def get_ts(row, preprocess_args, metric):
    """Load the series described by ``row`` and preprocess it.

    The input path and the period bounds come from
    ``unpack_pandas_row(row)``; ``preprocess`` is then applied with
    ``preprocess_args``.

    Returns:
        the preprocessed ``TimeSeries`` for ``metric``.
    """
    in_path, dt_start, dt_end = unpack_pandas_row(row)
    loaded = TimeSeries(in_path, metric, dt_start, dt_end)
    preprocess(loaded, preprocess_args)
    return loaded