def print_names_per_mac(dt_start, dt_end, mac_node):
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/prints/{}/not_filtered/names_per_mac.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        f.write("server,node,mac,names\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts = TimeSeries(in_path=in_path, metric="traceroute",
                            dt_start=dt_start, dt_end=dt_end)
            names = set()
            for traceroute in ts.y:
                if traceroute:
                    ip_name = get_ip_name(traceroute)
                    for hop in traceroute:
                        for name in hop["names"]:
                            names.add(get_name(name, ip_name))
            node = mac_node.get(mac)
            f.write("{},{},{},\"{}\"\n".format(server, node, mac,
                                               sorted(list(names))))
    utils.sort_csv_file(out_path, ["server", "node"])

def aggregate_first_hop_not_zero_indegree_vertex(first_hops, g, metric, server,
                                                 dt_start, dt_end,
                                                 traceroute_type):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/plots/paths/{}/{}/{}/{}/"
                "problem_location_first_hop_not_zero_indegree_vertex.csv".
                format(script_dir, str_dt, metric, traceroute_type, server))
    with open(out_path, "w") as f:
        f.write("cp_dt_start,cp_dt_end,cp_type,fraction_of_clients,"
                "cnt_clients,clients,problem_location\n")
        for first_hop in first_hops:
            _, dir_path = get_path(g, str(first_hop), str_dt, traceroute_type,
                                   server)
            in_path = ("{}/problem_location_first_hop_not_zero_indegree_vertex"
                       ".csv".format(dir_path))
            if os.path.exists(in_path):
                df = pd.read_csv(in_path)
                for idx, row in df.iterrows():
                    l_format = "{},{},{},{},{},\"{}\",\"{}\"\n"
                    f.write(l_format.format(row["cp_dt_start"],
                                            row["cp_dt_end"],
                                            row["cp_type"],
                                            row["fraction_of_clients"],
                                            row["cnt_clients"],
                                            row["clients"],
                                            row["problem_location"]))

    out_path_name = "{}/plots/names/{}/{}/{}/{}".format(script_dir, str_dt,
                                                        metric,
                                                        traceroute_type,
                                                        server)
    shutil.copy(out_path, out_path_name)

def print_macs_per_name(dt_start, dt_end, mac_node):
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_macs = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            if traceroute:
                ip_name = get_ip_name(traceroute)
                for hop in traceroute:
                    for name in hop["names"]:
                        name = get_name(name, ip_name)
                        if name not in name_macs:
                            name_macs[name] = set()
                        name_macs[name].add((server, mac_node.get(mac), mac))

    out_path = ("{}/prints/{}/not_filtered/macs_per_name.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        f.write("name,macs\n")
        names = sorted(name_macs.keys())
        for name in names:
            f.write("{},\"{}\"\n".format(name, sorted(list(name_macs[name]))))

def get_graph(dt_start, dt_end, valid_traceroute_field, traceroute_field,
              server=None):
    str_dt = utils.get_str_dt(dt_start, dt_end)
    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)

    names = set()
    name_neigh = {}
    df = pd.read_csv(in_path)
    if server:
        df = df[df["server"] == server]
    for idx, row in df.iterrows():
        if row["valid_cnt_samples"] and row[valid_traceroute_field]:
            traceroute = ast.literal_eval(row[traceroute_field])
            last_name = None
            for name in traceroute:
                names.add(name)
                if name not in name_neigh:
                    name_neigh[name] = set()
                if last_name:
                    name_neigh[last_name].add(name)
                last_name = name
    for name in names:
        if name not in name_neigh:
            name_neigh[name] = set()

    return name_neigh

def aggregate_servers_correlations(dt_start, dt_end, metric, servers):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/prints/{}/filtered/{}/"
                "problem_location_zero_indegree_vertexes_correlation.csv".
                format(script_dir, str_dt, metric))
    with open(out_path, "w") as f:
        f.write("server,traceroute_type,cp_dt_start,cp_dt_end,cp_type,"
                "cnt_vertexes_with_zero_indegree,suffix_match,"
                "vertexes_with_zero_indegree\n")
        for server in servers:
            for traceroute_type in unsupervised_utils.iter_traceroute_types():
                if valid_graph(dt_start, dt_end, server, traceroute_type):
                    in_path = ("{}/plots/names/{}/{}/{}/{}/"
                               "problem_location_zero_indegree_vertexes_"
                               "correlation.csv".
                               format(script_dir, str_dt, metric,
                                      traceroute_type, server))
                    df = pd.read_csv(in_path)
                    for idx, row in df.iterrows():
                        f.write("{},{},{},{},{},{},\"{}\",\"{}\"\n".
                                format(server,
                                       row["traceroute_type"],
                                       row["cp_dt_start"],
                                       row["cp_dt_end"],
                                       row["cp_type"],
                                       row["cnt_vertexes_with_zero_indegree"],
                                       row["suffix_match"],
                                       row["vertexes_with_zero_indegree"]))
                    break

    utils.sort_csv_file(out_path,
                        ["cnt_vertexes_with_zero_indegree", "server"],
                        ascending=[False, True])

def print_name_ips(dt_start, dt_end):
    # TODO: Probably deprecated
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    name_ip = {}
    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        ts = TimeSeries(in_path=in_path, metric="traceroute",
                        dt_start=dt_start, dt_end=dt_end)
        for traceroute in ts.y:
            if traceroute:
                for hop in traceroute:
                    for name, ip in izip(hop["names"], hop["ips"]):
                        if name not in name_ip:
                            name_ip[name] = set()
                        name_ip[name].add(ip)

    out_path = "{}/prints/{}/not_filtered/name_ips.csv".format(script_dir,
                                                               str_dt)
    with open(out_path, "w") as f:
        f.write("name,ips\n")
        for name in sorted(name_ip.keys()):
            f.write("{},{}\n".format(name, sorted(list(name_ip[name]))))

def aggregate_servers_first_hop_not_zero_indegree_vertex(dt_start, dt_end,
                                                          metric, servers):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/prints/{}/filtered/{}/"
                "problem_location_first_hop_not_zero_indegree_vertex.csv".
                format(script_dir, str_dt, metric))
    with open(out_path, "w") as f:
        f.write("server,cp_dt_start,cp_dt_end,cp_type,fraction_of_clients,"
                "cnt_clients,clients,problem_location\n")
        for server in servers:
            for traceroute_type in unsupervised_utils.iter_traceroute_types():
                if valid_graph(dt_start, dt_end, server, traceroute_type):
                    in_path = ("{}/plots/paths/{}/{}/{}/{}/"
                               "problem_location_first_hop_not_zero_indegree_"
                               "vertex.csv".
                               format(script_dir, str_dt, metric,
                                      traceroute_type, server))
                    df = pd.read_csv(in_path)
                    for idx, row in df.iterrows():
                        l_format = "{},{},{},{},{},{},\"{}\",\"{}\"\n"
                        f.write(l_format.format(server,
                                                row["cp_dt_start"],
                                                row["cp_dt_end"],
                                                row["cp_type"],
                                                row["fraction_of_clients"],
                                                row["cnt_clients"],
                                                row["clients"],
                                                row["problem_location"]))
                    break

def process_graphs(dt_start, dt_end):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_dir = "{}/prints/{}/filtered/graph/".format(script_dir, str_dt)
    utils.create_dirs([out_dir])

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt)
    servers = np.unique(pd.read_csv(in_path)["server"].values)

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        for server in servers:
            utils.create_dirs([
                "{}/prints/{}/filtered/graph/".format(script_dir, str_dt),
                "{}/prints/{}/filtered/graph/{}".format(script_dir, str_dt,
                                                        server)])

            out_dir = "{}/prints/{}/filtered/graph/{}".format(script_dir,
                                                              str_dt, server)
            out_path = "{}/{}_graph.gv".format(out_dir, traceroute_field)
            name_neigh = get_graph(dt_start, dt_end, valid_traceroute_field,
                                   traceroute_field, server)
            write_graph(out_path, name_neigh)
            check_graph(out_dir, name_neigh, traceroute_field)

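# NOTE: write_graph and check_graph are defined elsewhere in this codebase.
# The helper below is only a minimal, hypothetical sketch of what write_graph
# might emit, inferred from the .gv format that read_graph (further below)
# parses: a one-line header, one "u" -> "v" edge per line, and a closing
# brace. The real implementation may differ.
def write_graph_sketch(out_path, name_neigh):
    with open(out_path, "w") as f:
        f.write("digraph {\n")
        for u in sorted(name_neigh.keys()):
            for v in sorted(name_neigh[u]):
                f.write("    \"{}\" -> \"{}\"\n".format(u, v))
        f.write("}\n")
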
def print_macs_per_name_filtered(dt_start, dt_end, mac_node):
    # TODO: Probably deprecated
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = ("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt))
    name_macs = {}
    df = pd.read_csv(in_path)
    for idx, row in df.iterrows():
        traceroute_filtered = ast.literal_eval(row["traceroute_filtered"])
        for elem in traceroute_filtered:
            name0 = elem[0][0]
            name1 = elem[1][0]
            tp = (name0, name1)
            if tp not in name_macs:
                name_macs[tp] = set()
            name_macs[tp].add((row["server"], mac_node.get(row["mac"]),
                               row["mac"]))

    out_path = ("{}/prints/{}/filtered/macs_per_name.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        f.write("name,macs\n")
        names = sorted(name_macs.keys())
        for name in names:
            f.write("\"{}\",\"{}\"\n".format(name,
                                             sorted(list(name_macs[name]))))

def valid_graph(dt_start, dt_end, server, traceroute_type):
    str_dt = utils.get_str_dt(dt_start, dt_end)
    in_path = ("{}/prints/{}/filtered/graph/{}/{}_filter_graph_stats.txt".
               format(script_dir, str_dt, server, traceroute_type))
    with open(in_path) as f:
        for line in f:
            if "valid_graph=" in line:
                return (line.split("=")[-1].rstrip("\n") == "True")

def write_csvs(dt_dir, dt_start, dt_end, cursor, collection):
    for cnt, doc in enumerate(cursor):
        print "{}, {}".format(cnt, utils.get_str_dt(dt_start, dt_end))

        if valid_doc(doc):
            mac = doc["_id"]["mac"]
            server = doc["host"]

            utils.create_dirs(["{}/{}".format(script_dir, dt_dir),
                               "{}/{}/{}/".format(script_dir, dt_dir, server)])
            out_path = "{}/{}/{}/{}.csv".format(script_dir, dt_dir, server,
                                                mac)
            if not os.path.exists(out_path):
                with open(out_path, "w") as f:
                    l = ("dt,uf,server_ip,loss,latency,throughput_up,"
                         "throughput_down,nominal_up,nominal_down,"
                         "loss_cross_traffic_up,loss_cross_traffic_down,"
                         "latency_cross_traffic_up,latency_cross_traffic_down,"
                         "throughput_up_cross_traffic_up,"
                         "throughput_up_cross_traffic_down,"
                         "throughput_down_cross_traffic_up,"
                         "throughput_down_cross_traffic_down,"
                         "traceroute\n")
                    f.write(l)

            uf = get_uf(doc)
            server = doc["host"]
            dt = dt_procedures.from_utc_to_sp(doc["_id"]["date"])
            server_ip = get_server_ip(doc)
            (loss, loss_cross_traffic_up, loss_cross_traffic_down) = \
                get_loss(doc)
            (latency, latency_cross_traffic_up,
             latency_cross_traffic_down) = get_latency(doc)
            (throughput_up, nominal_up, throughput_up_cross_traffic_up,
             throughput_up_cross_traffic_down) = get_throughput_up(doc)
            (throughput_down, nominal_down, throughput_down_cross_traffic_up,
             throughput_down_cross_traffic_down) = get_throughput_down(doc)
            traceroute = get_traceroute(doc)

            l = "{}" + ",{}" * 16 + ",\"{}\"\n"
            l = l.format(dt, uf, server_ip, loss, latency, throughput_up,
                         throughput_down, nominal_up, nominal_down,
                         loss_cross_traffic_up, loss_cross_traffic_down,
                         latency_cross_traffic_up, latency_cross_traffic_down,
                         throughput_up_cross_traffic_up,
                         throughput_up_cross_traffic_down,
                         throughput_down_cross_traffic_up,
                         throughput_down_cross_traffic_down,
                         traceroute)
            with open(out_path, "a") as f:
                f.write(l)

def voting(dt_start, dt_end, metric, in_dir, eps_hours):
    """
    For now, assume that the change points of a single time series are more
    than eps_hours apart.
    """

    str_dt = utils.get_str_dt(dt_start, dt_end)
    in_dir = "{}/plots/{}/{}/{}".format(script_dir, in_dir, str_dt, metric)

    for dir_path, _, file_names in os.walk(in_dir):
        if "cps_per_mac.csv" in file_names:
            with open("{}/match_cps.csv".format(dir_path), "w") as f:
                f.write("cp_dt_start,cp_dt_end,cp_type,fraction_of_clients,"
                        "cnt_clients,clients\n")

                df = pd.read_csv("{}/cps_per_mac.csv".format(dir_path))
                cnt_clients = df.shape[0]
                for cp_type in cp_utils.iter_cp_types():
                    l = []
                    for idx, row in df.iterrows():
                        cp_dts = []
                        cp_dts_aux = map(dt_procedures.from_strdt_to_dt,
                                         ast.literal_eval(row["cp_dts"]))
                        cp_types_aux = ast.literal_eval(row["type_cps"])
                        for i_cp_dt, i_cp_type in izip(cp_dts_aux,
                                                       cp_types_aux):
                            if i_cp_type == cp_type:
                                cp_dts.append(i_cp_dt)

                        l = l + map(lambda dt: {"dt": dt,
                                                "mac": row["mac"],
                                                "server": row["server"]},
                                    cp_dts)
                    l.sort(key=itemgetter("dt"))

                    votes = \
                        unsupervised_utils.multiple_inexact_voting(l,
                                                                   eps_hours)
                    for vote in votes:
                        clients = map(lambda dic: {"mac": dic["mac"],
                                                   "server": dic["server"]},
                                      vote["interval"])
                        l_dt = vote["l_dt"]
                        r_dt = vote["r_dt"]
                        fraction_of_clients = \
                            float(len(clients)) / cnt_clients
                        f.write("{},{},{},{},{},\"{}\"\n".format(
                            l_dt, r_dt, cp_type, fraction_of_clients,
                            len(clients), clients))

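# NOTE: unsupervised_utils.multiple_inexact_voting is implemented elsewhere.
# The helper below is only a minimal sketch of the behaviour assumed by
# voting() above: given a list of dicts sorted by "dt", group points that
# fall within an eps_hours window and report each group's bounds and members
# ("l_dt", "r_dt", "interval"). The real implementation may differ.
def multiple_inexact_voting_sketch(l, eps_hours):
    from datetime import timedelta

    eps = timedelta(hours=eps_hours)
    votes = []
    i = 0
    while i < len(l):
        # grow the group while the next point is within eps of the first one
        j = i
        while (j + 1 < len(l)) and (l[j + 1]["dt"] - l[i]["dt"] <= eps):
            j += 1
        votes.append({"l_dt": l[i]["dt"],
                      "r_dt": l[j]["dt"],
                      "interval": l[i:j + 1]})
        i = j + 1
    return votes
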
def plot_latencies_traceroute(dt_start, dt_end, preprocess_args):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(script_dir,
                                                                    str_dt)
    df = pd.read_csv(in_path)
    for _, row in df.iterrows():
        if row["valid_cnt_samples"]:
            in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                        dt_end)
            ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                       dt_start=dt_start, dt_end=dt_end)

            for traceroute_type in \
                    unsupervised_utils.iter_traceroute_types():
                valid_traceroute_field, traceroute_field = \
                    cp_utils.get_traceroute_fields(traceroute_type)

                if row[valid_traceroute_field]:
                    traceroute = ast.literal_eval(row[traceroute_field])
                    name_ts = get_ts_per_name(traceroute_type, ts_traceroute,
                                              dt_start, dt_end)

                    dir_path = ("{}/plots/paths/{}/{}/{}/{}".
                                format(script_dir, str_dt, "latency",
                                       traceroute_type, row["server"]))
                    traceroute_path = "/".join(
                        map(str, list(reversed(traceroute))))
                    dir_path = "{}/{}".format(dir_path, traceroute_path)

                    utils.create_dirs(["{}/traceroute_latencies/".
                                       format(dir_path),
                                       "{}/traceroute_latencies/{}".
                                       format(dir_path, row["mac"])])

                    for i in range(len(traceroute) - 1):
                        name = traceroute[i][0][0]
                        traceroute_path = "hop{}_{}".format(str(i).zfill(2),
                                                            name)
                        out_path = ("{}/traceroute_latencies/{}/{}.png".
                                    format(dir_path, row["mac"],
                                           traceroute_path))

                        ts_preprocessed = name_ts[name].copy()
                        cp_utils.preprocess(ts_preprocessed, preprocess_args)

                        # plot_procedures.plot_ts_share_x(
                        #     name_ts[name],
                        #     ts_preprocessed,
                        #     out_path,
                        #     plot_type2="scatter",
                        #     title1="raw",
                        #     title2="median filtered",
                        #     default_ylabel=True)

                        ts_preprocessed.metric = "latency"
                        plot_procedures.plot_ts(ts_preprocessed, out_path,
                                                title="median filtered")

def analyse_first_hop(g, u, is_zero_indegree, metric, server, dt_start, dt_end,
                      traceroute_type, eps_hours, min_fraction_of_clients):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    path, dir_path = get_path(g, u, str_dt, traceroute_type, server)

    if is_zero_indegree:
        out_path = "{}/problem_location.csv".format(dir_path)
    else:
        out_path = ("{}/problem_location_first_hop_not_zero_indegree_vertex"
                    ".csv".format(dir_path))
    with open(out_path, "w") as f:
        f.write("cp_dt_start,cp_dt_end,cp_type,fraction_of_clients,"
                "cnt_clients,clients,problem_location\n")

        in_path = "{}/match_cps.csv".format(dir_path)
        df = pd.read_csv(in_path)
        for idx, row in df.iterrows():
            if row["fraction_of_clients"] >= min_fraction_of_clients:
                cp_dt_start = dt_procedures.from_strdt_to_dt(
                    row["cp_dt_start"])
                cp_dt_end = dt_procedures.from_strdt_to_dt(row["cp_dt_end"])

                if is_zero_indegree:
                    problem_location = \
                        map(ast.literal_eval,
                            analyse_path(path, cp_dt_start, cp_dt_end,
                                         row["cp_type"], str_dt, metric,
                                         traceroute_type, server, eps_hours,
                                         min_fraction_of_clients))
                else:
                    problem_location = ("already_analysed_during_zero_"
                                        "indegree_vertexes_analysis")
            else:
                problem_location = ["before"]

            l_format = "{},{},{},{},{},\"{}\",\"{}\"\n"
            f.write(l_format.format(row["cp_dt_start"], row["cp_dt_end"],
                                    row["cp_type"],
                                    row["fraction_of_clients"],
                                    row["cnt_clients"], row["clients"],
                                    problem_location))

    out_path_name = "{}/plots/names/{}/{}/{}/{}/{}".format(script_dir, str_dt,
                                                           metric,
                                                           traceroute_type,
                                                           server, path[0])
    shutil.copy(out_path, out_path_name)

def read_graph(dt_start, dt_end, server, traceroute_type):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = "{}/prints/{}/filtered/graph/{}/{}_filter_graph.gv".format(
        script_dir, str_dt, server, traceroute_type)
    g = defaultdict(list)
    with open(in_path) as f:
        for line in f.readlines()[1:-1]:
            u = line.split(" -> ")[0].lstrip(" ").lstrip("\"").rstrip("\"")
            v = line.split(" -> ")[1].rstrip("\n").lstrip("\"").rstrip("\"")
            g[u].append(v)
    return g

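# NOTE: get_indegree is implemented elsewhere. The helper below is only a
# minimal sketch of the behaviour assumed by the localization code below: for
# the adjacency dict returned by read_graph, count how many edges point to
# each vertex. The real implementation may differ.
def get_indegree_sketch(g):
    indegree = dict((u, 0) for u in g)
    for u in g:
        for v in g[u]:
            indegree[v] = indegree.get(v, 0) + 1
    return indegree
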
def print_cps(dt_start, dt_end, dir_model, metric, preprocess_args):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/prints/".format(script_dir),
                       "{}/prints/{}".format(script_dir, str_dt),
                       "{}/prints/{}/filtered".format(script_dir, str_dt),
                       "{}/prints/{}/filtered/{}".format(script_dir, str_dt,
                                                         metric)])

    out_path = "{}/prints/{}/filtered/{}/cps_per_mac.csv".format(script_dir,
                                                                 str_dt,
                                                                 metric)
    with open(out_path, "w") as f:
        f.write("server,mac,cp_dts,type_cps,seg_means\n")

        in_path_dir = ("{}/change_point/models/{}/plots/unsupervised/{}/{}".
                       format(base_dir, dir_model, str_dt, metric))
        cnt = 0
        for file_name in os.listdir(in_path_dir):
            if ".csv" in file_name:
                cnt += 1
                print "cnt={}".format(cnt)

                server = file_name.split("server")[1].split("_")[0]
                mac = file_name.split("mac")[1].split("_")[0]

                dt_cps = []
                id_cps = []
                df = pd.read_csv("{}/{}".format(in_path_dir, file_name))
                for idx, row in df.iterrows():
                    dt_cps.append(row["dt"])
                    id_cps.append(row["dt_id"])

                in_path = utils.get_in_path(server, mac, dt_start, dt_end)
                ts = TimeSeries(in_path, metric, dt_start, dt_end)
                cp_utils.preprocess(ts, preprocess_args)

                seg_means = []
                type_cps = []
                if id_cps:
                    mean1 = np.mean(ts.y[0:id_cps[0]])
                    seg_means.append(mean1)
                    for i in range(1, len(id_cps)):
                        mean2 = np.mean(ts.y[id_cps[i - 1]:id_cps[i]])
                        seg_means.append(mean2)
                        update_type_cps(type_cps, mean1, mean2, metric)
                        mean1 = mean2
                    mean2 = np.mean(ts.y[id_cps[-1]:-1])
                    seg_means.append(mean2)
                    update_type_cps(type_cps, mean1, mean2, metric)

                f.write("{},{},\"{}\",\"{}\",\"{}\"\n".format(server, mac,
                                                              dt_cps,
                                                              type_cps,
                                                              seg_means))

def print_all(dt_start, dt_end, mac_node):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/prints".format(script_dir),
                       "{}/prints/{}".format(script_dir, str_dt),
                       "{}/prints/{}/filtered".format(script_dir, str_dt),
                       "{}/prints/{}/not_filtered".format(script_dir, str_dt)])

    # print_macs_per_name(dt_start, dt_end, mac_node)
    # print_names_per_mac(dt_start, dt_end, mac_node)
    # print_name_ips(dt_start, dt_end)
    print_traceroute_per_mac(dt_start, dt_end)
    print_traceroute_per_mac_filtered(dt_start, dt_end)

def get_client_traceroute(dt_start, dt_end, traceroute_type):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    valid_traceroute_field, traceroute_field = \
        get_traceroute_fields(traceroute_type)

    in_path = ("{}/change_point/unsupervised/prints/{}/filtered/"
               "traceroute_per_mac.csv".format(base_dir, str_dt))
    df = pd.read_csv(in_path)
    client_traceroute = {}
    for idx, row in df.iterrows():
        if row["valid_cnt_samples"] and row[valid_traceroute_field]:
            client = utils.get_client(row["server"], row["mac"])
            client_traceroute[client] = ast.literal_eval(row[traceroute_field])

    return client_traceroute

def get_first_hops(dt_start, dt_end, server, traceroute_type):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    valid_traceroute_field, traceroute_field = \
        cp_utils.get_traceroute_fields(traceroute_type)

    first_hops = set()
    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(script_dir,
                                                                    str_dt)
    df = pd.read_csv(in_path)
    df = df[df["server"] == server]
    for idx, row in df.iterrows():
        if row["valid_cnt_samples"] and row[valid_traceroute_field]:
            traceroute = ast.literal_eval(row[traceroute_field])
            first_hops.add(traceroute[0])

    return first_hops

def print_traceroute_per_mac(dt_start, dt_end):
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        f.write("server,mac,"
                "valid_traceroute_compress_embratel,"
                "traceroute_compress_embratel,"
                "valid_traceroute_compress_embratel_without_last_hop_embratel,"
                "traceroute_compress_embratel_without_last_hop_embratel,"
                "valid_traceroute_without_embratel,"
                "traceroute_without_embratel,"
                "valid_traceroute,"
                "traceroute\n")
        for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
            ts_traceroute = TimeSeries(in_path=in_path, metric="traceroute",
                                       dt_start=dt_start, dt_end=dt_end)

            (valid_traceroute_compress_embratel,
             traceroute_compress_embratel) = \
                get_traceroute(ts_traceroute, True, True, True)
            (valid_traceroute_compress_embratel_without_last_hop_embratel,
             traceroute_compress_embratel_without_last_hop_embratel) = \
                get_traceroute(ts_traceroute, True, True, False)
            (valid_traceroute_without_embratel,
             traceroute_without_embratel) = \
                get_traceroute(ts_traceroute, False, False, False)
            (valid_traceroute, traceroute) = \
                get_traceroute(ts_traceroute, True, False, False)

            l = "{},{}" + ",{},\"{}\"" * 4 + "\n"
            l = l.format(
                server, mac,
                valid_traceroute_compress_embratel,
                traceroute_compress_embratel,
                valid_traceroute_compress_embratel_without_last_hop_embratel,
                traceroute_compress_embratel_without_last_hop_embratel,
                valid_traceroute_without_embratel,
                traceroute_without_embratel,
                valid_traceroute, traceroute)
            f.write(l)

    utils.sort_csv_file(out_path, ["server", "mac"])

def print_per_path(dt_start, dt_end, metric, file_name):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_dir = "{}/change_point/unsupervised/".format(base_dir)
    utils.create_dirs(["{}/plots".format(out_dir),
                       "{}/plots/paths".format(out_dir),
                       "{}/plots/paths/{}".format(out_dir, str_dt),
                       "{}/plots/paths/{}/{}".format(out_dir, str_dt, metric)])

    in_path = "{}/prints/{}/filtered/{}/{}".format(script_dir, str_dt, metric,
                                                   file_name)

    for traceroute_type in iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        client_traceroute = cp_utils.get_client_traceroute(dt_start, dt_end,
                                                           traceroute_type)

        path_dirs = set()
        df = pd.read_csv(in_path)
        for idx, row in df.iterrows():
            client = utils.get_client(row["server"], row["mac"])
            if client in client_traceroute:
                traceroute = client_traceroute[client]

                dir_path = "{}/plots/paths/{}/{}/{}/{}".format(
                    out_dir, str_dt, metric, traceroute_type, row["server"])
                utils.create_dirs([dir_path])
                for name in reversed(traceroute):
                    if name[0][0].split(".")[0] == "192":
                        continue

                    dir_path = "{}/{}".format(dir_path, name)
                    utils.create_dirs([dir_path])

                    out_path = "{}/{}".format(dir_path, file_name)
                    if dir_path not in path_dirs:
                        create_csv_with_same_header(out_path, df)
                    pd.DataFrame(row).T.to_csv(out_path, mode="a",
                                               header=False, index=False)
                    path_dirs.add(dir_path)

def plot(dt_start, dt_end, metric):
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/{}".format(script_dir, str_dt),
                       "{}/{}/{}".format(script_dir, str_dt, metric)])

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        out_file_name = utils.get_out_file_name(server, mac, dt_start, dt_end)
        out_path = "{}/{}/{}/{}.png".format(script_dir, str_dt, metric,
                                            out_file_name)

        # comparison between not filtered and filtered
        ts = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_filter = TimeSeries(in_path, metric, dt_start, dt_end)
        ts_filter.percentile_filter(win_len=5, p=0.5)

        # if len(ts_filter.y) > 100:
        #     plot_procedures.plot_stl_decomposition(ts_filter,
        #                                            "median_filtered",
        #                                            out_path)

        # comparison between with cross traffic and without
        # ts = TimeSeries(in_path, metric, dt_start, dt_end)
        # ts.percentile_filter(win_len=13, p=0.5)
        # ts_filter = TimeSeries(in_path, metric, dt_start, dt_end,
        #                        cross_traffic_thresh=0)
        # ts_filter.percentile_filter(win_len=13, p=0.5)

        # plot_procedures.plot_ts_share_x(ts, ts_filter, out_path,
        #                                 compress=True,
        #                                 plot_type2="scatter",
        #                                 title1="raw",
        #                                 title2="median filtered",
        #                                 default_ylabel=True,
        #                                 xlabel="$i$")

        ylabel = plot_procedures.get_default_ylabel(ts)
        plot_procedures.plot_ts(ts_filter, out_path, ylabel=ylabel,
                                compress=False,
                                title="median filtered")

def plot_clients_per_zero_indegree_vertex_distribution(dt_start, dt_end):
    cnt_clients_zero_indegree_vertex = []
    str_dt = utils.get_str_dt(dt_start, dt_end)
    for server in os.listdir("{}/prints/{}/filtered/graph".format(script_dir,
                                                                  str_dt)):
        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            if spatial_time_correlation.valid_graph(dt_start, dt_end, server,
                                                    traceroute_type):
                g = spatial_time_correlation.read_graph(dt_start, dt_end,
                                                        server,
                                                        traceroute_type)
                u_indegree = spatial_time_correlation.get_indegree(g)
                for u in g:
                    if u_indegree[u] == 0:
                        in_path = ("{}/plots/names/{}/latency/{}/{}/{}/"
                                   "cps_per_mac.csv".format(script_dir,
                                                            str_dt,
                                                            traceroute_type,
                                                            server, u))
                        df = pd.read_csv(in_path)
                        cnt_clients_zero_indegree_vertex.append(df.shape[0])
                break

    print sum(cnt_clients_zero_indegree_vertex)

    out_path = ("{}/plots/cnt_clients_zero_indegree_vertex_distribution.png".
                format(script_dir))
    plt.clf()
    matplotlib.rcParams.update({"font.size": 27})
    plt.gcf().set_size_inches(16, 11)
    bins = range(1, max(cnt_clients_zero_indegree_vertex) + 2)
    weights = (np.asarray([1.0] * len(cnt_clients_zero_indegree_vertex)) /
               len(cnt_clients_zero_indegree_vertex))
    plt.ylabel("frequency")
    plt.xlabel("number of clients in a zero indegree user-group")
    plt.xticks(bins[:-1], rotation=45)
    plt.hist(cnt_clients_zero_indegree_vertex, bins=bins, normed=True,
             weights=weights)
    plt.savefig(out_path)

def localize_events(dt_start, dt_end, metric, eps_hours,
                    min_fraction_of_clients):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    in_path = "{}/prints/{}/filtered/traceroute_per_mac.csv".format(script_dir,
                                                                    str_dt)
    servers = np.unique(pd.read_csv(in_path)["server"].values)
    for server in servers:
        for traceroute_type in unsupervised_utils.iter_traceroute_types():
            if valid_graph(dt_start, dt_end, server, traceroute_type):
                g = read_graph(dt_start, dt_end, server, traceroute_type)
                u_indegree = get_indegree(g)

                for u in g:
                    if u_indegree[u] == 0:
                        analyse_first_hop(g, u, True, metric, server, dt_start,
                                          dt_end, traceroute_type, eps_hours,
                                          min_fraction_of_clients)

                correlate_zero_indegree_vertexes(g, u_indegree, server,
                                                 dt_start, dt_end, metric,
                                                 traceroute_type, eps_hours)

                first_hops = get_first_hops(dt_start, dt_end, server,
                                            traceroute_type)
                for first_hop in first_hops:
                    if u_indegree[first_hop] != 0:
                        # assumption: analyse the first hop itself here (the
                        # flattened source passed the stale loop variable u)
                        analyse_first_hop(g, first_hop, False, metric, server,
                                          dt_start, dt_end, traceroute_type,
                                          eps_hours, min_fraction_of_clients)
                aggregate_first_hop_not_zero_indegree_vertex(first_hops, g,
                                                             metric, server,
                                                             dt_start, dt_end,
                                                             traceroute_type)

                break

    aggregate_servers_correlations(dt_start, dt_end, metric, servers)
    aggregate_servers_first_hop_not_zero_indegree_vertex(dt_start, dt_end,
                                                         metric, servers)

def plot_per_node(dt_start, dt_end, metric, only_unique_traceroute):
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/plots/".format(script_dir),
                       "{}/plots/nodes".format(script_dir),
                       "{}/plots/nodes/{}".format(script_dir, str_dt),
                       "{}/plots/nodes/{}/{}".format(script_dir, str_dt,
                                                     metric)])

    valid_nodes = read_input.get_valid_nodes()
    mac_node = read_input.get_mac_node()

    macs_unique_traceroute = read_input.get_macs_traceroute_filter(dt_start,
                                                                   dt_end,
                                                                   "filtered")

    for server, mac, in_path in utils.iter_server_mac(dt_dir, True):
        if only_unique_traceroute and (mac not in macs_unique_traceroute):
            continue

        if mac_node[mac] in valid_nodes:
            utils.create_dirs(["{}/plots/nodes/{}/{}/{}".format(
                script_dir, str_dt, metric, mac_node[mac])])

            out_file_name = utils.get_out_file_name(server, mac, dt_start,
                                                    dt_end)
            out_path = ("{}/plots/nodes/{}/{}/{}/{}.png".format(
                script_dir, str_dt, metric, mac_node[mac], out_file_name))

            ts = TimeSeries(in_path, metric, dt_start, dt_end)
            ts_filter = TimeSeries(in_path, metric, dt_start, dt_end)
            ts_filter.percentile_filter(win_len=13, p=0.5)
            plot_procedures.plot_ts_share_x(ts, ts_filter, out_path,
                                            compress=False,
                                            plot_type2="scatter")

def create_dataset_unsupervised(dt_start, dt_end):
    """
    All of [dt_start, dt_end) must be in the same month, and the datetimes
    must represent days.
    """

    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/change_point/input/unsupervised/".format(base_dir),
                       "{}/change_point/input/unsupervised/{}".format(
                           base_dir, str_dt)])

    out_path = "{}/unsupervised/{}/dataset.csv".format(script_dir, str_dt)
    with open(out_path, "w") as f:
        f.write("email,mac,server,dt_start,dt_end,change_points,"
                "change_points_ids\n")

        in_path = ("{}/change_point/unsupervised/prints/{}/filtered/"
                   "traceroute_per_mac.csv".format(base_dir, str_dt))
        df = pd.read_csv(in_path)
        for idx, row in df.iterrows():
            if include_in_dataset(row):
                f.write("{},{},{},{},{},\"\",\"\"\n".format(str_dt,
                                                            row["mac"],
                                                            row["server"],
                                                            dt_start, dt_end))

def print_traceroute_per_mac_filtered(dt_start, dt_end,
                                      min_fraction_of_samples=0.7):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/prints/{}/filtered/traceroute_per_mac.csv".format(
        script_dir, str_dt))
    with open(out_path, "w") as f:
        l = ("server,mac,valid_cnt_samples,"
             "valid_traceroute_compress_embratel,"
             "traceroute_compress_embratel_filter,"
             "valid_traceroute_compress_embratel_without_last_hop_embratel,"
             "traceroute_compress_embratel_without_last_hop_embratel_filter,"
             "valid_traceroute_without_embratel,"
             "traceroute_without_embratel_filter,"
             "valid_traceroute,"
             "traceroute_filter\n")
        f.write(l)

        in_path = ("{}/prints/{}/not_filtered/traceroute_per_mac.csv".format(
            script_dir, str_dt))
        df = pd.read_csv(in_path)
        for cnt, (idx, row) in enumerate(df.iterrows()):
            print("print_traceroute_per_mac_filtered, str_dt={}, cnt={}".
                  format(str_dt, cnt))

            traceroute_compress_embratel_filter = \
                get_traceroute_filtered(
                    row["valid_traceroute_compress_embratel"],
                    row["traceroute_compress_embratel"],
                    row["server"])
            traceroute_compress_embratel_without_last_hop_embratel_filter = \
                get_traceroute_filtered(
                    row["valid_traceroute_compress_embratel_"
                        "without_last_hop_embratel"],
                    row["traceroute_compress_embratel_"
                        "without_last_hop_embratel"],
                    row["server"])
            traceroute_without_embratel_filter = \
                get_traceroute_filtered(
                    row["valid_traceroute_without_embratel"],
                    row["traceroute_without_embratel"],
                    row["server"])
            traceroute_filter = \
                get_traceroute_filtered(
                    row["valid_traceroute"],
                    row["traceroute"],
                    row["server"])

            # Check whether the client has the minimum number of samples.
            # Since the metric is not specified at this point, only check for
            # the presence of the measurement timestamp.
            in_path = utils.get_in_path(row["server"], row["mac"], dt_start,
                                        dt_end)
            ts = TimeSeries(in_path=in_path, metric="dt", dt_start=dt_start,
                            dt_end=dt_end)
            delta_days = (dt_end - dt_start).days
            fraction_of_samples = float(len(ts.y)) / (delta_days * 24.0 * 2.0)
            if fraction_of_samples < min_fraction_of_samples:
                valid_cnt_samples = False
            else:
                valid_cnt_samples = True

            l = "{},{},{}" + ",{},\"{}\"" * 4 + "\n"
            l = l.format(
                row["server"], row["mac"], valid_cnt_samples,
                row["valid_traceroute_compress_embratel"],
                traceroute_compress_embratel_filter,
                row["valid_traceroute_compress_embratel_"
                    "without_last_hop_embratel"],
                traceroute_compress_embratel_without_last_hop_embratel_filter,
                row["valid_traceroute_without_embratel"],
                traceroute_without_embratel_filter,
                row["valid_traceroute"],
                traceroute_filter)
            f.write(l)

def run_single(dt_start, dt_end, cmp_class_args, preprocess_args, param,
               metric, run):
    str_dt = utils.get_str_dt(dt_start, dt_end)
    datasets = ["unsupervised/{}".format(str_dt)]
    cp_utils.run_sequential(datasets, run, cmp_class_args, preprocess_args,
                            param, metric)

def correlate_zero_indegree_vertexes(g, u_indegree, server, dt_start, dt_end,
                                     metric, traceroute_type, eps_hours):
    str_dt = utils.get_str_dt(dt_start, dt_end)

    out_path = ("{}/plots/paths/{}/{}/{}/{}/"
                "problem_location_zero_indegree_vertexes_correlation.csv".
                format(script_dir, str_dt, metric, traceroute_type, server))
    with open(out_path, "w") as f:
        f.write("cp_dt_start,cp_dt_end,cp_type,traceroute_type,"
                "cnt_vertexes_with_zero_indegree,suffix_match,"
                "vertexes_with_zero_indegree\n")
        for cp_type in cp_utils.iter_cp_types():
            l = []
            for u in g:
                if u_indegree[u] == 0:
                    in_path = ("{}/plots/names/{}/{}/{}/{}/{}/"
                               "problem_location.csv".format(script_dir,
                                                             str_dt, metric,
                                                             traceroute_type,
                                                             server, u))
                    df = pd.read_csv(in_path)
                    for idx, row in df[df["cp_type"] == cp_type].iterrows():
                        cp_dt_start = dt_procedures.from_strdt_to_dt(
                            row["cp_dt_start"])
                        cp_dt_end = dt_procedures.from_strdt_to_dt(
                            row["cp_dt_end"])
                        cp_dt = cp_dt_start + (cp_dt_end - cp_dt_start) / 2
                        problem_locations = ast.literal_eval(
                            row["problem_location"])

                        dic = {"dt": cp_dt,
                               "name": ast.literal_eval(u),
                               "problem_locations": problem_locations,
                               "fraction_of_clients":
                                   row["fraction_of_clients"],
                               "cnt_clients": row["cnt_clients"]}

                        if problem_locations == ["before"]:
                            dic["dt"] = str(dic["dt"])
                            f.write("{},{},{},{},{},\"{}\",\"{}\"\n".
                                    format(row["cp_dt_start"],
                                           row["cp_dt_end"], cp_type,
                                           traceroute_type, 1,
                                           problem_locations, [dic]))
                        else:
                            l.append(dic)

            l.sort(key=itemgetter("dt"))

            votes = \
                unsupervised_utils.multiple_inexact_voting(l, eps_hours)
            for event in votes:
                for problem_location, votes in suffix_match(event):
                    for dic in votes:
                        dic["dt"] = str(dic["dt"])

                    f.write("{},{},{},{},{},\"{}\",\"{}\"\n".
                            format(event["l_dt"], event["r_dt"], cp_type,
                                   traceroute_type, len(votes),
                                   problem_location, votes))

    out_path_name = ("{}/plots/names/{}/{}/{}/{}".
                     format(script_dir, str_dt, metric, traceroute_type,
                            server))
    shutil.copy(out_path, out_path_name)

def plot_per_name(dt_start, dt_end, metric, preprocess_args, plot_cps=True):
    dt_dir = utils.get_dt_dir(dt_start, dt_end)
    str_dt = utils.get_str_dt(dt_start, dt_end)

    utils.create_dirs(["{}/plots/".format(script_dir),
                       "{}/plots/names".format(script_dir),
                       "{}/plots/names/{}".format(script_dir, str_dt),
                       "{}/plots/names/{}/{}".format(script_dir, str_dt,
                                                     metric)])

    client_cps = unsupervised_utils.get_client_cps(plot_cps, str_dt, metric)

    # avoid replotting the same client more than once
    client_plotPath = {}

    for traceroute_type in unsupervised_utils.iter_traceroute_types():
        valid_traceroute_field, traceroute_field = \
            cp_utils.get_traceroute_fields(traceroute_type)

        utils.create_dirs(["{}/plots/names/{}/{}/{}".format(script_dir,
                                                            str_dt, metric,
                                                            traceroute_type)])

        df = pd.read_csv("{}/prints/{}/filtered/traceroute_per_mac.csv".
                         format(script_dir, str_dt))
        cnt = 0
        for idx, row in df.iterrows():
            if row["valid_cnt_samples"] and row[valid_traceroute_field]:
                print("cnt={}, traceroute_type={}, str_dt={}".format(
                    cnt, traceroute_type, str_dt))
                cnt += 1

                client = utils.get_client(row["server"], row["mac"])

                for name in cp_utils.iter_names_traceroute_filtered(
                        ast.literal_eval(row[traceroute_field])):
                    utils.create_dirs(["{}/plots/names/{}/{}/{}/{}".format(
                                           script_dir, str_dt, metric,
                                           traceroute_type, row["server"]),
                                       "{}/plots/names/{}/{}/{}/{}/{}".format(
                                           script_dir, str_dt, metric,
                                           traceroute_type, row["server"],
                                           name)])

                    out_file_name = utils.get_out_file_name(row["server"],
                                                            row["mac"],
                                                            dt_start, dt_end)
                    out_path = ("{}/plots/names/{}/{}/{}/{}/{}/{}.png".format(
                        script_dir, str_dt, metric, traceroute_type,
                        row["server"], name, out_file_name))

                    # avoid replotting: reuse the plot already generated for
                    # this client
                    if client in client_plotPath:
                        shutil.copyfile(client_plotPath[client], out_path)
                    else:
                        client_plotPath[client] = out_path

                        cp_dts = client_cps[client]

                        in_path = "{}/input/{}/{}/{}.csv".format(base_dir,
                                                                 dt_dir,
                                                                 row["server"],
                                                                 row["mac"])
                        ts = TimeSeries(in_path, metric, dt_start, dt_end)
                        cp_utils.preprocess(ts, preprocess_args)
                        plot_procedures.plot_ts(ts, out_path,
                                                dt_axvline=cp_dts,
                                                title="median filtered")