def __extract_resource_usage(args, free_data, shadow_data):
    rusage = {
        "ram": __get_ram_usage(free_data),
        "run_time": __get_run_time(shadow_data)
    }
    outpath = f"{args.prefix}/tornet.plot.data/resource_usage.json"
    dump_json_data(rusage, outpath, compress=False)
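# dump_json_data() is imported from elsewhere in the package. The sketch
# below shows its assumed behavior, based only on how it is called in
# this module (callers pass a '.json.xz' path when compress=True, so the
# path is used as-is either way); the real implementation may differ.
import json
import lzma

def dump_json_data_sketch(data, outpath, compress=False):
    # serialize to JSON, optionally through an xz text stream
    opener = lzma.open if compress else open
    with opener(outpath, "wt") as outf:
        json.dump(data, outf)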
def stage_users(args, min_unix_time, max_unix_time):
    codes_by_unix_time = {}

    logging.info("Processing user file from {}...".format(args.user_stats_path))

    # country codes that carry no geographic information
    excluded_codes = set(['', 'a1', 'a2', '??'])

    with open(args.user_stats_path, 'r') as infile:
        for line in infile:
            # skip the header; data lines start with a year, e.g., '2019-...'
            if line[0:2] != '20':
                continue

            parts = line.strip().split(',')
            date = str(parts[0])  # like '2019-01-01'
            country_code = str(parts[1])  # like 'us'
            user_count = int(parts[2])  # like '14714'

            dt = datetime.strptime(date, "%Y-%m-%d")
            # strftime("%s") is a non-portable glibc extension; timestamp()
            # gives the same local-time epoch stamp (like 1548910800) portably
            unix_time = int(dt.timestamp())
            if unix_time < min_unix_time or unix_time > max_unix_time:
                continue

            if country_code in excluded_codes:
                continue

            codes_by_unix_time.setdefault(unix_time, {}).setdefault(country_code, 0)
            codes_by_unix_time[unix_time][country_code] += user_count

    # compute probs of each country over time
    probs_by_country_code = {}
    for unix_time in codes_by_unix_time:
        total_user_count = float(sum(codes_by_unix_time[unix_time].values()))
        for country_code in codes_by_unix_time[unix_time]:
            prob = codes_by_unix_time[unix_time][country_code] / total_user_count
            probs_by_country_code.setdefault(country_code, []).append(prob)

    # get the median prob for each country
    output = {}
    for country_code in probs_by_country_code:
        probs = probs_by_country_code[country_code]
        med_prob = median(probs) if len(probs) > 0 else 0.0
        output.setdefault(country_code, med_prob)

    # re-normalize so the median probs sum to 1
    total_prob = float(sum(output.values()))
    for country_code in output:
        output[country_code] = output[country_code] / total_prob

    timesuffix = get_time_suffix(min_unix_time, max_unix_time)
    user_info_path = f"{args.prefix}/userinfo_staging_{timesuffix}.json"
    logging.info("Writing user info to {}".format(user_info_path))
    dump_json_data(output, user_info_path, compress=False)
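# get_time_suffix() is defined elsewhere; this sketch assumes it renders
# the staged time range as 'YYYY-MM-DD--YYYY-MM-DD', mirroring the
# filename convention used for the tor_metrics output in run() below.
def get_time_suffix_sketch(min_unix_time, max_unix_time):
    begin = datetime.fromtimestamp(min_unix_time).strftime("%Y-%m-%d")
    end = datetime.fromtimestamp(max_unix_time).strftime("%Y-%m-%d")
    return f"{begin}--{end}"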
def __extract_download_time(args, data, startts, stopts):
    # first- and last-byte times use the same extraction path
    for key in ("time_to_first_byte_recv", "time_to_last_byte_recv"):
        dt = __get_download_time(data, startts, stopts, key)
        outpath = f"{args.prefix}/tornet.plot.data/{key}.json"
        dump_json_data(dt, outpath, compress=False)
def __parse_tornettools_log(args):
    # sort so that [-1] deterministically picks the latest log; os.listdir
    # order is arbitrary, and the filenames are assumed to embed timestamps
    # so that lexicographic order is chronological
    gen_logs = sorted(f for f in os.listdir(args.prefix)
                      if f.startswith('tornettools.generate.'))
    if len(gen_logs) == 0:
        logging.warning("Unable to find simulation info in tornettools.generate.log file")
        return

    info = {}
    gen_log_path = f"{args.prefix}/{gen_logs[-1]}"
    with open_readable_file(gen_log_path) as inf:
        for line in inf:
            if "Seeded standard and numpy PRNGs" in line:
                info['tornettools_generate_seed'] = int(
                    line.strip().split()[11].split('=')[1])
            elif "relays using scale factor" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 7:
                    info['num_sampled_relays'] = int(parts[6])
                if l >= 9:
                    info['num_public_relays'] = int(parts[8])
                if l >= 14:
                    info['net_scale'] = float(parts[13])
            elif "Generated fingerprints and keys" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 14:
                    info['num_dir_authorities'] = int(parts[13].strip('('))
            elif "TGen client processes to emulate" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_markov_clients'] = int(parts[8])
                if l >= 15:
                    info['num_emulated_users'] = int(parts[14])
                if l >= 20:
                    info['num_circuits_ten_minutes'] = int(parts[19])
            elif "perf nodes to benchmark Tor performance" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_perf_clients'] = int(parts[8])
            elif "TGen servers to serve" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_servers'] = int(parts[8])

    outpath = f"{args.prefix}/tornet.plot.data/simulation_info.json"
    dump_json_data(info, outpath, compress=False)
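# open_readable_file() is imported from elsewhere in the package. Given
# that callers hand it either plain-text or '.xz' paths, this sketch
# assumes it transparently decompresses based on the extension (lzma is
# imported above for the dump_json_data sketch); the real helper may
# behave differently.
def open_readable_file_sketch(filepath):
    if filepath.endswith(".xz"):
        return lzma.open(filepath, "rt")
    return open(filepath, "r")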
def __parse_shadow_rusage(args):
    shadow_filepath = f"{args.prefix}/shadow.log"
    if not os.path.exists(shadow_filepath):
        shadow_filepath += ".xz"
    if not os.path.exists(shadow_filepath):
        logging.warning(f"Unable to find cpu usage data at {shadow_filepath}")
        return False

    rusage = {}
    heartbeat = re.compile("_slave_heartbeat")

    with open_readable_file(shadow_filepath) as inf:
        for line in inf:
            if heartbeat.search(line) is not None:
                parts = line.strip().split()
                if len(parts) >= 13:
                    sim_time = float(parts[12])  # nanos, e.g., 2000000000
                    std = datetime.timedelta(microseconds=sim_time / 1000.0)
                    sim_secs = std.total_seconds()

                    # only keep the first heartbeat for each sim time
                    if sim_secs not in rusage:
                        real_time = parts[0]  # e.g., 00:00:15.436056
                        rt_parts = real_time.split(':')
                        rtd = datetime.timedelta(hours=int(rt_parts[0]),
                                                 minutes=int(rt_parts[1]),
                                                 seconds=float(rt_parts[2]))
                        rund = {
                            keyval.split('=')[0]: keyval.split('=')[1]
                            for keyval in parts if '=' in keyval
                        }
                        rund['real_time'] = rtd.total_seconds()
                        rusage[sim_secs] = rund

    if len(rusage) > 0:
        outpath = f"{args.prefix}/shadow_rusage.json.xz"
        dump_json_data(rusage, outpath, compress=True)
        return True
    else:
        logging.warning(f"Unable to parse resource usage data from {shadow_filepath}.")
        return False
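# For reference, the parser above assumes Shadow heartbeat lines shaped
# roughly like this (an assumed example; the '[...]' spans stand in for
# tokens whose exact content is not relied on, while the indexing above
# depends on the real time being token 0, the sim time in nanoseconds
# being token 12, and the counters appearing as key=value tokens):
#
#   00:00:15.436056 [...] _slave_heartbeat [...] 2000000000 [...] nodes=100 ram=4096 ...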
def __parse_free_rusage(args):
    free_filepath = f"{args.prefix}/free.log"
    if not os.path.exists(free_filepath):
        free_filepath += ".xz"
    if not os.path.exists(free_filepath):
        logging.warning(f"Unable to find memory usage data at {free_filepath}")
        return False

    rusage = {}
    last_ts = None
    mem_header = None

    with open_readable_file(free_filepath) as inf:
        for line in inf:
            if "UTC" in line:
                parts = line.strip().split()
                if len(parts) >= 1:
                    last_ts = float(parts[0])  # unix timestamp logged before each sample
            elif 'total' in line and mem_header is None:
                mem_header = [p.strip() for p in line.strip().split()]
            # skip any 'Mem:' row that precedes the first timestamp or header line
            elif "Mem:" in line and last_ts is not None and mem_header is not None:
                parts = [p.strip() for p in line.strip().split()]
                mem_counts = [int(p) for p in parts[1:]]
                memd = {
                    f"mem_{mem_header[i]}": mem_counts[i]
                    for i in range(len(mem_counts))
                }
                rusage.setdefault(last_ts, memd)

    if len(rusage) > 0:
        outpath = f"{args.prefix}/free_rusage.json.xz"
        dump_json_data(rusage, outpath, compress=True)
        return True
    else:
        logging.warning(f"Unable to parse memory usage data from {free_filepath}.")
        return False
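# For reference, the parser above assumes free.log interleaves a
# timestamp line with standard `free` output, roughly like this (an
# assumed example; the exact column set depends on the free version):
#
#   1548910800 Thu Jan 31 05:00:00 UTC 2019
#                 total        used        free      shared  buff/cache   available
#   Mem:       16303372     5406112     6687244      514288     4210016    10036940
#   Swap:       2097148           0     2097148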
def run(args):
    db = {
        "circuit_rtt": [],
        "client_goodput": [],
        "circuit_build_times": [],
        "download_times": {},
        "daily_counts": {},
        "relay_goodput": {},
    }

    if args.bandwidth_data_path is not None:
        logging.info(f"Parsing bandwidth data stored in '{args.bandwidth_data_path}'")
        db['relay_goodput'] = __parse_bandwidth_data(args.bandwidth_data_path)
        logging.info("Finished parsing bandwidth data")

    if args.onionperf_data_path is not None:
        logging.info(f"Extracting onionperf data stored in '{args.onionperf_data_path}'")
        __extract_onionperf_data(args, db)
        logging.info("Finished extracting onionperf data")

    # filename format we want: tor_metrics_2020-01-01--2020-01-31.json
    days = []
    days.extend(db['daily_counts'].keys())
    days.extend(db['relay_goodput'].keys())
    days.sort()

    # without this guard, days[0] would raise an IndexError when neither
    # input path was given
    if len(days) == 0:
        logging.warning("No Tor metrics data was parsed, so there is nothing to save")
        return

    out_path = f"{args.prefix}/tor_metrics_{days[0]}--{days[-1]}.json"
    logging.info(f"Saving parsed Tor metrics data to {out_path}")
    dump_json_data(db, out_path, compress=False)
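# Example invocation of run() (a sketch only; the real CLI builds this
# Namespace from parsed command-line arguments, and the paths here are
# hypothetical):
#
#   from argparse import Namespace
#   run(Namespace(prefix="demo",
#                 bandwidth_data_path="demo/bandwidth.csv",
#                 onionperf_data_path="demo/onionperf"))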
def stage_relays(args):
    num_processes = args.nprocesses if args.nprocesses > 0 else cpu_count()

    logging.info("Starting to process Tor metrics data using {} processes".format(num_processes))

    consensus_paths = get_file_list(args.consensus_path)
    logging.info("Processing {} consensus files from {}...".format(
        len(consensus_paths), args.consensus_path))
    relays, min_unix_time, max_unix_time, network_stats = process(
        num_processes, consensus_paths, parse_consensus,
        combine_parsed_consensus_results)

    servdesc_paths = get_file_list(args.server_descriptor_path)
    logging.info("Processing {} server descriptor files from {}...".format(
        len(servdesc_paths), args.server_descriptor_path))
    sdesc_args = [[p, min_unix_time, max_unix_time] for p in servdesc_paths]
    bandwidths = process(num_processes, sdesc_args, parse_serverdesc,
                         combine_parsed_serverdesc_results)

    found_bandwidths = 0
    for fingerprint in relays:
        if fingerprint in bandwidths:
            # overwrite empty bandwidth with parsed bandwidth info
            relays[fingerprint].bandwidths = bandwidths[fingerprint]
            found_bandwidths += 1

    logging.info("We found bandwidth information for {} of {} relays".format(
        found_bandwidths, len(relays)))

    geo = None
    if args.geoip_path is not None:
        geo = GeoIP(args.geoip_path)

    output = {
        'min_unix_time': min_unix_time,
        'max_unix_time': max_unix_time,
        'network_stats': network_stats,
        'relays': {}
    }

    for fingerprint in relays:
        r = relays[fingerprint]
        output['relays'][fingerprint] = {
            'fingerprint': r.fingerprint,
            'address': r.address,
            # frac of consensuses in which the relay appeared
            'running_frequency': float(len(r.weights)) / float(len(consensus_paths)),
            # when running, frac of consensuses with the guard flag
            'guard_frequency': float(r.num_guard) / float(len(r.weights)),
            # when running, frac of consensuses with the exit flag
            'exit_frequency': float(r.num_exit) / float(len(r.weights)),
            'weight': float(median(r.weights)) if len(r.weights) > 0 else 0.0,
            'bandwidth_capacity': int(r.bandwidths.max_obs_bw),
            'bandwidth_rate': int(median(r.bandwidths.bw_rates)) if len(r.bandwidths.bw_rates) > 0 else 0,
            'bandwidth_burst': int(median(r.bandwidths.bw_bursts)) if len(r.bandwidths.bw_bursts) > 0 else 0,
        }
        if geo is not None:
            output['relays'][fingerprint]['country_code'] = geo.ip_to_country_code(r.address)

    timesuffix = get_time_suffix(min_unix_time, max_unix_time)
    relay_info_path = f"{args.prefix}/relayinfo_staging_{timesuffix}.json"
    logging.info("Writing relay info to {}".format(relay_info_path))
    dump_json_data(output, relay_info_path, compress=False)

    return min_unix_time, max_unix_time
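# process() and its parse/combine callables live elsewhere in the
# package. This sketch shows only the shape implied by the two call
# sites above (fan the work items out over a pool, then fold the
# per-item results together with the combine function); it is an
# assumption, not the actual implementation.
from multiprocessing import Pool

def process_sketch(num_processes, work_items, parse_func, combine_func):
    with Pool(processes=num_processes) as pool:
        results = pool.map(parse_func, work_items)
    return combine_func(results)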
def __extract_client_goodput(args, data, startts, stopts):
    client_goodput = __get_client_goodput(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/perfclient_goodput.json"
    dump_json_data(client_goodput, outpath, compress=False)

def __extract_error_rate(args, data, startts, stopts):
    errrate_per_client = __get_error_rate(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/error_rate.json"
    dump_json_data(errrate_per_client, outpath, compress=False)

def __extract_round_trip_time(args, data, startts, stopts):
    rtt = __get_round_trip_time(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/round_trip_time.json"
    dump_json_data(rtt, outpath, compress=False)

def __extract_relay_tput(args, data, startts, stopts):
    tput = __get_relay_tput(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/relay_goodput.json"
    dump_json_data(tput, outpath, compress=False)

def __extract_circuit_build_times(args, data, startts, stopts):
    cbt = __get_perfclient_cbt(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/perfclient_circuit_build_time.json"
    dump_json_data(cbt, outpath, compress=False)