Example No. 1
def __extract_resource_usage(args, free_data, shadow_data):
    rusage = {
        "ram": __get_ram_usage(free_data),
        "run_time": __get_run_time(shadow_data)
    }
    outpath = f"{args.prefix}/tornet.plot.data/resource_usage.json"
    dump_json_data(rusage, outpath, compress=False)
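Each of these extraction helpers ends by calling dump_json_data, which is not shown in this listing. A minimal sketch of such a helper, assuming it just serializes the object to JSON and optionally applies xz compression (the handling of the compress flag here is an assumption, not the tool's confirmed implementation):

import json
import lzma
import os


def dump_json_data(data, outpath, compress=False):
    # Hypothetical sketch: write 'data' as JSON to 'outpath', optionally
    # xz-compressed. The real helper may differ.
    dirname = os.path.dirname(outpath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    if compress:
        with lzma.open(outpath, 'wt') as outf:
            json.dump(data, outf)
    else:
        with open(outpath, 'w') as outf:
            json.dump(data, outf)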
Example No. 2
def stage_users(args, min_unix_time, max_unix_time):
    codes_by_unix_time = {}

    logging.info("Processing user file from {}...".format(
        args.user_stats_path))

    with open(args.user_stats_path, 'r') as infile:
        for line in infile:
            # skip the header; data lines start with the year, e.g., '2019',
            # so keep only lines whose first two characters are '20'
            if line[0:2] != '20':
                continue

            parts = line.strip().split(',')

            date = str(parts[0])  # like '2019-01-01'
            country_code = str(parts[1])  # like 'us'
            user_count = int(parts[2])  # like '14714'

            dt = datetime.strptime(date, "%Y-%m-%d")
            unix_time = int(dt.strftime("%s"))  # returns stamp like 1548910800

            if unix_time < min_unix_time or unix_time > max_unix_time:
                continue

            excluded = {'', 'a1', 'a2', '??'}
            if country_code in excluded:
                continue

            codes_by_unix_time.setdefault(unix_time,
                                          {}).setdefault(country_code, 0)
            codes_by_unix_time[unix_time][country_code] += user_count

    # compute probs of each country over time
    probs_by_country_code = {}
    for unix_time in codes_by_unix_time:
        total_user_count = float(sum(codes_by_unix_time[unix_time].values()))

        for country_code in codes_by_unix_time[unix_time]:
            prob = codes_by_unix_time[unix_time][
                country_code] / total_user_count
            probs_by_country_code.setdefault(country_code, []).append(prob)

    # get median country prob for each
    output = {}
    for country_code in probs_by_country_code:
        probs = probs_by_country_code[country_code]
        med_prob = median(probs) if len(probs) > 0 else 0.0
        output.setdefault(country_code, med_prob)

    # re-normalize
    total_prob = float(sum(output.values()))
    for country_code in output:
        output[country_code] = output[country_code] / total_prob

    timesuffix = get_time_suffix(min_unix_time, max_unix_time)
    user_info_path = f"{args.prefix}/userinfo_staging_{timesuffix}.json"
    logging.info("Writing user info to {}".format(user_info_path))
    dump_json_data(output, user_info_path, compress=False)
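One caveat in the example above: "%s" is not a documented strftime directive; it is passed through to the platform's C library (it works with glibc) and interprets the parsed date in the machine's local timezone. A portable, UTC-based conversion would look like the sketch below (an illustration only, not the code's actual behavior, since the original is local-time based):

from calendar import timegm
from datetime import datetime

# Convert a '2019-01-01' style date string to a Unix timestamp at UTC midnight.
dt = datetime.strptime("2019-01-01", "%Y-%m-%d")
unix_time = timegm(dt.timetuple())  # 1546300800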
Example No. 3
def __extract_download_time(args, data, startts, stopts):
    key = "time_to_first_byte_recv"
    dt = __get_download_time(data, startts, stopts, key)
    outpath = f"{args.prefix}/tornet.plot.data/{key}.json"
    dump_json_data(dt, outpath, compress=False)

    key = "time_to_last_byte_recv"
    dt = __get_download_time(data, startts, stopts, key)
    outpath = f"{args.prefix}/tornet.plot.data/{key}.json"
    dump_json_data(dt, outpath, compress=False)
Example No. 4
def __parse_tornettools_log(args):
    gen_logs = [
        f for f in os.listdir(args.prefix)
        if f.startswith('tornettools.generate.')
    ]
    if len(gen_logs) == 0:
        logging.warning(
            "Unable to find simulation info in tornettools.generate.log file")
        return

    info = {}
    gen_log_path = f"{args.prefix}/{gen_logs[-1]}"
    with open_readable_file(gen_log_path) as inf:
        for line in inf:
            if "Seeded standard and numpy PRNGs" in line:
                info['tornettools_generate_seed'] = int(
                    line.strip().split()[11].split('=')[1])
            elif "relays using scale factor" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 7:
                    info['num_sampled_relays'] = int(parts[6])
                if l >= 9:
                    info['num_public_relays'] = int(parts[8])
                if l >= 14:
                    info['net_scale'] = float(parts[13])
            elif "Generated fingerprints and keys" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 14:
                    info['num_dir_authorities'] = int(parts[13].strip('('))
            elif "TGen client processes to emulate" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_markov_clients'] = int(parts[8])
                if l >= 15:
                    info['num_emulated_users'] = int(parts[14])
                if l >= 20:
                    info['num_circuits_ten_minutes'] = int(parts[19])
            elif "perf nodes to benchmark Tor performance" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_perf_clients'] = int(parts[8])
            elif "TGen servers to serve" in line:
                parts = line.strip().split()
                l = len(parts)
                if l >= 9:
                    info['num_tgen_servers'] = int(parts[8])

    outpath = f"{args.prefix}/tornet.plot.data/simulation_info.json"
    dump_json_data(info, outpath, compress=False)
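Examples 4, 5, and 6 read their logs through open_readable_file, which is not included in this listing. A plausible sketch, assuming the helper simply opens either a plain text file or an xz-compressed one based on the extension (an assumption about the helper, not its confirmed implementation):

import lzma


def open_readable_file(path):
    # Hypothetical helper: transparently open plain or xz-compressed logs
    # in text mode.
    if path.endswith(".xz"):
        return lzma.open(path, 'rt')
    return open(path, 'r')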
Example No. 5
def __parse_shadow_rusage(args):
    shadow_filepath = f"{args.prefix}/shadow.log"
    if not os.path.exists(shadow_filepath):
        shadow_filepath += ".xz"

    if not os.path.exists(shadow_filepath):
        logging.warning(f"Unable to find cpu usage data at {shadow_filepath}")
        return False

    rusage = {}
    heartbeat = re.compile("_slave_heartbeat")
    with open_readable_file(shadow_filepath) as inf:
        for line in inf:
            if heartbeat.search(line) is not None:
                parts = line.strip().split()
                if len(parts) >= 13:
                    sim_time = float(parts[12])  # nanos e.g. 2000000000
                    std = datetime.timedelta(microseconds=sim_time / 1000.0)
                    sim_secs = std.total_seconds()

                    if sim_secs not in rusage:
                        real_time = parts[0]  # time e.g. 00:00:15.436056
                        rt_parts = real_time.split(':')
                        rtd = datetime.timedelta(hours=int(rt_parts[0]),
                                                 minutes=int(rt_parts[1]),
                                                 seconds=float(rt_parts[2]))

                        rund = {
                            keyval.split('=')[0]: keyval.split('=')[1]
                            for keyval in parts if '=' in keyval
                        }
                        rund['real_time'] = rtd.total_seconds()

                        rusage[sim_secs] = rund

    if len(rusage) > 0:
        outpath = f"{args.prefix}/shadow_rusage.json.xz"
        dump_json_data(rusage, outpath, compress=True)
        return True
    else:
        logging.warning(
            f"Unable to parse resource usage data from {shadow_filepath}.")
        return False
Example No. 6
def __parse_free_rusage(args):
    free_filepath = f"{args.prefix}/free.log"
    if not os.path.exists(free_filepath):
        free_filepath += ".xz"

    if not os.path.exists(free_filepath):
        logging.warning(f"Unable to find memory usage data at {free_filepath}")
        return False

    rusage = {}

    last_ts = None
    mem_header = None
    with open_readable_file(free_filepath) as inf:
        for line in inf:
            if "UTC" in line:
                parts = line.strip().split()
                if len(parts) >= 1:
                    ts = float(parts[0])
                    #dt = datetime.datetime.fromtimestamp(ts)
                    #last_ts = dt.timestamp()
                    last_ts = ts
            elif 'total' in line and mem_header is None:
                mem_header = [p.strip() for p in line.strip().split()]
            elif "Mem:" in line:
                parts = [p.strip() for p in line.strip().split()]
                mem_counts = [int(p) for p in parts[1:]]

                memd = {
                    f"mem_{mem_header[i]}": mem_counts[i]
                    for i in range(len(mem_counts))
                }

                rusage.setdefault(last_ts, memd)

    if len(rusage) > 0:
        outpath = f"{args.prefix}/free_rusage.json.xz"
        dump_json_data(rusage, outpath, compress=True)
        return True
    else:
        logging.warning(
            f"Unable to parse memory usage data from {free_filepath}.")
        return False
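The parser in Example No. 6 expects free.log to contain a Unix-timestamp line ending in 'UTC' before each snapshot of free output (a header line containing 'total' followed by a 'Mem:' line). Below is a hedged sketch of a collector that would produce input in that shape; the actual collection script is not part of this listing, so the command and layout here are assumptions inferred from the parser:

import subprocess
import time


def log_free_usage(outpath, interval=1.0, iterations=3):
    # Hypothetical collector: write '<unix time> UTC' followed by the output
    # of the 'free' command, in the shape __parse_free_rusage() expects.
    with open(outpath, 'w') as outf:
        for _ in range(iterations):
            outf.write(f"{time.time()} UTC\n")
            result = subprocess.run(["free"], capture_output=True, text=True)
            outf.write(result.stdout)
            outf.flush()
            time.sleep(interval)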
Example No. 7
def run(args):
    db = {
        "circuit_rtt": [],
        "client_goodput": [],
        "circuit_build_times": [],
        "download_times": {},
        "daily_counts": {},
        "relay_goodput": {},
    }

    if args.bandwidth_data_path is not None:
        logging.info(f"Parsing bandwidth data stored in '{args.bandwidth_data_path}'")
        db['relay_goodput'] = __parse_bandwidth_data(args.bandwidth_data_path)
        logging.info("Finished parsing bandwidth data")

    if args.onionperf_data_path is not None:
        logging.info(f"Extracting onionperf data stored in '{args.onionperf_data_path}'")
        __extract_onionperf_data(args, db)
        logging.info("Finished extracting onionperf data")

    # format we want for filename: tor_metrics_2020-01-01--2020-01-31.json
    days = []
    days.extend(db['daily_counts'].keys())
    days.extend(db['relay_goodput'].keys())
    days.sort()

    out_path = f"{args.prefix}/tor_metrics_{days[0]}--{days[-1]}.json"
    logging.info(f"Saving parsed Tor metrics data to {out_path}")
    dump_json_data(db, out_path, compress=False)
Example No. 8
def stage_relays(args):
    num_processes = args.nprocesses if args.nprocesses > 0 else cpu_count()

    logging.info(
        "Starting to process Tor metrics data using {} processes".format(
            num_processes))

    consensus_paths = get_file_list(args.consensus_path)
    logging.info("Processing {} consensus files from {}...".format(
        len(consensus_paths), args.consensus_path))
    relays, min_unix_time, max_unix_time, network_stats = process(
        num_processes, consensus_paths, parse_consensus,
        combine_parsed_consensus_results)

    servdesc_paths = get_file_list(args.server_descriptor_path)
    logging.info("Processing {} server descriptor files from {}...".format(
        len(servdesc_paths), args.server_descriptor_path))
    sdesc_args = [[p, min_unix_time, max_unix_time] for p in servdesc_paths]
    bandwidths = process(num_processes, sdesc_args, parse_serverdesc,
                         combine_parsed_serverdesc_results)

    found_bandwidths = 0
    for fingerprint in relays:
        if fingerprint in bandwidths:
            # overwrite empty bandwidth with parsed bandwidth info
            relays[fingerprint].bandwidths = bandwidths[fingerprint]
            found_bandwidths += 1

    logging.info("We found bandwidth information for {} of {} relays".format(
        found_bandwidths, len(relays)))
    #for (k, v) in sorted(relays.items(), key=lambda kv: kv[1].bandwidths.max_obs_bw):
    #    logging.info("fp={} capacity={}".format(k, v.bandwidths.max_obs_bw))

    geo = None
    if args.geoip_path is not None:
        geo = GeoIP(args.geoip_path)

    output = {
        'min_unix_time': min_unix_time,
        'max_unix_time': max_unix_time,
        'network_stats': network_stats,
        'relays': {}
    }

    for fingerprint in relays:
        r = relays[fingerprint]

        output['relays'][fingerprint] = {
            'fingerprint': r.fingerprint,
            'address': r.address,
            # fraction of consensuses in which the relay appeared
            'running_frequency': float(len(r.weights)) / float(len(consensus_paths)),
            # when running, fraction of consensuses with the Guard flag
            'guard_frequency': float(r.num_guard) / float(len(r.weights)),
            # when running, fraction of consensuses with the Exit flag
            'exit_frequency': float(r.num_exit) / float(len(r.weights)),
            'weight': float(median(r.weights)) if len(r.weights) > 0 else 0.0,
            'bandwidth_capacity': int(r.bandwidths.max_obs_bw),
            'bandwidth_rate': int(median(r.bandwidths.bw_rates)) if len(r.bandwidths.bw_rates) > 0 else 0,
            'bandwidth_burst': int(median(r.bandwidths.bw_bursts)) if len(r.bandwidths.bw_bursts) > 0 else 0,
        }

        if geo is not None:
            output['relays'][fingerprint]['country_code'] = geo.ip_to_country_code(r.address)

    timesuffix = get_time_suffix(min_unix_time, max_unix_time)
    relay_info_path = f"{args.prefix}/relayinfo_staging_{timesuffix}.json"
    logging.info("Writing relay info to {}".format(relay_info_path))
    dump_json_data(output, relay_info_path, compress=False)

    return min_unix_time, max_unix_time
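Since stage_relays returns the consensus time range that stage_users (Example No. 2) consumes, the staging step presumably chains the two calls. A minimal usage sketch under that assumption (the driver shown here is illustrative; the actual entry point is not part of this listing):

def stage(args):
    # Stage relay info first to learn the time range covered by the consensus
    # files, then stage user counts restricted to that same range.
    min_unix_time, max_unix_time = stage_relays(args)
    stage_users(args, min_unix_time, max_unix_time)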
Example No. 9
def __extract_client_goodput(args, data, startts, stopts):
    client_goodput = __get_client_goodput(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/perfclient_goodput.json"
    dump_json_data(client_goodput, outpath, compress=False)
Example No. 10
def __extract_error_rate(args, data, startts, stopts):
    errrate_per_client = __get_error_rate(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/error_rate.json"
    dump_json_data(errrate_per_client, outpath, compress=False)
Example No. 11
def __extract_round_trip_time(args, data, startts, stopts):
    rtt = __get_round_trip_time(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/round_trip_time.json"
    dump_json_data(rtt, outpath, compress=False)
Example No. 12
def __extract_relay_tput(args, data, startts, stopts):
    tput = __get_relay_tput(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/relay_goodput.json"
    dump_json_data(tput, outpath, compress=False)
Example No. 13
def __extract_circuit_build_times(args, data, startts, stopts):
    cbt = __get_perfclient_cbt(data, startts, stopts)
    outpath = f"{args.prefix}/tornet.plot.data/perfclient_circuit_build_time.json"
    dump_json_data(cbt, outpath, compress=False)