def get_zx(start_time, method="single", fname="", Zf=False, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param method: the linkage method to be used
    :param fname: string to be appended to end of plot file name
    :param Zf: boolean -> if True, load (Z, dm, X) from the pickle written
        by a previous run instead of recomputing. NOTE: a fresh run always
        overwrites that pickle, so make sure the right version of the file
        exists before setting Zf to True
    :param **kwas: keyword arguments for vv.get_svl()
    :return: linkage matrix Z and the svl used to build it

    Computes the hierarchical clustering (dendrogram input) over pairwise
    client distance, where distance = 1 - closeness.
    '''
    zpath = plotsdir + 'pickles/' + 'Z_' + method + fname + '.pickle'
    if Zf is False:
        kwas['start_time'] = start_time
        X, fmt, _, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: " + str(len(X)))
        # condensed distance vector expected by scipy's linkage():
        # one entry per unordered pair, row-major upper triangle
        count = len(X)
        dm = np.zeros((count * (count - 1)) // 2, dtype=np.double)
        pos = 0
        for a in xrange(0, count - 1):
            for b in xrange(a + 1, count):
                dm[pos] = 1.0 - ccache[X[a]][X[b]]
                pos += 1
        ccache.dump()
        Z = linkage(dm, method)
        df.pickleout(zpath, (Z, dm, X))
        logger.warning('dumped Z to ' + zpath)
    else:
        Z, dm, X = df.picklein(zpath)
        logger.warning('loaded Z from ' + zpath)
    # cophenetic correlation; results currently unused but kept for parity
    c, coph_dists = cophenet(Z, dm)
    return Z, X
def arrange_self_data(start_time, gap=0, loops=2, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param gap: the gap (in seconds) between each iteration's dataset
    :param loops: the number of iterations (datasets)
    :param **kwas: keyword arguments for vv.get_svl(); must include
        'duration' (used to advance the window each loop)
    :return: (svld, allsvl, allfmt list, anssets) where svld maps a probe
        id to its sv from every iteration it appeared in
    '''
    svld = defaultdict(list)   # {probe id: [sv from each iteration]}
    allsvl = list()
    allfmt = set()
    anssets = defaultdict(set)
    kwas['return_ccache'] = False
    for loop in xrange(0, loops):
        # each iteration covers a consecutive window, offset by gap
        kwas['start_time'] = start_time + loop * (gap + kwas['duration'])
        svl, fmt2, anssets2 = vv.get_svl(**kwas)
        logger.warning("svl len: " + str(len(svl)))
        allfmt.update(fmt2)
        for dom in anssets2:
            anssets[dom].update(anssets2[dom])
        for sv in svl:
            svld[sv.get_id()].append(sv)
            allsvl.append(sv)
    return svld, allsvl, list(allfmt), anssets
def inv_hist(start_time, fname="", thresh=.35, **kwas): logger.info("getting svl...") kwas['start_time'] = start_time kwas['return_ccache'] = False svl, fmt, __ = vv.get_svl(**kwas) logger.info("getting ipsl...") ipsl, dompairs = get_ip_sets(svl) logger.info("getting pairing counts...") pc = vg.get_pairing_counts(ipsl) ipcount = len(pc) logger.info("building inv. graph...") G = vg.build_inv_graph(pc) vg.remove_far_edges(G, thresh) dd = vg.nodes_by_degree(G) vg.remove_degree_below(G, dd, 1) weights = [e[2] for e in G.edges_iter(data='weight')] cc = list(nx.connected_components(G)) #print cc for c in cc: print "****************************" print str(len(c)) print set([dom for ip in c for dom in dompairs[ip]]) weights = [w for a,b,w in G.edges_iter(c, data='weight')] print "median weight: "+str(np.median(weights)) print "average weight: "+str(np.mean(weights)) print "num connected comps: "+str(len(cc)) print "size of connected comps: "+str(np.median([len(z) for z in cc])) plt.figure(figsize=(15, 10)) plt.xlabel('pairwise closeness') plt.ylabel('# of pairs (servers)') plt.hist(weights, bins=100) plt.savefig(plotsdir+fname+'inv_hist.pdf', bbox_inches='tight')
def plot_resolver_comparison(start_time, fname="", xlim=[.6, 1.0], rmask=16, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b]
        (NOTE: mutable default, but it is never mutated here)
    :param rmask: prefix mask width applied to resolver IPs before comparing
    :param **kwas: keyword arguments for vv.get_svl()

    For groups of nearby probes that use different LDNS resolvers, plots
    the CDF of pairwise closeness split into "same LDNS" vs "diff LDNS"
    (after masking resolver IPs with rmask).
    '''
    print("getting svl...")
    kwas['start_time'] = start_time
    svl, fmt, __, ccache = vv.get_svl(**kwas)
    logger.warning("svl len: " + str(len(svl)))
    nearbies = cc.nearby_probes_diff_ldns(svl, rmask)
    vals = defaultdict(list)
    fmtmask = ipp.make_v4_prefix_mask(rmask)
    for group in nearbies:
        size = len(group)
        for i in xrange(0, size - 1):
            pa = group[i]
            for j in xrange(i + 1, size):
                pb = group[j]
                closeness = ccache[pa][pb]
                # bucket by whether the two probes share a (masked) resolver
                if pa.get_ldns() & fmtmask == pb.get_ldns() & fmtmask:
                    bucket = 'same LDNS'
                else:
                    bucket = 'diff LDNS'
                vals[bucket].append(closeness)
    ccache.dump()
    fig, ax = plt.subplots(1, 1)
    for label in vals:
        ecdf = ECDF(vals[label])
        ax.plot(list(ecdf.x), list(ecdf.y), label=label)
    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    plt.xlabel("pairwise probe closeness")
    plt.ylabel("CDF of pairs")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"closeness_ldns"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)
def get_ansset_sizes(start_time, fname="", **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param fname: string to be appended to end of the output file name
    :param **kwas: keyword arguments for vv.get_svl()

    Writes a CSV ("big_ansset<fname>.csv") listing each domain alongside
    the size of its observed answer set, sorted smallest-to-largest.

    NOTE(review): returns None; the previous docstring here was copied
    from get_domain_matrix() and described outputs this function does not
    produce.
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svl, fmt, anssets = vv.get_svl(**kwas)
    # (domain, # of distinct answers observed), ascending by set size
    anssets = sorted([(z, len(anssets[z])) for z in anssets], key=lambda p: p[1])
    df.overwrite(plotsdir + "big_ansset" + fname + ".csv", df.list2col(anssets))
def plot_varying_mc(start_time, fname="", tmin=.75, tmax=1.01, tinc=.025, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param fname: string to be appended to end of plot file name
    :param tmin: start of threshold range (a of np.arange(a, b, c))
    :param tmax: end of threshold range (b of np.arange(a, b, c))
    :param tinc: step size of the threshold range (c of np.arange(a, b, c))
    :param **kwas: keyword arguments for vv.get_svl()

    Sweeps the minimum-closeness threshold and plots, per descriptor, the
    '% of component' statistic (left axis) plus the number of components
    (right axis) on a shared x axis.
    '''
    logger.info("getting svl")
    kwas['start_time'] = start_time
    svl, fmt, __, ccache = vv.get_svl(**kwas)
    thresholds = np.arange(tmin, tmax, tinc)
    comp_stats = vg.get_cc_varying_mc(svl, thresholds, ccache)
    fig, ax = plt.subplots(1, 1)
    for descriptor in ['country', 'prefix', 'resolver', 'subnet', 'asn']:
        series = [stats[descriptor] for stats in comp_stats]
        ax.plot(thresholds, series, label=descriptor)
    ax.set_xlabel('minimum closeness')
    ax.set_ylabel('% of component')
    # secondary axis carries the raw component count
    ax2 = ax.twinx()
    counts = [stats['quantity'] for stats in comp_stats]
    ax2.plot(thresholds, counts, 'k', label='# components')
    ax2.set_ylabel('# components')
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    lgd = ps.legend_setup(ax, 5, "top center", True)
    plt.savefig(plotsdir+'components_'+fname+'.pdf', bbox_inches='tight')
    plt.savefig(plotsdir+'components_'+fname+'.png', bbox_inches='tight')
    ccache.dump()
def get_domain_matrix(start_time, fname="", **kwas): ''' :param start_time: int indicating the earliest query the window should include :param **kwas: keyword arguments for vv.get_svl() :return: (m) matrix of client pairs vs domains, (fmt) list of domains other outputs: -> csv with pairs vs domains matrix (m) -> csv with list of domain pair correlations (corrs) -> csv with list of mean Jaccard for each domain (means) ''' kwas['start_time'] = start_time kwas['return_ccache'] = False svl, fmt, anssets = vv.get_svl(**kwas) print "svl len", len(svl) combs = fact(len(svl)) / (fact(2) * fact(len(svl) - 2)) m = np.zeros((combs, len(fmt))) p = 0 for i in xrange(0, len(svl) - 1): a = svl[i] logger.warning(str(i) + ", " + str(a.get_id())) aset = dict() for dom in a: aset[dom] = set(a[dom]) for j in xrange(i + 1, len(svl)): b = svl[j] for k in xrange(0, len(fmt)): dom = fmt[k] domtotal = sum([a[dom][z] for z in a[dom]]) + sum( [b[dom][z] for z in b[dom]]) overlap = aset[dom].intersection(b[dom]) weight = 0 for z in overlap: weight += (a[dom][z] + b[dom][z]) m[p, k] = weight / domtotal p += 1 df.overwrite(plotsdir + "dommatrix" + fname + ".csv", df.list2line(fmt) + "\n") df.append(plotsdir + "dommatrix" + fname + ".csv", df.list2col(m)) C = np.corrcoef(m, rowvar=False) corrs = list() for i in xrange(0, len(fmt) - 1): for j in xrange(i + 1, len(fmt)): corrs.append((fmt[i] + "_" + fmt[j], C[i, j])) corrs = sorted([y for y in corrs if not math.isnan(y[1])], key=lambda z: z[1]) means = sorted(zip(fmt, np.mean(m, axis=0)), key=lambda z: z[1]) df.overwrite(plotsdir + "domcorr" + fname + ".csv", df.list2col(corrs)) df.overwrite(plotsdir + "dommean" + fname + ".csv", df.list2col(means)) meand = dict(means) # get mean jaccard vs # IPs seen mj_ni = [(meand[dom], len(anssets[dom])) for dom in meand] d_mj_ni = sorted([(dom, meand[dom], len(anssets[dom])) for dom in meand], key=lambda z: z[1]) df.overwrite(plotsdir + "jaccard_vs_ipspace" + fname + ".csv", df.list2col(d_mj_ni)) fig, ax = 
plt.subplots(1, 1) colors = iter(cm.rainbow(np.linspace(0, 1, len(mj_ni)))) for x, y in mj_ni: ax.scatter(x, y, color=next(colors)) plt.xlabel("mean jaccard") plt.ylabel("# IPs observed") ax.grid(b=True, which='major', color='b', linestyle='-') ps.set_dim(fig, ax, ylog=True) filename = plotsdir + "jaccard_vs_ipspace" + fname fig.savefig(filename + '.png', bbox_inches='tight') fig.savefig(filename + '.pdf', bbox_inches='tight') plt.show() plt.close(fig) return m, fmt
def closest_diff_desc(start_time, fname="", xlim=[.6, 1.0], **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param fname: string to be appended to end of output file names
    :param xlim: unused in this function (kept for call compatibility)
    :param **kwas: keyword arguments for vv.get_svl()
    :returns: dict mapping probe id -> sv, for every sv in the svl

    Gets pairwise closeness of probes with different descriptors (first
    ASNs, then prefixes) to find odd behavior -- probes in different
    descriptors with high closeness scores. Writes per-id ("idac"),
    per-descriptor-pair ("ddac"), and per-id-pair ("iic") CSVs, each
    ranked by a combined (closeness, distance) score.

    NOTE(review): despite the old docstring, the return value is {id: sv};
    the CSVs are the real output of this function.
    '''
    print("getting svl...")
    kwas['start_time'] = start_time
    svl, __, __, ccache = vv.get_svl(**kwas)
    logger.warning("svl len: " + str(len(svl)))
    print("getting descriptor lists...")
    # NOTE(review): csvl and ssvl are computed but never used below
    csvl = vv.country_svl(svl)
    asvl = vv.asn_svl(svl)
    ssvl = vv.subnet_svl(svl)
    psvl = vv.prefix_svl(svl)
    idc = defaultdict(list)  # {id_desc: [(closeness, dist), ...]}
    iic = dict()             # {idA_idB: (closeness, dist)}
    ddc = defaultdict(list)  # {descA_descB: [(closeness, dist), ...]}
    print("\n\ncalculating closeness for ASNs...")
    # only ASNs with >1 probe participate; pairs always span two ASNs
    asns = [c for c in asvl if len(asvl[c]) > 1]
    for i in xrange(0, len(asns) - 1):
        print(asns[i], end=", ")
        sys.stdout.flush()
        for a in asvl[asns[i]]:
            for j in xrange(i + 1, len(asns)):
                for b in asvl[asns[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_asn())
                    bd = str(b.get_asn())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    # presumably folds closeness and km into one ranking
                    # score via the distance() helper -- TODO confirm
                    dist = distance(closeness, dist)
                    # key pairs a probe's id with the OTHER probe's ASN
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()
    # rank by mean score (z[2]) descending, then drop the score column
    idac = sorted([(k, np.mean([q[0] for q in idc[k]]), np.mean([q[1] for q in \
            idc[k]])) for k in idc], key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "asn_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))
    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]), np.mean([q[1] for q in \
            ddc[k]])) for k in ddc], key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "asn_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))
    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic], reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "asn_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))

    # same pass again, grouping by routed prefix instead of ASN
    idc = defaultdict(list)  # {id_prefix: [(closeness, dist), ...]}
    iic = dict()             # {idA_idB: (closeness, dist)}
    ddc = defaultdict(list)  # {prefixA_prefixB: [(closeness, dist), ...]}
    print("\n\ncalculating closeness for prefixes...")
    prefixes = [c for c in psvl if len(psvl[c]) > 1]
    for i in xrange(0, len(prefixes) - 1):
        print(prefixes[i], end=", ")
        sys.stdout.flush()
        for a in psvl[prefixes[i]]:
            for j in xrange(i + 1, len(prefixes)):
                for b in psvl[prefixes[j]]:
                    closeness = ccache[a][b]
                    ad = str(a.get_prefix())
                    bd = str(b.get_prefix())
                    aid = str(a.get_id())
                    bid = str(b.get_id())
                    dist = em.latlong_distance_km(a.get_coordinates(),
                                                  b.get_coordinates())
                    dist = distance(closeness, dist)
                    idc[aid + "_" + bd].append((closeness, dist))
                    idc[bid + "_" + ad].append((closeness, dist))
                    iic["_".join(sorted([aid, bid]))] = (closeness, dist)
                    ddc["_".join(sorted([ad, bd]))].append((closeness, dist))
    ccache.dump()
    idac = sorted([(k, np.mean([q[0] for q in idc[k]]), np.mean([q[1] for q in \
            idc[k]])) for k in idc], key=lambda z: z[2], reverse=True)
    idac = [(z[0], z[1]) for z in idac]
    filename = plotsdir + "prefix_idac" + fname + ".csv"
    df.overwrite(filename, df.list2col(idac))
    ddac = sorted([(k, np.mean([q[0] for q in ddc[k]]), np.mean([q[1] for q in \
            ddc[k]])) for k in ddc], key=lambda z: z[2], reverse=True)
    ddac = [(z[0], z[1]) for z in ddac]
    filename = plotsdir + "prefix_ddac" + fname + ".csv"
    df.overwrite(filename, df.list2col(ddac))
    iic = sorted([(k, iic[k][0], iic[k][1]) for k in iic], reverse=True,
                 key=lambda z: z[2])
    iic = [(z[0], z[1]) for z in iic]
    filename = plotsdir + "prefix_iic" + fname + ".csv"
    df.overwrite(filename, df.list2col(iic))
    # hand back {id: sv} so callers can look up the probes just analyzed
    svd = dict()
    for sv in svl:
        svd[sv.get_id()] = sv
    return svd
def plot_closeness_same_desc(start_time, duration, fname="", xlim=[.6, 1.0],
        rmask=16, loops=31, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param duration: length (seconds) of each iteration's window; also
        forwarded to vv.get_svl() via kwas
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b]
        (NOTE(review): currently not passed to ps.set_dim below)
    :param rmask: mask width for resolver IPs
    :param loops: number of consecutive windows to accumulate over
    :param **kwas: keyword arguments for vv.get_svl()

    for each descriptor (ASN, country, registered prefix, /24 subnet,
    resolver), plot the CDF of the pairwise closeness of clients, such
    that the clients in a pair come from the same group in the descriptor
    (e.g., same country for the country descriptor)

    NOTE: plot 4.1
    '''
    # accumulators across all loops: one list of closeness values per
    # descriptor (l=resolver, c=country, a=ASN, s=subnet, p=prefix)
    lvals = list()
    cvals = list()
    avals = list()
    svals = list()
    pvals = list()
    kwas['duration'] = duration
    for l in xrange(0, loops):
        print "getting svl..."
        # advance the window by one duration per loop
        kwas['start_time'] = start_time+duration*l
        svl, fmt, __, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        print "getting descriptor lists..."
        csvl = vv.country_svl(svl)
        asvl = vv.asn_svl(svl)
        ssvl = vv.subnet_svl(svl)
        #osvl = vv.owner_svl(svl)
        psvl = vv.prefix_svl(svl)
        lsvl = vv.ldns_svl(svl, rmask, False)
        fmtmask = ipp.make_v4_prefix_mask(rmask)
        to_remove = [
                '208.67.222.123',   # OpenDNS
                '208.67.220.123',
                '8.8.8.8',          # Google Public DNS
                '8.8.4.4',
                '64.6.64.6',        # Verisign
                '64.6.65.6']
        # remove massive public DNS providers
        for ip in to_remove:
            tmp = ipp.ip2int(ip) & fmtmask
            if tmp in lsvl:
                del lsvl[tmp]
        print "calculating closeness for resolvers..."
        resolvers = lsvl.keys()
        for k in resolvers:
            ksvl = lsvl[k]
            # all unordered pairs within the same resolver group
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    lvals.append(ccache[ksvl[a]][ksvl[b]])
        print "calculating closeness for countries..."
        countries = csvl.keys()
        for k in countries:
            ksvl = csvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    cvals.append(ccache[ksvl[a]][ksvl[b]])
        print "calculating closeness for ASNs..."
        asns = asvl.keys()
        for k in asns:
            ksvl = asvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    avals.append(ccache[ksvl[a]][ksvl[b]])
        print "calculating closeness for subnets..."
        subnets = ssvl.keys()
        for k in subnets:
            ksvl = ssvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    svals.append(ccache[ksvl[a]][ksvl[b]])
        '''
        print "calculating closeness for owners..."
        ovals = list()
        owners = osvl.keys()
        for k in owners:
            ksvl = osvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    ovals.append(ccache[ksvl[a]][ksvl[b]])
        '''
        print "calculating closeness for prefixes..."
        prefixes = psvl.keys()
        for k in prefixes:
            ksvl = psvl[k]
            for a in xrange(0, len(ksvl)-1):
                for b in xrange(a+1, len(ksvl)):
                    pvals.append(ccache[ksvl[a]][ksvl[b]])
    print "plotting..."
    #vals = [cvals, avals, svals, ovals, pvals]
    #labels = ['country', 'ASN', 'subnet', 'owner', 'prefix']
    vals = [cvals, avals, svals, pvals, lvals]
    labels = ['country', 'ASN', 'subnet', 'prefix', 'resolver']
    fig, ax = plt.subplots(1, 1)
    for i in xrange(0, len(vals)):
        print type(vals[i][0])
        print labels[i], "\n"
        print len(vals[i])
        ecdf = ECDF(np.array(vals[i]))
        x = list(ecdf.x)
        y = list(ecdf.y)
        ax.plot(x, y, label=labels[i])
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("pairwise probe closeness")
    plt.ylabel("CDF of pairs")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"closeness_same_desc"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)
    print "saving data..."
    # one raw-values CSV per descriptor
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'_same.csv',
                df.list2col(vals[i]))
    # dumps the cache from the LAST loop iteration
    ccache.dump()
def plot_closeness(start_time, duration, fname="", xlim=[.6, 1.0], loops=15,
        **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param duration: length (seconds) of each iteration's window; also
        forwarded to vv.get_svl() via kwas
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b]
    :param loops: number of time blocks
    :param **kwas: keyword arguments for vv.get_svl()

    plots:
        1) CDF for pairwise closeness of each pair
        2) CDF for the average pairwise closeness experienced by each probe
           across all other probes

    NOTE: plot 3.1
    '''
    means = defaultdict(list)  # {probe id: [closeness vs every peer]}
    vals = list()              # every pairwise closeness, all loops pooled
    kwas['duration'] = duration
    for l in xrange(0, loops):
        print "getting svl..."
        kwas['start_time'] = start_time+duration*l
        svl, __, __, ccache = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        print len(svl)
        print "calculating closeness for resolvers..."
        for i in xrange(0, len(svl)-1):
            for j in xrange(i + 1, len(svl)):
                vals.append(ccache[svl[i]][svl[j]])
                # credit the same value to both probes of the pair
                means[svl[i].get_id()].append(vals[-1])
                means[svl[j].get_id()].append(vals[-1])
        ccache.dump()
        # free the per-loop data before the next window is loaded to
        # keep peak memory down
        del ccache, svl, __
        gc.collect()
    print "plotting..."
    fig, ax = plt.subplots(1, 1)
    ecdf = ECDF(vals)
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, label="pairwise")
    ecdf = ECDF([np.mean(means[z]) for z in means])
    x = list(ecdf.x)
    y = list(ecdf.y)
    ax.plot(x, y, label="average (per client)")
    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    plt.xlabel("pairwise probe closeness")
    plt.ylabel("CDF of pairs")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"overall_closeness"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)
    print "saving data..."
    df.overwrite(plotsdir+'overall_closeness'+fname+'.csv', df.list2col(vals))
    df.overwrite(plotsdir+'overall_avg_closeness'+fname+'.csv',
            df.list2col([(z, np.mean(means[z])) for z in means]))
def plot_optimizing_window(start_time, duration, fname="", xlim=None,
        maxdur=90000*15, incr=30000, **kwas):
    '''
    :param start_time: int indicating the earliest query the window should
        include
    :param duration: initial window duration (seconds); the sweep starts here
    :param fname: string to be appended to end of plot file name
    :param xlim: x axis limits for plot. Accepts formats: None, [a, b]
    :param maxdur: the outer bound of the duration range to be covered
    :param incr: the number of seconds to increment the duration by in each
        loop
    :param **kwas: keyword arguments for vv.get_svl()

    makes line plot varying the duration (x axis) vs the closeness to one's
    self from a different point in time (e.g., for a 10 second duration,
    self A would be time 0-9, and self B would be time 10-19)
    '''
    allvals = list()  # mean self-closeness per tested duration
    allbars = list()  # std dev per tested duration (error bars)
    allx = list()     # duration expressed in 8-hour cycles
    dur = duration
    kwas['return_ccache'] = False
    while dur < maxdur:
        print "getting svls..."
        # window A: [start_time, start_time+dur)
        kwas['duration'] = dur
        kwas['start_time'] = start_time
        svl, __, __ = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        svl1 = dict()
        for sv in svl:
            svl1[sv.id] = sv
        # window B: the immediately following block of the same length
        kwas['start_time'] = start_time+dur
        svl, __, __ = vv.get_svl(**kwas)
        logger.warning("svl len: "+str(len(svl)))
        svl2 = dict()
        for sv in svl:
            svl2[sv.id] = sv
        print "calculating closeness for subnets...", dur
        # self-closeness: same probe compared against itself across windows;
        # probes missing from either window are skipped
        vals = list()
        for pid in svl1:
            if pid in svl2:
                vals.append(vv.closeness(svl1[pid], svl2[pid]))
        allvals.append(np.mean(vals))
        allbars.append(np.std(vals))
        # 60*60*8 seconds = one 8-hour cycle
        allx.append(float(dur)/(60.0*60.0*8.0))
        dur += incr
    fig, ax = plt.subplots(1, 1)
    ax.errorbar(allx, allvals, yerr=allbars)
    ps.set_dim(fig, ax, xdim=13, ydim=7.5, xlim=xlim)
    plt.xlabel("# 8 hour cycles in block duration")
    plt.ylabel("average self closeness")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"avg_self_closeness"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)
    print "saving data..."
    outstr = df.overwrite(plotsdir+fname+'_avg_self_closeness.csv',
            df.list2col(allvals))