Example #1
def process_data():
    days = [13, 14, 15, 16, 17, 18, 19, 20]
    dirn = "/Users/chimpu/research/ml_data/unc/"
    for day in days:
        print "Processing day - ", day
        fcount = process_single_day(day)
        fname = dirn + "/fcount/day_" + str(day)
        write_data(fname, fcount)
        print "Done writing records - ", len(fcount)
Example #2
def process_data():
	days = [13,14,15,16,17,18,19,20]
	dirn="/Users/chimpu/research/ml_data/unc/"
	for day in days:
		print "Processing day - ", day
		fcount = process_single_day(day)
		fname = dirn + "/fcount/day_" + str(day)
		write_data(fname, fcount)
		print "Done writing records - ", len(fcount)
Example #3
def process_data():
	days = [13,14,15,16,17,18,19,20]
	dirn="/Users/chimpu/research/ml_data/unc/"
	for day in days:
		print "Processing day - ", day
		(inb, outb) = process_single_day(day)
		fname_inb = dirn + "/inb/day_" + str(day)
		fname_outb = dirn + "/outb/day_" + str(day)
		write_data(fname_inb, inb)
		write_data(fname_outb, outb)
		print "Done writing records - ", len(inb)
Example #4
def process_data():
    days = [13, 14, 15, 16, 17, 18, 19, 20]
    dirn = "/Users/chimpu/research/ml_data/unc/"
    for day in days:
        print "Processing day - ", day
        (inb, outb) = process_single_day(day)
        fname_inb = dirn + "/inb/day_" + str(day)
        fname_outb = dirn + "/outb/day_" + str(day)
        write_data(fname_inb, inb)
        write_data(fname_outb, outb)
        print "Done writing records - ", len(inb)
Example #5
def create_ses_inter():
	dl = get_day_list()
	r = RunSQL("unc.db")
	
	for d in dl:
		q = "select ts from flows where day = " + str(d) + " and term > 1 order by ts"
		print "Running query for day - ", str(d)
		ts = np.array(r.sqlq(q))
		n = len(ts)
		its = ts[1:n] - ts[0:n-1]
		nits = its[np.where(its > 0)]
		fname = get_output_dir("interses") + "/day_" + str(d)
		print "Writing to file - ", fname
		write_data(fname, nits)
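The inter-arrival computation above (ts[1:n] - ts[0:n-1]) is the element-wise difference of consecutive, sorted timestamps, i.e. np.diff; a small illustration with made-up values:

import numpy as np

ts = np.array([3.0, 3.5, 3.5, 7.0, 12.0])  # made-up, already sorted timestamps
its = np.diff(ts)                           # same as ts[1:] - ts[:-1]
nits = its[np.where(its > 0)]               # drop zero gaps from duplicate timestamps
# nits is now array([0.5, 3.5, 5.0])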
Example #6
def main():
	day_list = [13,14,15,16,17,18,19,20]
	sql_str = "select bytes from flows where day = "	
	unc = sql.RunSQL("unc.db")

	for d in day_list:
		fname = "flow_" + str(d)
		dsql_str = sql_str + str(d)
		
		print "Running query --> SQL: ", dsql_str
		f = unc.sqlq(dsql_str)

		print "Writing data --> file: ", fname
		util.write_data(fname, f)
		del(f)
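The sql.RunSQL wrapper (imported unqualified as RunSQL in other examples) is also not shown. A plausible sketch, assuming it opens a SQLite database file, runs the query, and returns the rows, flattening single-column results into a flat list; the real class may differ:

import sqlite3

class RunSQL(object):
    # Hypothetical sketch of the RunSQL helper assumed by these examples.
    def __init__(self, dbname):
        self.dbname = dbname

    def sqlq(self, query):
        conn = sqlite3.connect(self.dbname)
        try:
            rows = conn.execute(query).fetchall()
        finally:
            conn.close()
        # Flatten single-column results so callers get a plain list of values.
        if rows and len(rows[0]) == 1:
            return [r[0] for r in rows]
        return rows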
Example #7
def main():
    day_list = [13, 14, 15, 16, 17, 18, 19, 20]
    sql_str = "select bytes from flows where day = "
    unc = sql.RunSQL("unc.db")

    for d in day_list:
        fname = "flow_" + str(d)
        dsql_str = sql_str + str(d)

        print "Running query --> SQL: ", dsql_str
        f = unc.sqlq(dsql_str)

        print "Writing data --> file: ", fname
        util.write_data(fname, f)
        del f
Example #8
def create_ses_count_inter():
	dl = get_day_list()
	r = RunSQL("unc.db")
	r1 = RunSQL("syslog_final.db")

	for d in dl:
		print "Processing day - ", d
		q1 = "select client, ap, start, end from seslen where day = " + str(d)
		l1 = r1.sqlq(q1)

		fcount = list()
		sesinter = list()

		for ses in l1:
			start_ts = ses[2]
			end_ts = ses[3]
			client = ses[0]
			ap = ses[1]

			q = "select ts from flows" + \
				 " where day = " + str(d) + \
				 " and ts >= " + str(start_ts) + \
				 " and ts <= " + str(end_ts) + \
				 " and client = " + str(client) + \
				 " and ap = " + str(ap) + \
				 " order by ts"
			l = r.sqlq(q)
			if ( l != None and len(l) > 0 ):
				fcount.append(len(l))
				n = len(l)
				npl = np.array(l)
				inpl = (npl[1:n] - npl[0:n-1])
				inpl1 = inpl[np.where(inpl>0)]
				if ( len(inpl1) > 0 ):
					sesinter.extend(inpl1.tolist())
		ifname = get_output_dir("intrases") + "/day_" + str(d)
		cfname = get_output_dir("fcount") + "/day_" + str(d)

		print "Writing file - ", ifname
		write_data(ifname, sesinter)

		print "Writing file - ", cfname
		write_data(cfname, fcount)
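get_day_list and get_output_dir are two more helpers these examples assume. Plausible sketches, inferred from how they are called (the day list and base path are taken from Examples 1-4 and are assumptions, not shown definitions):

import os

def get_day_list():
    # Presumably the same days listed explicitly in Examples 1-4.
    return [13, 14, 15, 16, 17, 18, 19, 20]

def get_output_dir(metric):
    # Presumably a per-metric subdirectory under the base data path.
    d = os.path.join("/Users/chimpu/research/ml_data/unc", metric)
    if not os.path.exists(d):
        os.makedirs(d)
    return d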
Example #9
def create_seslen():
	dl = get_day_list()
	r = RunSQL("unc-proc.db")
	
	slen = list()
	inb = list()
	outb = list()

	for d in dl:
		q = "select seslen, bin, bout from sessions where day = " + str(d) + " and seslen > 0 and bin > 0 and bout > 0"
		print "Getting data for day - ", d
		slist = r.sqlq(q)
		n = len(slist)
		for i in xrange(n):
			slen.append(slist[i][0])
			inb.append(slist[i][1])
			outb.append(slist[i][2])
		
		fname = "day_" + str(d)
		slenf = get_output_dir("seslen") + "/" + fname
		inbf = get_output_dir("inb") + "/" + fname
		outbf = get_output_dir("outb") + "/" + fname
		
		print "Writing slen - " + slenf
		write_data(slenf, slen)

		print "Writing inb - " + inbf
		write_data(inbf, inb)

		print "Writing outb - " + outbf
		write_data(outbf, outb)
Example #10
def create_seslen():
    dl = get_day_list()
    r = RunSQL("unc-proc.db")

    slen = list()
    inb = list()
    outb = list()

    for d in dl:
        q = "select seslen, bin, bout from sessions where day = " + str(
            d) + " and seslen > 0 and bin > 0 and bout > 0"
        print "Getting data for day - ", d
        slist = r.sqlq(q)
        n = len(slist)
        for i in xrange(n):
            slen.append(slist[i][0])
            inb.append(slist[i][1])
            outb.append(slist[i][2])

        fname = "day_" + str(d)
        slenf = get_output_dir("seslen") + "/" + fname
        inbf = get_output_dir("inb") + "/" + fname
        outbf = get_output_dir("outb") + "/" + fname

        print "Writing slen - " + slenf
        write_data(slenf, slen)

        print "Writing inb - " + inbf
        write_data(inbf, inb)

        print "Writing outb - " + outbf
        write_data(outbf, outb)
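Note that slen, inb and outb are created once, outside the per-day loop, so each day_ file written by create_seslen contains the accumulated records of every day processed so far, not just that day's records; if strictly per-day output is intended, the three lists would have to be re-created at the top of the loop.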
Example #11
def main():
    cats = [None, 'catm\n', 'catd\n', 'cath\n']

    for cat in cats:
        r = acct_util(cat)

        tuf_pfx = "tu_g."
        duf_pfx = "du_g."

        sfx = "all"

        tu = r[1]
        du = r[2]

        if cat != None:
            sfx = cat.strip()

        tuf = tuf_pfx + sfx
        duf = duf_pfx + sfx

        write_data(tuf, tu)
        write_data(duf, du)
Example #12
def main():
    cats = [None, "catm\n", "catd\n", "cath\n"]

    for cat in cats:
        r = acct_util(cat)

        tuf_pfx = "tu_g."
        duf_pfx = "du_g."

        sfx = "all"

        tu = r[1]
        du = r[2]

        if cat != None:
            sfx = cat.strip()

        tuf = tuf_pfx + sfx
        duf = duf_pfx + sfx

        write_data(tuf, tu)
        write_data(duf, du)
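The trailing "\n" in the category names ("catm\n", "catd\n", "cath\n") mirrors how the ucat_term values are stored in the database (see the queries in Example #15), which is why cat.strip() is applied before using the category as a filename suffix.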
Example #13
def gen_cdf_ccdf():
	r = RunSQL("files_and_analysis.db")
	dsets = r.sqlq("select unique_id, filename from datasets")

	for dpair in dsets:
		dset_id = dpair[0]
		dset_file = dpair[1]

		print "Processing data set - ", dset_id

		x = read_data(dset_file)
		x.sort()
		ec = ecdf(x, issorted=True)
		pts = np.power(10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
		
		# dist_list = ["LOGLOGISTIC", "LOGN", "TPARETO", "TRUNCLL"]
		dist_list = ["LOGN"]
		fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn", "TPARETO": "tp", "TRUNCLL": "tll"}

		for distname in dist_list:
			print "\t Getting distribution - ", distname
			dist = get_dist(dset_id, distname)
			dec = dist.cdf(ec[:,0])
			dcc = dist.ccdf(pts)

			fdec = np.array([ec[:,0], dec]).transpose()
			fdcc = np.array([pts, dcc]).transpose()

			op_dir = os.path.dirname(dset_file)
			op_ec_file = op_dir + "/" + os.path.basename(dset_file)+"_ecdf" + "." + fext_map[distname]
			op_cc_file = op_dir + "/" + os.path.basename(dset_file)+"_ccdf" + "." + fext_map[distname]
			
			print "\t Writing CDF - ", op_ec_file
			write_data(op_ec_file, fdec)

			print "\t Writing CCF - ", op_cc_file
			write_data(op_cc_file, fdcc)
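gen_points(lo, hi, n) is used here and again in Examples 15 through 19 to build evaluation points that become log-spaced once passed through np.power(10, ...). A plausible sketch, assuming it is simply an even spacing over [lo, hi]:

import numpy as np

def gen_points(lo, hi, n):
    # Hypothetical sketch: n evenly spaced points between lo and hi.
    return np.linspace(lo, hi, n)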
Example #14
def main():

	cats = [None, 'catm\n', 'catd\n', 'cath\n']

	for cat in cats:
		r = acct_util(cat)
		
		tuf = "tu_all.dat"
		duf = "du_all.dat"
		if cat != None:
			tuf = "tu_" + cat.strip() + ".dat"
			duf = "du_" + cat.strip() + ".dat"
		
		tu = r[1]
		du = r[2]
		# Need unsorted values
		if cat == None:
			tudu = np.zeros((len(tu),2))
			tudu[:,0] = tu
			tudu[:,1] = du
			util.write_data("tudu.dat", tudu)

		tu.sort()
		du.sort()

		ctu = util.ecdf(tu,zdisp=True)
		cdu = util.ecdf(du,zdisp=True)

		util.write_data(tuf, ctu)
		util.write_data(duf, cdu)
		

		pcat = "ALL"
		if cat != None:
			pcat = cat.strip()

		print "CATEGORY: ", pcat
		print "Time utilization"
		print "------------------------------------------------"
		util.pstats(tu)
		print
		print "Data utilization"
		print "------------------------------------------------"
		util.pstats(du)
		print
		print
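The ecdf and ccdf helpers (util.ecdf / util.ccdf, also used as plain ecdf in Example #13) are indexed as two-column arrays in later examples (ec[:, 0] holds the sample values). A minimal sketch of both, assuming they return (value, probability) pairs; the issorted and zdisp flags are inferred from the call sites, and zdisp is accepted but ignored here:

import numpy as np

def ecdf(x, issorted=False, zdisp=False):
    # Hypothetical sketch: two-column array of (sorted value, empirical CDF).
    xs = np.asarray(x, dtype=float)
    if not issorted:
        xs = np.sort(xs)
    n = len(xs)
    p = np.arange(1, n + 1, dtype=float) / n
    return np.column_stack((xs, p))

def ccdf(x):
    # Hypothetical sketch: complementary CDF, P(X > x), over the same values.
    ec = ecdf(x)
    return np.column_stack((ec[:, 0], 1.0 - ec[:, 1]))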
Example #15
def main(dt):
	slen = {"catm": "select seslen from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 order by seslen", \
			  "catd": "select seslen from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 order by seslen", \
			  "cath": "select seslen from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 order by seslen", \
			  "all": "select seslen from data_log where seslen>0 and seslen<18000 order by seslen"}

	inb = {"catm": "select bin from data_log where ucat_term='catm\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "catd": "select bin from data_log where ucat_term='catd\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "cath": "select bin from data_log where ucat_term='cath\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "all": "select bin from data_log where bin>0 and seslen>0 and seslen<18000 order by bin"}
	
	outb = {"catm": "select bout from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "catd": "select bout from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "cath": "select bout from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "all": "select bout from data_log where bout>0 and seslen>0 and seslen<18000 order by bout"}

	to_inb = {"all": "select bin from data_log where seslen >= 18000 and bin>0 order by bin"}

	to_outb = {"all": "select bout from data_log where seslen >= 18000 and bout>0 order by bout"}
	
	tslen = {"catm": "select sum(seslen) t from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(seslen) t from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(seslen) t from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(seslen) t from data_log where seslen>0 and seslen<18000 group by user order by t"}

	tinb = {"catm": "select sum(bin) t from data_log where ucat_term='catm\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(bin) t from data_log where ucat_term='catd\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(bin) t from data_log where ucat_term='cath\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(bin) t from data_log where bin > 0 and seslen>0 and seslen<18000 group by user order by t"}
	
	toutb = {"catm": "select sum(bout) t from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(bout) t from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(bout) t from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(bout) t from data_log where bout>0 and seslen>0 and seslen<18000 group by user order by t"}

	dtmap = {"slen": slen, "inb": inb, "outb": outb, "to_inb": to_inb, "to_outb": to_outb, "tslen": tslen, "tinb": tinb, "toutb": toutb}

	if dt not in dtmap:
		raise NotImplementedError("Type - " + dt + " - is not implemented")
	
	qmap = dtmap[dt]
	
	s = sql.RunSQL("azure.db")
	for i in qmap.items():
		q = i[1]
		y = s.sqlq(q)
		x = np.array(y)
		x.sort() # just making sure

		df = i[0] + "_" + dt
		ccf = i[0] + "_ccdf"
		ecf = i[0] + "_ecdf"

		cc = util.ccdf(x)
		ec = util.ecdf(x)

		util.write_data(df, x)
		util.write_data(ccf, cc)
		util.write_data(ecf, ec)


		mle = ml.ModLav.fromFit(x,fit="mlefit")
		mme = ml.ModLav.fromFit(x,fit="mmefit")
		mle_mt = ml.ModLav.fromFit(x,fit="mlefit",mt=True)
		mme_mt = ml.ModLav.fromFit(x,fit="mmefit",mt=True)

		omle = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=True,mt=False);
		omle_mt = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=True,mt=True);
		omme = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=False,mt=False);
		omme_mt = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=False,mt=True);

		mle_opt = omle["fit"][0]
		xm_mle_opt = omle["fit"][1]

		mle_opt_mt = omle_mt["fit"][0]
		xm_mle_opt_mt = omle_mt["fit"][1]

		mme_opt = omme["fit"][0]
		xm_mme_opt = omme["fit"][1]

		mme_opt_mt = omme_mt["fit"][0]
		xm_mme_opt_mt = omme_mt["fit"][1]

		yyy = [("MLE", mle, x.max()), ("MME", mme, x.max()), ("MLE-MT", mle_mt, x.max()), ("MME-MT", mme_mt, x.max()), ("MLE-OPT", mle_opt, xm_mle_opt), ("MLE-OPT-MT", mle_opt_mt, xm_mle_opt_mt), ("MME-OPT", mme_opt, xm_mme_opt), ("MME-OPT-MT", mme_opt_mt, xm_mme_opt_mt)]
		
		n,amin,amax,mu,sigma = len(x), x.min(), x.max(), x.mean(), x.std()
		cv = sigma/mu
		q = ms.mquantiles(x, [0.1, 0.5, 0.9])
		op_str = []
		op_str.append("BASIC STATISTICS")
		op_str.append("----------------------------------------------------------------------")
		op_str.append("Size: " + str(n))
		op_str.append("Range: " + str(amin) + " - " + str(amax))
		op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
		op_str.append("Mean: " + str(mu))
		op_str.append("Sigma: " + str(sigma))
		op_str.append("CV: " + str(cv))
		op_str.append("\n")

		for yy in yyy:
			typ = i[0]
			lbl = yy[0]
			m = yy[1]
			xmx = yy[2]

			op_str.append(lbl)
			op_str.append("----------------------------------------------------------------------")
			op_str.append("Modlav params: " + str(m))
			op_str.append("Xmax: " + str(xmx))
			op_str.append("Xmax/Max: " + str(xmx/amax))
			op_str.append("FIT metric: " + str(m.fitmetric(points=x)))
			op_str.append("K-S metric: " + str(m.ksmetric(points=x)))
			op_str.append("----------------------------------------------------------------------")
			op_str.append("\n")

			flbl = lbl.lower().replace("-", "_")
			fname_pfx = typ + "_" + flbl 

			lx = util.gen_points(math.log10(x.min()),math.log10(xmx),2000)
			ex = np.power(10, lx)

			mcc = m.ccdf(ex)
			mec = m.cdf(ec[:,0])

			fmcc = np.array([ex, mcc]).transpose()
			fmec = np.array([ec[:,0], mec]).transpose()

			util.write_data(fname_pfx+"_ccdf", fmcc)
			util.write_data(fname_pfx+"_ecdf", fmec)
	
		op1_str = []
		for s1 in op_str:
			op1_str.append(s1 + "\n")
		
		txf = open(typ+"_metric", "w+")
		txf.writelines(op1_str)
		txf.close()
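ms.mquantiles here presumably refers to scipy.stats.mstats.mquantiles; a short usage illustration with made-up data:

import numpy as np
from scipy.stats import mstats as ms

x = np.arange(1.0, 101.0)              # made-up sample: 1.0 .. 100.0
q = ms.mquantiles(x, [0.1, 0.5, 0.9])
# q holds the approximate 10th, 50th and 90th percentiles of x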
Example #16
def process_datasets(tag_file_map):

	tags = tag_file_map.keys()
	report_map = {}
	for tag in tags:
		data_file = tag_file_map[tag]
		x = util.read_data(data_file)
		x.sort()
		ec = util.ecdf(x)
		cc = util.ccdf(x)

		fit_map = compute_fits(x)
		insert_db_record(tag, fit_map)

		## Figure out best fit
		bfit = best_fits(fit_map)
		report_map[tag] = (bfit["best_body"], bfit["best_tail"])
		
		## Write files out to the directory
		util.write_data(data_file + "_ecdf", ec)
		util.write_data(data_file + "_ccdf", cc)

		ccpts = np.power(10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))	
		ecpts = ec[:,0]

		lmme = fit_map["MME"][0]
		lmle = fit_map["MLE"][0]
		lfit = fit_map["FITMIN"][0]

		mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
		mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmme", mme_ec)
		util.write_data(data_file + "_ccdf.lognmme", mme_cc)

		mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
		mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmle", mle_ec)
		util.write_data(data_file + "_ccdf.lognmle", mle_cc)

		fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
		fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
		util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

	for k in report_map:
		print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

	return report_map
Example #17
def pardd(fname):
	
	inpf = fname
	x1 = util.read_data(inpf)
	x1.sort()
	xmx = x1.max()

	n = 500
	lo = 0.1*xmx
	hi = 10*xmx

	ccf = inpf + "_ccdf"
	ecf = inpf + "_ecdf"

	cc = util.ccdf(x1)
	ec = util.ecdf(x1)

	util.write_data(ccf, cc)
	util.write_data(ecf, ec)

	mle = ml.ModLav.fromFit(x1, fit="mlefit")
	mme = ml.ModLav.fromFit(x1, fit="mmefit")
	mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
	mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

	no_mle = (mle, xmx, mle.fitmetric(cdf=ec), mle.ksmetric(cdf=ec), mle.difference(cdf=ec))
	no_mme = (mme, xmx, mme.fitmetric(cdf=ec), mme.ksmetric(cdf=ec), mme.difference(cdf=ec))
	no_mle_mt = (mle_mt, xmx, mle_mt.fitmetric(cdf=ec), mle_mt.ksmetric(cdf=ec), mle_mt.difference(cdf=ec))
	no_mme_mt = (mme_mt, xmx, mme_mt.fitmetric(cdf=ec), mme_mt.ksmetric(cdf=ec), mme_mt.difference(cdf=ec))

	omle = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", False)
	omle_mt = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", True)
	omme = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", False)
	omme_mt = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", True)

	mle_opt = omle["fit"]
	mle_opt_mt = omle_mt["fit"]
	mme_opt = omme["fit"]
	mme_opt_mt = omme_mt["fit"]

	k_mle_opt = omle["ks"]
	k_mle_opt_mt = omle_mt["ks"]
	k_mme_opt = omme["ks"]
	k_mme_opt_mt = omme_mt["ks"]
	
	d_mle_opt = omle["diff"]
	d_mle_opt_mt = omle_mt["diff"]
	d_mme_opt = omme["diff"]
	d_mme_opt_mt = omme_mt["diff"]

	fitlist = [("MLE", no_mle), \
				  ("MME", no_mme), \
				  ("MLE-MT", no_mle_mt), \
				  ("MME-MT", no_mme_mt), \
				  ("MLE-OPT", mle_opt), \
				  ("MLE-OPT-MT", mle_opt_mt), \
				  ("MME-OPT", mme_opt), \
				  ("MME-OPT-MT", mme_opt_mt), \
				  ("KS-MLE-OPT", k_mle_opt), \
				  ("KS-MLE-OPT-MT", k_mle_opt_mt), \
				  ("KS-MME-OPT", k_mme_opt), \
				  ("KS-MME-OPT-MT", k_mme_opt_mt), \
				  ("D-MLE-OPT", d_mle_opt), \
				  ("D-MLE-OPT-MT", d_mle_opt_mt), \
				  ("D-MME-OPT", d_mme_opt), \
				  ("D-MME-OPT-MT", d_mme_opt_mt)]

	n,amin,amax,mu,sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
	cv = sigma/mu
	q = ms.mquantiles(x1, [0.1, 0.5, 0.9])
	
	op1_str = []
	op_str = []
	op_str.append("BASIC STATISTICS")
	op_str.append("--------------------------------------------------------------------------")
	op_str.append("Size: " + str(n))
	op_str.append("Range: " + str(amin) + " - " + str(amax))
	op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
	op_str.append("Mean: " + str(mu))
	op_str.append("Sigma: " + str(sigma))
	op_str.append("CV: " + str(cv))
	op_str.append("\n")
	
	best_fit_map = dict()

	for f in fitlist:
		lbl = f[0]
		m = f[1][0]
		mx = f[1][1]
		fitm = f[1][2]
		ksm = f[1][3]
		diffm = f[1][4]

		best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

		op_str.append(lbl)
		op_str.append("--------------------------------------------------------------------------")
		op_str.append("Modlav params: " + str(m))
		op_str.append("Xmax: " + str(mx))
		op_str.append("Xmax/Max: " + str(mx/xmx))
		## op_str.append("FIT Metric: " + str(m.fitmetric(points = x1)))
		## op_str.append("K-S Metric: " + str(m.ksmetric(points = x1)))
		op_str.append("FIT Metric: " + str(fitm))
		op_str.append("K-S Metric: " + str(ksm))
		op_str.append("DIFF Metric: " + str(diffm))
		op_str.append("--------------------------------------------------------------------------")
		op_str.append("\n")

		flbl = lbl.lower().replace("-", "_")
		fname_pfx = inpf + "_" + flbl

		lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
		ex = np.power(10, lx)

		mcc = m.ccdf(ex)
		mec = m.cdf(ec[:,0])

		fmcc = np.array([ex, mcc]).transpose()
		fmec = np.array([ec[:,0], mec]).transpose()

		util.write_data(fname_pfx + "_ccdf", fmcc)
		util.write_data(fname_pfx + "_ecdf", fmec)

	recom = best_fit(best_fit_map, xmx)
	for s1 in op_str:
		op1_str.append(s1 + "\n")
	op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

	txf = open(inpf + "_metric", "w+")
	txf.writelines(op1_str)
	txf.close()
Example #18
def process_datasets(tag_file_map):

    tags = tag_file_map.keys()
    report_map = {}
    for tag in tags:
        data_file = tag_file_map[tag]
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        ccpts = np.power(
            10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        ecpts = ec[:, 0]

        lmme = fit_map["MME"][0]
        lmle = fit_map["MLE"][0]
        lfit = fit_map["FITMIN"][0]

        mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
        mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmme", mme_ec)
        util.write_data(data_file + "_ccdf.lognmme", mme_cc)

        mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
        mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmle", mle_ec)
        util.write_data(data_file + "_ccdf.lognmle", mle_cc)

        fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
        fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
        util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

    for k in report_map:
        print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

    return report_map
Example #19
def pardd(fname):

    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    n = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    ccf = inpf + "_ccdf"
    ecf = inpf + "_ecdf"

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)

    util.write_data(ccf, cc)
    util.write_data(ecf, ec)

    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

    no_mle = (mle, xmx, mle.fitmetric(cdf=ec), mle.ksmetric(cdf=ec),
              mle.difference(cdf=ec))
    no_mme = (mme, xmx, mme.fitmetric(cdf=ec), mme.ksmetric(cdf=ec),
              mme.difference(cdf=ec))
    no_mle_mt = (mle_mt, xmx, mle_mt.fitmetric(cdf=ec),
                 mle_mt.ksmetric(cdf=ec), mle_mt.difference(cdf=ec))
    no_mme_mt = (mme_mt, xmx, mme_mt.fitmetric(cdf=ec),
                 mme_mt.ksmetric(cdf=ec), mme_mt.difference(cdf=ec))

    omle = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", False)
    omle_mt = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", True)
    omme = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", False)
    omme_mt = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", True)

    mle_opt = omle["fit"]
    mle_opt_mt = omle_mt["fit"]
    mme_opt = omme["fit"]
    mme_opt_mt = omme_mt["fit"]

    k_mle_opt = omle["ks"]
    k_mle_opt_mt = omle_mt["ks"]
    k_mme_opt = omme["ks"]
    k_mme_opt_mt = omme_mt["ks"]

    d_mle_opt = omle["diff"]
    d_mle_opt_mt = omle_mt["diff"]
    d_mme_opt = omme["diff"]
    d_mme_opt_mt = omme_mt["diff"]

    fitlist = [("MLE", no_mle), \
         ("MME", no_mme), \
         ("MLE-MT", no_mle_mt), \
         ("MME-MT", no_mme_mt), \
         ("MLE-OPT", mle_opt), \
         ("MLE-OPT-MT", mle_opt_mt), \
         ("MME-OPT", mme_opt), \
         ("MME-OPT-MT", mme_opt_mt), \
         ("KS-MLE-OPT", k_mle_opt), \
         ("KS-MLE-OPT-MT", k_mle_opt_mt), \
         ("KS-MME-OPT", k_mme_opt), \
         ("KS-MME-OPT-MT", k_mme_opt_mt), \
         ("D-MLE-OPT", d_mle_opt), \
         ("D-MLE-OPT-MT", d_mle_opt_mt), \
         ("D-MME-OPT", d_mme_opt), \
         ("D-MME-OPT-MT", d_mme_opt_mt)]

    n, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    op1_str = []
    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append(
        "--------------------------------------------------------------------------"
    )
    op_str.append("Size: " + str(n))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) +
                  " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")

    best_fit_map = dict()

    for f in fitlist:
        lbl = f[0]
        m = f[1][0]
        mx = f[1][1]
        fitm = f[1][2]
        ksm = f[1][3]
        diffm = f[1][4]

        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(
            "--------------------------------------------------------------------------"
        )
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        ## op_str.append("FIT Metric: " + str(m.fitmetric(points = x1)))
        ## op_str.append("K-S Metric: " + str(m.ksmetric(points = x1)))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(
            "--------------------------------------------------------------------------"
        )
        op_str.append("\n")

        flbl = lbl.lower().replace("-", "_")
        fname_pfx = inpf + "_" + flbl

        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)

        mcc = m.ccdf(ex)
        mec = m.cdf(ec[:, 0])

        fmcc = np.array([ex, mcc]).transpose()
        fmec = np.array([ec[:, 0], mec]).transpose()

        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    for s1 in op_str:
        op1_str.append(s1 + "\n")
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    txf = open(inpf + "_metric", "w+")
    txf.writelines(op1_str)
    txf.close()