示例#1
0
def cath_ccdf():
    """
    Plot the empirical CCDF of the "cath" session lengths together with
    two pre-fitted ModLav models on log-log axes.

    The model parameters are hard-coded, previously computed values and
    the data/CCDF points are read from fixed files. Nothing is returned;
    the curves are drawn on the current matplotlib figure.
    """
    # Best effort: any failure while showing a pending figure is ignored.
    try:
        plt.show()
    except:
        pass

    # These are already computed values. Please look at the notes
    # on discussion as to why and how they were chosen. The CDF
    # and CCDF values for the fitted distributions are different.
    # The reasoning for this is also in the notes.
    m2 = ml.ModLav(0.7438, 657.1194, 0.0191)
    m3 = ml.ModLav(0.7294, 644.6747, 0.0104)

    x = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ses")
    cc = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ccdf")

    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')

    # lx holds exponents (log10 of the data range), so the evaluation
    # points are 10**lx. The previous np.power(lx, 10) computed lx**10.
    lx = util.gen_points(math.log10(x.min()), math.log10(x.max()), 2000)
    ex = np.power(10, lx)

    m2cc = m2.ccdf(ex)
    m3cc = m3.ccdf(ex)
    plt.loglog(ex, m3cc, 'k--', label='MLE,MT, No OPT')
    plt.loglog(ex, m2cc, 'k-.', label='MLE, MT, OPT')
    plt.grid()
    plt.xlim((1.0, 1e5))
    plt.ylim((1e-4, 1.1))
    plt.ylabel("P(X > x)")
    plt.xlabel("Session length [seconds]")
    plt.legend(loc=3)
示例#2
0
def dice_matrix(mname, mtype):
    """
    Build a 4x4 matrix of pairwise similarity scores for one name.

    Four data files named "<mname>.<CAT[i]>" (i = 0..3) are read and
    every pair is scored with sim_mat(..., mtype). With the files called
    a, m, d, h in CAT order, the matrix is filled as:
        row 0: ah, ad, am
        row 1: mh, md
        row 2: dh
    All other cells stay zero. Returns the numpy matrix.
    """
    global CAT
    stem = str(mname) + "."
    paths = [stem + CAT[0], stem + CAT[1], stem + CAT[2], stem + CAT[3]]
    a, m, d, h = [read_data(path) for path in paths]

    # Score the pairs in a fixed order: a-m, a-d, a-h, m-d, m-h, d-h.
    pairs = ((a, m), (a, d), (a, h), (m, d), (m, h), (d, h))
    am, ad, ah, md, mh, dh = [sim_mat(left, right, mtype)
                              for left, right in pairs]

    simmat = np.zeros([4, 4])
    cells = (
        ((0, 0), ah), ((0, 1), ad), ((0, 2), am),
        ((1, 0), mh), ((1, 1), md),
        ((2, 0), dh),
    )
    for position, score in cells:
        simmat[position] = score

    return simmat
示例#3
0
def dice_matrix(mname, mtype):
    """
    Build a 4x4 matrix of pairwise similarity scores for one name.

    Four data files named "<mname>.<CAT[i]>" (i = 0..3) are read and
    every pair is scored with sim_mat(..., mtype). With the files called
    a, m, d, h in CAT order, the matrix is filled as:
        row 0: ah, ad, am
        row 1: mh, md
        row 2: dh
    All other cells stay zero. Returns the numpy matrix.
    """
    global CAT
    stem = str(mname) + "."
    paths = [stem + CAT[0], stem + CAT[1], stem + CAT[2], stem + CAT[3]]
    a, m, d, h = [read_data(path) for path in paths]

    # Score the pairs in a fixed order: a-m, a-d, a-h, m-d, m-h, d-h.
    pairs = ((a, m), (a, d), (a, h), (m, d), (m, h), (d, h))
    am, ad, ah, md, mh, dh = [sim_mat(left, right, mtype)
                              for left, right in pairs]

    simmat = np.zeros([4, 4])
    cells = (
        ((0, 0), ah), ((0, 1), ad), ((0, 2), am),
        ((1, 0), mh), ((1, 1), md),
        ((2, 0), dh),
    )
    for position, score in cells:
        simmat[position] = score

    return simmat
def computeFits(entry):
    """
    Run the lognormal fit for one data-set entry.

    entry is a mapping with "tag" and "filename". The raw samples and
    the companion "<filename>_ecdf" / "<filename>_ccdf" files are read,
    the samples are sorted in place, and everything is handed to
    computeLognormalFit. Nothing is returned.
    """
    label, path = entry["tag"], entry["filename"]

    samples = util.read_data(path)
    empirical_cdf = util.read_data(path + "_ecdf")
    empirical_ccdf = util.read_data(path + "_ccdf")
    samples.sort()

    computeLognormalFit(label, samples, empirical_cdf, empirical_ccdf)
示例#5
0
def computeFits(entry):
    """
    Run the lognormal fit for one data-set entry.

    entry is a mapping with "tag" and "filename". The raw samples and
    the companion "<filename>_ecdf" / "<filename>_ccdf" files are read,
    the samples are sorted in place, and everything is handed to
    computeLognormalFit. Nothing is returned.
    """
    label, path = entry["tag"], entry["filename"]

    samples = util.read_data(path)
    empirical_cdf = util.read_data(path + "_ecdf")
    empirical_ccdf = util.read_data(path + "_ccdf")
    samples.sort()

    computeLognormalFit(label, samples, empirical_cdf, empirical_ccdf)
def plot_data(dist, fset, dset, qset):
    plt.interactive(False)
    plt.rcParams['font.size'] = 17.0

    dist_params = pdata.dist_map[dist]
    ext = dist_params["ext"]
    typ = dset["type"]

    emp_key = "emp_" + typ
    emp_file = fset[emp_key]
    dist_key = ext + "_" + typ
    dist_file = fset[dist_key]

    p1 = read_data(emp_file)
    p2 = read_data(dist_file)

    xlabel = dset["xlabel"]
    ylabel = dset["ylabel"]
    l1 = dset["legend1"]
    l2 = dset["legend2"].replace(pdata.dist_replace_string,
                                 dist_params["legend"])
    loc = dset["loc"]

    if (typ == "cdf"):
        plt.plot(p1[:, 0], p1[:, 1], 'k-', label=l1)
        plt.plot(p2[:, 0], p2[:, 1], 'k--', label=l2)
    else:
        plt.loglog(p1[:, 0], p1[:, 1], 'k-', label=l1)
        plt.loglog(p2[:, 0], p2[:, 1], 'k--', label=l2)

    if "xlim" in dset:
        plt.xlim(dset["xlim"])
    if "ylim" in dset:
        plt.ylim(dset["ylim"])
    if "xticks" in dset:
        plt.xticks(dset["xticks"])
    if "yticks" in dset:
        plt.yticks(dset["yticks"])
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc=loc, frameon=False)

    dir = qset["dir"]
    fname = qset["file"]

    eps_file = dset["tag"].lower() + "_" + typ + "_" + ext + ".eps"
    plot_dir = os.path.join(dir, pdata.plot_dir)
    image_file = os.path.join(plot_dir, eps_file)

    prepare_file(image_file)

    plt.savefig(image_file)
    print "\t Created plot", image_file
    plt.close()
def plot_data(dist, fset, dset, qset):
	plt.interactive(False)
	plt.rcParams['font.size'] = 17.0

	dist_params = pdata.dist_map[dist]
	ext = dist_params["ext"]
	typ = dset["type"]

	emp_key = "emp_" + typ
	emp_file = fset[emp_key]
	dist_key = ext + "_" + typ
	dist_file = fset[dist_key]

	p1 = read_data(emp_file)
	p2 = read_data(dist_file)

	xlabel = dset["xlabel"]
	ylabel = dset["ylabel"]
	l1 = dset["legend1"]
	l2 = dset["legend2"].replace(pdata.dist_replace_string, dist_params["legend"])
	loc = dset["loc"]

	if ( typ == "cdf" ):
		plt.plot(p1[:,0], p1[:,1], 'k-', label=l1)
		plt.plot(p2[:,0], p2[:,1], 'k--', label=l2)
	else:
		plt.loglog(p1[:,0], p1[:,1], 'k-', label=l1)
		plt.loglog(p2[:,0], p2[:,1], 'k--', label=l2)

	if "xlim" in dset:
		plt.xlim(dset["xlim"])
	if "ylim" in dset:
		plt.ylim(dset["ylim"])
	if "xticks" in dset:
		plt.xticks(dset["xticks"])
	if "yticks" in dset:
		plt.yticks(dset["yticks"])
	plt.grid()
	plt.xlabel(xlabel)
	plt.ylabel(ylabel)
	plt.legend(loc=loc, frameon=False)

	dir = qset["dir"]
	fname = qset["file"]

	eps_file = dset["tag"].lower() + "_" + typ + "_" + ext + ".eps"
	plot_dir = os.path.join(dir, pdata.plot_dir)
	image_file = os.path.join(plot_dir, eps_file)

	prepare_file(image_file)

	plt.savefig(image_file)
	print "\t Created plot", image_file
	plt.close()
示例#8
0
def catm_ccdf():
    """
    Plot the empirical CCDF of the "catm" session lengths together with
    pre-computed MLE and MME ModLav fits on log-log axes.

    The MLE curve is evaluated over the data range; the MME curve is
    evaluated from the data minimum up to the hard-coded xmax. Nothing
    is returned; the curves are drawn on the current matplotlib figure.
    """
    # Best effort: any failure while showing a pending figure is ignored.
    try:
        plt.show()
    except:
        pass

    # These are already computed values.
    # The script is only to plot the data.
    # Please refer to the notes on how to re-compute.
    # xmax is the value from the optimization run for mme
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/catm_ses")
    mle = ml.ModLav(0.9398, 3003.4797, 0.1488)
    mme = ml.ModLav(1.0, 3243.2566, 0.1894)
    xmax = 17123.2972

    cc = util.ccdf(x)

    # gen_points is given log10 bounds, so the evaluation points are
    # 10**lx. The previous np.power(lx, 10) computed lx**10 instead.
    lx = util.gen_points(math.log10(x.min()), math.log10(x.max()), 2000)
    ex = np.power(10, lx)
    mlecc = mle.ccdf(ex)

    mme_lx = util.gen_points(math.log10(x.min()), math.log10(xmax), 2000)
    mme_ex = np.power(10, mme_lx)
    mmecc = mme.ccdf(mme_ex)

    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')
    plt.loglog(ex, mlecc, 'k--', label='MLE fit')
    plt.loglog(mme_ex, mmecc, 'k-.', label='MME fit')
    plt.xlim((x.min(), x.max() * 10))
    plt.ylim((1e-4, 1.1))
    plt.grid()
    plt.ylabel("P(X > x)")
    plt.xlabel("Session length [seconds]")
    plt.legend(loc=3)
示例#9
0
def catm_cdf():
    """
    Plot the empirical CDF of the "catm" session lengths together with
    pre-computed MLE and MME ModLav fits on linear axes.

    Both model CDFs are evaluated at the x values of the empirical CDF.
    Nothing is returned; the curves go to the current matplotlib figure.
    """
    # Best effort: any failure while showing a pending figure is ignored.
    try:
        plt.show()
    except:
        pass

    # The parameters below are pre-computed; this function only plots.
    # Refer to the notes on how to re-compute them.
    # xmax is the value from the optimization run for mme.
    samples = util.read_data(
        "/home/gautam/research/modlav-plots/seslen/catm_ses")
    mle_model = ml.ModLav(0.9398, 3003.4797, 0.1488)
    mme_model = ml.ModLav(1.0, 3243.2566, 0.1894)
    xmax = 17123.2972

    empirical = util.ecdf(samples)
    mle_curve = mle_model.cdf(empirical[:, 0])
    mme_curve = mme_model.cdf(empirical[:, 0])

    plt.plot(empirical[:, 0], empirical[:, 1], 'k-', label='Data',
             linestyle='steps')
    fitted_curves = ((mle_curve, 'k--', 'MLE fit'),
                     (mme_curve, 'k-.', 'MME fit'))
    for curve, fmt, caption in fitted_curves:
        plt.plot(empirical[:, 0], curve, fmt, label=caption)
    plt.grid()
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X <= x)")
    plt.ylim((0.0, 1.0))
    plt.legend(loc=4)
示例#10
0
def get_root_from_data(data_file):
    """
    Return one randomly chosen value from the given data file.

    The file is resolved relative to the directory named by the
    TRACE_DB_LOC environment variable.
    """
    full_path = os.path.join(os.environ["TRACE_DB_LOC"], data_file)
    candidates = util.read_data(full_path)
    last_index = len(candidates) - 1
    picker = rng.get_random()
    return candidates[picker.randint(0, last_index)]
示例#11
0
def get_root_from_data(data_file):
    """
    Return one randomly chosen value from the given data file.

    The file is resolved relative to the directory named by the
    TRACE_DB_LOC environment variable.
    """
    full_path = os.path.join(os.environ["TRACE_DB_LOC"], data_file)
    candidates = util.read_data(full_path)
    last_index = len(candidates) - 1
    picker = rng.get_random()
    return candidates[picker.randint(0, last_index)]
示例#12
0
def dice_matrix(mname):
    """
    Build a 4x4 matrix of pairwise dice() scores for one name.

    Four data files named "<mname>.<CAT[i]>" (i = 0..3) are read, each
    is sorted in place, and every pair is scored with
    dice(..., issorted=True). With the files called a, m, d, h in CAT
    order, the matrix is filled as:
        row 0: ah, ad, am
        row 1: mh, md
        row 2: dh
    All other cells stay zero. Returns the numpy matrix.
    """
    global CAT
    stem = str(mname) + "."
    paths = [stem + CAT[0], stem + CAT[1], stem + CAT[2], stem + CAT[3]]
    a, m, d, h = [read_data(path) for path in paths]

    # dice() is told the inputs are sorted, so sort each one first.
    for dataset in (a, m, d, h):
        dataset.sort()

    # Score the pairs in a fixed order: a-m, a-d, a-h, m-d, m-h, d-h.
    pairs = ((a, m), (a, d), (a, h), (m, d), (m, h), (d, h))
    am, ad, ah, md, mh, dh = [dice(left, right, issorted=True)
                              for left, right in pairs]

    simmat = np.zeros([4, 4])
    cells = (
        ((0, 0), ah), ((0, 1), ad), ((0, 2), am),
        ((1, 0), mh), ((1, 1), md),
        ((2, 0), dh),
    )
    for position, score in cells:
        simmat[position] = score

    return simmat
示例#13
0
def dice_matrix(mname):
    """
    Build a 4x4 matrix of pairwise dice() scores for one name.

    Four data files named "<mname>.<CAT[i]>" (i = 0..3) are read, each
    is sorted in place, and every pair is scored with
    dice(..., issorted=True). With the files called a, m, d, h in CAT
    order, the matrix is filled as:
        row 0: ah, ad, am
        row 1: mh, md
        row 2: dh
    All other cells stay zero. Returns the numpy matrix.
    """
    global CAT
    stem = str(mname) + "."
    paths = [stem + CAT[0], stem + CAT[1], stem + CAT[2], stem + CAT[3]]
    a, m, d, h = [read_data(path) for path in paths]

    # dice() is told the inputs are sorted, so sort each one first.
    for dataset in (a, m, d, h):
        dataset.sort()

    # Score the pairs in a fixed order: a-m, a-d, a-h, m-d, m-h, d-h.
    pairs = ((a, m), (a, d), (a, h), (m, d), (m, h), (d, h))
    am, ad, ah, md, mh, dh = [dice(left, right, issorted=True)
                              for left, right in pairs]

    simmat = np.zeros([4, 4])
    cells = (
        ((0, 0), ah), ((0, 1), ad), ((0, 2), am),
        ((1, 0), mh), ((1, 1), md),
        ((2, 0), dh),
    )
    for position, score in cells:
        simmat[position] = score

    return simmat
示例#14
0
def all_ccdf():
    """
    Plot the empirical CCDF of all session lengths together with two
    pre-computed ModLav fits (labelled MLE and MME) on log-log axes.

    Both model CCDFs are evaluated at the x values of the stored
    empirical CCDF (column 0). Nothing is returned; the curves are
    drawn on the current matplotlib figure.
    """
    # Best effort: any failure while showing a pending figure is ignored.
    try:
        plt.show()
    except:
        pass

    m1 = ml.ModLav(1.0186, 1534.7651, 0.0892)
    m2 = ml.ModLav(1.0, 1539.8953, 0.0855)

    # The raw samples are loaded but not used by the plot below.
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/all_ses")
    cc = util.read_data("/home/gautam/research/modlav-plots/seslen/all_ccdf")

    # Column 0 holds the x values and column 1 the CCDF values. The
    # second model used to be evaluated at column 1 (the probabilities)
    # although it is plotted against column 0.
    m1cc = m1.ccdf(cc[:, 0])
    m2cc = m2.ccdf(cc[:, 0])

    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')
    plt.loglog(cc[:, 0], m1cc, 'k--', label='MLE fit')
    plt.loglog(cc[:, 0], m2cc, 'k-.', label='MME fit')
    plt.grid()
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X > x)")
    plt.ylim((1e-5, 1.1))
    plt.legend(loc=3)
示例#15
0
def process_datasets(tag_file_map):

    tags = tag_file_map.keys()
    report_map = {}
    for tag in tags:
        data_file = tag_file_map[tag]
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        ccpts = np.power(
            10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        ecpts = ec[:, 0]

        lmme = fit_map["MME"][0]
        lmle = fit_map["MLE"][0]
        lfit = fit_map["FITMIN"][0]

        mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
        mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmme", mme_ec)
        util.write_data(data_file + "_ccdf.lognmme", mme_cc)

        mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
        mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmle", mle_ec)
        util.write_data(data_file + "_ccdf.lognmle", mle_cc)

        fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
        fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
        util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

    for k in report_map:
        print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

    return report_map
示例#16
0
def cath_ecdf():
    """
    Plot the stored empirical CDF of the "cath" session lengths together
    with two pre-computed ModLav fits on linear axes.

    Both model CDFs are evaluated at the x values of the empirical CDF.
    Nothing is returned; the curves go to the current matplotlib figure.
    """
    # Best effort: any failure while showing a pending figure is ignored.
    try:
        plt.show()
    except:
        pass

    opt_model = ml.ModLav(0.9147, 659.8731, 0.0493)
    noopt_model = ml.ModLav(0.7294, 644.6747, 0.0104)

    samples = util.read_data(
        "/home/gautam/research/modlav-plots/seslen/cath_ses")
    empirical = util.read_data(
        "/home/gautam/research/modlav-plots/seslen/cath_ecdf")

    opt_curve = opt_model.cdf(empirical[:, 0])
    noopt_curve = noopt_model.cdf(empirical[:, 0])

    plt.plot(empirical[:, 0], empirical[:, 1], 'k-', label='Data',
             linestyle='steps')
    fitted_curves = ((noopt_curve, 'k--', 'MLE, MT, No OPT'),
                     (opt_curve, 'k-.', 'MLE, No MT, OPT'))
    for curve, fmt, caption in fitted_curves:
        plt.plot(empirical[:, 0], curve, fmt, label=caption)
    plt.ylim((0.0, 1.0))
    plt.xlim((1.0, 10000))
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X <= x)")
    plt.grid()
    plt.legend(loc=4)
示例#17
0
def dice_matrix(mtup):
    print mtup
    global CAT
    mname = mtup[0]
    fa = str(mname) + "." + CAT[0]
    fm = str(mname) + "." + CAT[1]
    fd = str(mname) + "." + CAT[2]
    fh = str(mname) + "." + CAT[3]

    a = read_data(fa)
    m = read_data(fm)
    d = read_data(fd)
    h = read_data(fh)

    am = sim_mat(a, m)
    ad = sim_mat(a, d)
    ah = sim_mat(a, h)

    md = sim_mat(m, d)
    mh = sim_mat(m, h)

    dh = sim_mat(d, h)

    simmat = np.zeros([4, 4])

    simmat[0, 0] = ah
    simmat[0, 1] = ad
    simmat[0, 2] = am

    simmat[1, 0] = mh
    simmat[1, 1] = md

    simmat[2, 0] = dh

    print simmat
    return (mtup, simmat)
示例#18
0
def process_datasets(tag_file_map):

	tags = tag_file_map.keys()
	report_map = {}
	for tag in tags:
		data_file = tag_file_map[tag]
		x = util.read_data(data_file)
		x.sort()
		ec = util.ecdf(x)
		cc = util.ccdf(x)

		fit_map = compute_fits(x)
		insert_db_record(tag, fit_map)

		## Figure out best fit
		bfit = best_fits(fit_map)
		report_map[tag] = (bfit["best_body"], bfit["best_tail"])
		
		## Write files out to the directory
		util.write_data(data_file + "_ecdf", ec)
		util.write_data(data_file + "_ccdf", cc)

		ccpts = np.power(10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))	
		ecpts = ec[:,0]

		lmme = fit_map["MME"][0]
		lmle = fit_map["MLE"][0]
		lfit = fit_map["FITMIN"][0]

		mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
		mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmme", mme_ec)
		util.write_data(data_file + "_ccdf.lognmme", mme_cc)

		mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
		mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmle", mle_ec)
		util.write_data(data_file + "_ccdf.lognmle", mle_cc)

		fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
		fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
		util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

	for k in report_map:
		print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

	return report_map
def computeBasicStats(entry):
	tag = entry["tag"]
	fname = entry["filename"]
	x = util.read_data(fname)
	x.sort()
	s = util.get_stats(x)
	
	t = (entry["tag"], s["size"], s["min"], s["max"], s["pct10"], s["pct50"], s["pct90"], s["mu"], s["sigma"], s["cv"])
	q = "insert into basic_stats(unique_id, size, min, max, pct10, pct50, pct90, mu, sdev, cv) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
	c = getConnection()
	try:
		print "\t\t Inserting basic stats for " + tag + "[" + fname + "]"
		c.execute(q, t)
		c.commit()
		print "\t\t OK"
	except BaseException as be:
		c.rollback()
		print "\t\t FAILED"
示例#20
0
def computeBasicStats(entry):
    tag = entry["tag"]
    fname = entry["filename"]
    x = util.read_data(fname)
    x.sort()
    s = util.get_stats(x)

    t = (entry["tag"], s["size"], s["min"], s["max"], s["pct10"], s["pct50"],
         s["pct90"], s["mu"], s["sigma"], s["cv"])
    q = "insert into basic_stats(unique_id, size, min, max, pct10, pct50, pct90, mu, sdev, cv) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    c = getConnection()
    try:
        print "\t\t Inserting basic stats for " + tag + "[" + fname + "]"
        c.execute(q, t)
        c.commit()
        print "\t\t OK"
    except BaseException as be:
        c.rollback()
        print "\t\t FAILED"
示例#21
0
def gen_cdf_ccdf():
	r = RunSQL("files_and_analysis.db")
	dsets = r.sqlq("select unique_id, filename from datasets")

	for dpair in dsets:
		dset_id = dpair[0]
		dset_file = dpair[1]

		print "Processing data set - ", dset_id

		x = read_data(dset_file)
		x.sort()
		ec = ecdf(x, issorted=True)
		pts = np.power(10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
		
		# dist_list = ["LOGLOGISTIC", "LOGN", "TPARETO", "TRUNCLL"]
		dist_list = ["LOGN"]
		fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn", "TPARETO": "tp", "TRUNCLL": "tll"}

		for distname in dist_list:
			print "\t Getting distribution - ", distname
			dist = get_dist(dset_id, distname)
			dec = dist.cdf(ec[:,0])
			dcc = dist.ccdf(pts)

			fdec = np.array([ec[:,0], dec]).transpose()
			fdcc = np.array([pts, dcc]).transpose()

			op_dir = os.path.dirname(dset_file)
			op_ec_file = op_dir + "/" + os.path.basename(dset_file)+"_ecdf" + "." + fext_map[distname]
			op_cc_file = op_dir + "/" + os.path.basename(dset_file)+"_ccdf" + "." + fext_map[distname]
			
			print "\t Writing CDF - ", op_ec_file
			write_data(op_ec_file, fdec)

			print "\t Writing CCF - ", op_cc_file
			write_data(op_cc_file, fdcc)
示例#22
0
def fitlogn(dataf):
	x=np.array(read_data(dataf))
	x.sort()

	l1 = Lognormal.fromFit(x)
	l2 = Lognormal.fromFit(x,mmefit=False)

	ec = ecdf(x)
	cc = ccdf(x)

	q1 = l1.fitmetric(cdf=ec)
	q2 = l2.fitmetric(cdf=ec)

	print "File: " + dataf
	if q1 <= q2:
		print "Type: MME"
		print "Lognormal: " + str(l1)
		print "FIT: ", q1
		print "K-S: ", l1.ksmetric(cdf=ec)
	else:
		print "Type: MLE"
		print "Lognormal: " + str(l2)
		print "FIT: ", q2
		print "K-S: ", l2.ksmetric(cdf=ec)
示例#23
0
def fitlogn(dataf):
    x = np.array(read_data(dataf))
    x.sort()

    l1 = Lognormal.fromFit(x)
    l2 = Lognormal.fromFit(x, mmefit=False)

    ec = ecdf(x)
    cc = ccdf(x)

    q1 = l1.fitmetric(cdf=ec)
    q2 = l2.fitmetric(cdf=ec)

    print "File: " + dataf
    if q1 <= q2:
        print "Type: MME"
        print "Lognormal: " + str(l1)
        print "FIT: ", q1
        print "K-S: ", l1.ksmetric(cdf=ec)
    else:
        print "Type: MLE"
        print "Lognormal: " + str(l2)
        print "FIT: ", q2
        print "K-S: ", l2.ksmetric(cdf=ec)
示例#24
0
                else:
                    usage()
                    sys.exit(2)
            else:
                usage()
                sys.exit(2)
    except getopt.GetoptError, opt_err:
        print str(opt_err)
        usage()
        sys.exit(2)

    if input_file == None or lo == None or hi == None or n == None or fit1 == None or mt1 == None:
        usage()
        sys.exit(2)

    x1 = util.read_data(input_file)
    x1.sort()

    rs = paroptfit(x1, hi, lo, n, fit1, mt1)

    print rs

    ## xmax_pts = util.gen_points(lo, hi, n)

    ## ncpus = mp.cpu_count()
    ## proc_pool = Pool(ncpus)
    ## result = proc_pool.map(optfit, xmax_pts)

    # definitions to compare fit and k-s values
    # 2 = index of k-s metric in each tuple of the result
    # 3 = index of fit metric in each tuple of the result
def recursive_model(**kwargs):
    """
    Recursive forest file model simulator
    Params:
    file = data file to pick initial roots
    nroots = number of roots
    d1 = model for roots
    d1params = (param1, param2,...) parameters for d1
    d2 = model for children nodes
    d2params = (param1, param2, ...) parameters for d2
    g = probability of a new file
    nu = probability of a deletion
    n = number of iterations
    minsize = minimum file size

    Either "file" or both "d1" and "d1params" must be given; "file"
    takes precedence. "minsize" defaults to 0.0.

    Returns:
    (sizes, mult_factors, depths) as three numpy arrays, one entry per
    simulated file that is still present at the end of the run.
    """
    nroots = kwargs["nroots"]
    minsize = kwargs.get("minsize", 0.0)

    # Root sizes come from the data file when given, else from model d1.
    if "file" in kwargs:
        d1f = DFile(flist=util.read_data(kwargs["file"]), minsize=0.0)
    else:
        d1f = DFile(d=kwargs["d1"], plist=kwargs["d1params"], minsize=0.0)

    d2f = DFile(d=kwargs["d2"], plist=kwargs["d2params"])

    gf = rng.get_random()  # random.Random()
    nf = rng.get_random()  # random.Random()
    fpick = rng.get_random()  # random.Random()
    g = kwargs["g"]
    nu = kwargs["nu"]
    n = kwargs["n"]

    simvs = [{"size": sroot, "deleted": False, "mult_factor": 0.0, "depth": 0}
             for sroot in d1f.random(nroots)]

    for _ in xrange(nroots, n + 1):
        if gf.random() <= g:
            # New root. random(1) returns a one-element sequence (the
            # child branch below indexes it the same way), so store the
            # scalar; previously the whole sequence was stored as size.
            new_size = d1f.random(1)[0]
            simvs.append({"size": new_size, "deleted": False,
                          "mult_factor": 0.0, "depth": 0})
            continue

        # Draw order is kept as before: index first, then the deletion
        # draw, so the random streams are consumed identically.
        idx = fpick.randint(0, len(simvs) - 1)
        if nf.random() <= nu:
            # Deletion removes the entry outright; the "deleted" flag
            # on entries is never set to True in this function.
            simvs.pop(idx)
            continue

        # Child: scale a freshly picked parent by a factor from d2.
        mf = d2f.random(1)
        parent = simvs[fpick.randint(0, len(simvs) - 1)]
        simvs.append({
            "size": max(parent["size"] * mf[0], minsize),
            "deleted": False,
            # NOTE(review): roots start with mult_factor 0.0, so this
            # product stays 0.0 for every descendant - confirm whether
            # 1.0 was intended for roots.
            "mult_factor": parent["mult_factor"] * mf[0],
            "depth": parent["depth"] + 1,
        })

    sizev = [simv["size"] for simv in simvs]
    mfv = [simv["mult_factor"] for simv in simvs]
    depthv = [simv["depth"] for simv in simvs]

    return (np.array(sizev), np.array(mfv), np.array(depthv))
示例#26
0
def create_single_plot(tag_file_map, report_map, dset):
    """
    Plot one data set against its best lognormal fit(s) and save an EPS.

    dset supplies "type", "tag", "legend1", "loc", the axis labels and
    the optional "xlim"/"ylim"/"xticks"/"yticks". report_map[tag] gives
    the best body fit and best tail fit names. When they are the same a
    single fitted curve is drawn; otherwise the body and the tail fit
    are drawn separately. A "cdf" plot reads the "_ecdf" files and uses
    linear axes; any other type reads "_ccdf" files on log-log axes.
    The figure is written under $HOME/<main_dir>, replacing any readable
    existing file of the same name.
    """
    plt.interactive(False)
    plt.rcParams['font.size'] = 17.0

    ext = pdata.dist_map["LOGN"]["ext"]
    kind = dset["type"]
    tag = dset["tag"]

    body_fit = report_map[tag][0]
    tail_fit = report_map[tag][1]

    data_label = dset["legend1"]
    # Each fitted curve: (legend text, fit name, line format).
    if body_fit == tail_fit:
        curves = [("Body & tail: LOGN-" + body_fit, body_fit, 'k--')]
    else:
        curves = [("Body: LOGN-" + body_fit, body_fit, 'k-.'),
                  ("Tail: LOGN-" + tail_fit, tail_fit, 'k--')]

    if kind == "cdf":
        suffix, draw = "_ecdf", plt.plot
    else:
        suffix, draw = "_ccdf", plt.loglog

    # Load everything first, then draw: data curve, then the fit(s).
    base = tag_file_map[tag] + suffix
    data_pts = util.read_data(base)
    fit_pts = [util.read_data(base + ".logn" + fit_name.lower())
               for _label, fit_name, _fmt in curves]

    draw(data_pts[:, 0], data_pts[:, 1], 'k-', label=data_label)
    for pts, (label, _fit_name, fmt) in zip(fit_pts, curves):
        draw(pts[:, 0], pts[:, 1], fmt, label=label)

    legend_loc = dset["loc"]

    # Optional axis tweaks, applied only when the setting is present.
    axis_setters = (("xlim", plt.xlim), ("ylim", plt.ylim),
                    ("xticks", plt.xticks), ("yticks", plt.yticks))
    for option, setter in axis_setters:
        if option in dset:
            setter(dset[option])
    plt.grid()
    plt.xlabel(dset["xlabel"])
    plt.ylabel(dset["ylabel"])
    plt.legend(loc=legend_loc, frameon=False)

    eps_name = tag.lower() + "_" + kind + "_" + ext + ".eps"
    plot_file = os.path.join(os.path.join(os.getenv("HOME"), main_dir),
                             eps_name)

    # Drop a stale copy before saving the new figure.
    if os.access(plot_file, os.R_OK):
        os.remove(plot_file)

    plt.savefig(plot_file)
    plt.close()
示例#27
0
def pardd(fname):
    """
    Fit ModLav models to one data file and write curves plus a report.

    The samples in fname are read and sorted, and their empirical
    CCDF/ECDF are written to "<fname>_ccdf" and "<fname>_ecdf". ModLav
    is then fitted with "mlefit" and "mmefit", each with and without
    the mt option, and parmlfit.paroptfit is run for the same four
    combinations over xmax in [0.1*max, 10*max] with 500 points. For
    every resulting fit (16 labels) the model CCDF and CDF are written
    to "<fname>_<label>_ccdf" / "<fname>_<label>_ecdf", and a text
    report with basic statistics, per-fit metrics and the result of
    best_fit is written to "<fname>_metric". Nothing is returned.
    """
    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    # Search window and resolution handed to the optimised fits.
    npts = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)

    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

    def _summary(model):
        # (model, xmax, fit metric, K-S metric, difference metric) for a
        # non-optimised fit; xmax is simply the data maximum.
        return (model, xmx, model.fitmetric(cdf=ec), model.ksmetric(cdf=ec),
                model.difference(cdf=ec))

    fitlist = [("MLE", _summary(mle)),
               ("MME", _summary(mme)),
               ("MLE-MT", _summary(mle_mt)),
               ("MME-MT", _summary(mme_mt))]

    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", True)),
    ]
    # Each optimisation result holds one entry per criterion; the label
    # prefix says which criterion ("fit", "ks" or "diff") selected it.
    for prefix, key in (("", "fit"), ("KS-", "ks"), ("D-", "diff")):
        for label, result in opt_runs:
            fitlist.append((prefix + label, result[key]))

    size, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    rule = "--------------------------------------------------------------------------"

    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append(rule)
    op_str.append("Size: " + str(size))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) +
                  " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")

    best_fit_map = dict()

    for lbl, record in fitlist:
        m = record[0]
        mx = record[1]
        fitm = record[2]
        ksm = record[3]
        diffm = record[4]

        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(rule)
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(rule)
        op_str.append("\n")

        flbl = lbl.lower().replace("-", "_")
        fname_pfx = inpf + "_" + flbl

        # CCDF on log-spaced points up to this fit's xmax; CDF at the
        # x values of the empirical CDF.
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)

        mcc = m.ccdf(ex)
        mec = m.cdf(ec[:, 0])

        fmcc = np.array([ex, mcc]).transpose()
        fmec = np.array([ec[:, 0], mec]).transpose()

        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    report = [line + "\n" for line in op_str]
    report.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # The with-block closes the report file even if the write fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(report)
示例#28
0
def create_single_plot(tag_file_map, report_map, dset):
    """
    Plot one data set against its best lognormal fit(s) and save an EPS.

    dset supplies "type", "tag", "legend1", "loc", the axis labels and
    the optional "xlim"/"ylim"/"xticks"/"yticks". report_map[tag] gives
    the best body fit and best tail fit names. When they are the same a
    single fitted curve is drawn; otherwise the body and the tail fit
    are drawn separately. A "cdf" plot reads the "_ecdf" files and uses
    linear axes; any other type reads "_ccdf" files on log-log axes.
    The figure is written under $HOME/<main_dir>, replacing any readable
    existing file of the same name.
    """
    plt.interactive(False)
    plt.rcParams['font.size'] = 17.0

    ext = pdata.dist_map["LOGN"]["ext"]
    kind = dset["type"]
    tag = dset["tag"]

    body_fit = report_map[tag][0]
    tail_fit = report_map[tag][1]

    data_label = dset["legend1"]
    # Each fitted curve: (legend text, fit name, line format).
    if body_fit == tail_fit:
        curves = [("Body & tail: LOGN-" + body_fit, body_fit, 'k--')]
    else:
        curves = [("Body: LOGN-" + body_fit, body_fit, 'k-.'),
                  ("Tail: LOGN-" + tail_fit, tail_fit, 'k--')]

    if kind == "cdf":
        suffix, draw = "_ecdf", plt.plot
    else:
        suffix, draw = "_ccdf", plt.loglog

    # Load everything first, then draw: data curve, then the fit(s).
    base = tag_file_map[tag] + suffix
    data_pts = util.read_data(base)
    fit_pts = [util.read_data(base + ".logn" + fit_name.lower())
               for _label, fit_name, _fmt in curves]

    draw(data_pts[:, 0], data_pts[:, 1], 'k-', label=data_label)
    for pts, (label, _fit_name, fmt) in zip(fit_pts, curves):
        draw(pts[:, 0], pts[:, 1], fmt, label=label)

    legend_loc = dset["loc"]

    # Optional axis tweaks, applied only when the setting is present.
    axis_setters = (("xlim", plt.xlim), ("ylim", plt.ylim),
                    ("xticks", plt.xticks), ("yticks", plt.yticks))
    for option, setter in axis_setters:
        if option in dset:
            setter(dset[option])
    plt.grid()
    plt.xlabel(dset["xlabel"])
    plt.ylabel(dset["ylabel"])
    plt.legend(loc=legend_loc, frameon=False)

    eps_name = tag.lower() + "_" + kind + "_" + ext + ".eps"
    plot_file = os.path.join(os.path.join(os.getenv("HOME"), main_dir),
                             eps_name)

    # Drop a stale copy before saving the new figure.
    if os.access(plot_file, os.R_OK):
        os.remove(plot_file)

    plt.savefig(plot_file)
    plt.close()
示例#29
0
def pardd(fname):
    """
    Fit ModLav models to one data file and write curves plus a report.

    The samples in fname are read and sorted, and their empirical
    CCDF/ECDF are written to "<fname>_ccdf" and "<fname>_ecdf". ModLav
    is then fitted with "mlefit" and "mmefit", each with and without
    the mt option, and parmlfit.paroptfit is run for the same four
    combinations over xmax in [0.1*max, 10*max] with 500 points. For
    every resulting fit (16 labels) the model CCDF and CDF are written
    to "<fname>_<label>_ccdf" / "<fname>_<label>_ecdf", and a text
    report with basic statistics, per-fit metrics and the result of
    best_fit is written to "<fname>_metric". Nothing is returned.
    """
    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    # Search window and resolution handed to the optimised fits.
    npts = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)

    util.write_data(inpf + "_ccdf", cc)
    util.write_data(inpf + "_ecdf", ec)

    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

    def _summary(model):
        # (model, xmax, fit metric, K-S metric, difference metric) for a
        # non-optimised fit; xmax is simply the data maximum.
        return (model, xmx, model.fitmetric(cdf=ec), model.ksmetric(cdf=ec),
                model.difference(cdf=ec))

    fitlist = [("MLE", _summary(mle)),
               ("MME", _summary(mme)),
               ("MLE-MT", _summary(mle_mt)),
               ("MME-MT", _summary(mme_mt))]

    opt_runs = [
        ("MLE-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", False)),
        ("MLE-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mlefit", True)),
        ("MME-OPT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", False)),
        ("MME-OPT-MT", parmlfit.paroptfit(x1, hi, lo, npts, "mmefit", True)),
    ]
    # Each optimisation result holds one entry per criterion; the label
    # prefix says which criterion ("fit", "ks" or "diff") selected it.
    for prefix, key in (("", "fit"), ("KS-", "ks"), ("D-", "diff")):
        for label, result in opt_runs:
            fitlist.append((prefix + label, result[key]))

    size, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    rule = "--------------------------------------------------------------------------"

    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append(rule)
    op_str.append("Size: " + str(size))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) +
                  " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")

    best_fit_map = dict()

    for lbl, record in fitlist:
        m = record[0]
        mx = record[1]
        fitm = record[2]
        ksm = record[3]
        diffm = record[4]

        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(rule)
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(rule)
        op_str.append("\n")

        flbl = lbl.lower().replace("-", "_")
        fname_pfx = inpf + "_" + flbl

        # CCDF on log-spaced points up to this fit's xmax; CDF at the
        # x values of the empirical CDF.
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)

        mcc = m.ccdf(ex)
        mec = m.cdf(ec[:, 0])

        fmcc = np.array([ex, mcc]).transpose()
        fmec = np.array([ec[:, 0], mec]).transpose()

        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    report = [line + "\n" for line in op_str]
    report.append("RECOMMENDATIONS: " + str(recom) + "\n")

    # The with-block closes the report file even if the write fails.
    with open(inpf + "_metric", "w+") as txf:
        txf.writelines(report)
示例#30
0
				else:
					usage()
					sys.exit(2)
			else:
				usage()
				sys.exit(2)
	except getopt.GetoptError, opt_err:
		print str(opt_err)
		usage()
		sys.exit(2)
	
	if input_file == None or lo == None or hi == None or n == None or fit1 == None or mt1 == None:
		usage()
		sys.exit(2)

	x1 = util.read_data(input_file)
	x1.sort()

	rs = paroptfit(x1, hi, lo, n, fit1, mt1)

	print rs

	## xmax_pts = util.gen_points(lo, hi, n)

	## ncpus = mp.cpu_count()
	## proc_pool = Pool(ncpus)
	## result = proc_pool.map(optfit, xmax_pts)

	# definitions to compare fit and k-s values
	# 2 = index of k-s metric in each tuple of the result
	# 3 = index of fit metric in each tuple of the result