Example #1
def catm_ccdf():
	try:
		plt.show()
	except Exception:
		pass

	
	# These are precomputed values; this function only plots the data.
	# Refer to the notes on how to re-compute them.
	# xmax is the value from the optimization run for the MME fit.
	x = util.read_data("/home/gautam/research/modlav-plots/seslen/catm_ses")
	mle=ml.ModLav(0.9398,3003.4797,0.1488)
	mme=ml.ModLav(1.0,3243.2566,0.1894)
	xmax = 17123.2972

	cc = util.ccdf(x)

	lx=util.gen_points(math.log10(x.min()),math.log10(x.max()),2000)
	ex=np.power(10,lx)
	mlecc = mle.ccdf(ex)

	mme_lx=util.gen_points(math.log10(x.min()),math.log10(xmax),2000)
	mme_ex=np.power(10,mme_lx)
	mmecc = mme.ccdf(mme_ex)

	plt.loglog(cc[:,0],cc[:,1],'k-',label='Data',linestyle='steps')
	plt.loglog(ex,mlecc,'k--',label='MLE fit')
	plt.loglog(mme_ex,mmecc,'k-.',label='MME fit')
	plt.xlim((x.min(),x.max()*10))
	plt.ylim((1e-4,1.1))
	plt.grid()
	plt.ylabel("P(X > x)")
	plt.xlabel("Session length [seconds]")
	plt.legend(loc=3)
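
Each example on this page is a standalone function lifted from a larger script, so the module-level imports are not shown. The sketch below is the preamble these plotting snippets appear to assume: the math/numpy/matplotlib imports are standard, while util (read_data, write_data, ecdf, ccdf, gen_points) and ml (the ModLav distribution class) are project-local modules whose import names are inferred from usage and should be treated as assumptions.

# Assumed preamble (not part of the original snippets); module names for the
# project-local helpers are guesses based on how they are used above.
import math

import numpy as np
import matplotlib.pyplot as plt

import util          # project-local: read_data, write_data, ecdf, ccdf, gen_points
import modlav as ml  # project-local ModLav class; the module name is an assumption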
Example #2
def paroptfit(x1, hi, lo, n, fit1, mt1):
	"""
	x1: Sorted array of points
	hi: Max. xmax to estimate to
	lo: Min. xmax to estimate to
	n: Number of points from hi to lo for the estimation
	fit1: Type of fit - mlefit or mmefit
	mt1: Mirror xform - True or False
	"""

	global x, fit, mt, c, max_of_x

	## Initialize globals prior to the parallel run
	##
	
	x = x1
	max_of_x = max(x)
	fit = fit1
	mt = mt1
	c = util.ecdf(x)
	
	xmax_pts = util.gen_points(lo, hi, n)
	l_xmax_pts = xmax_pts.tolist()
	l_xmax_pts.append(max_of_x)
	l_xmax_pts.sort()
	xmax_pts = np.array(l_xmax_pts)

	ncpus = mp.cpu_count()
	proc_pool = Pool(ncpus)
	result = proc_pool.map(optfit, xmax_pts)
	
	FIT_COMP_IDX = 2

	return best_fit(result, FIT_COMP_IDX)
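
paroptfit assigns x, fit, mt, c and max_of_x as module globals before building the Pool so that, on fork-based platforms, the worker processes inherit the parent's module state and optfit can read the (potentially large) sorted array without it being pickled and re-sent for every xmax candidate. A minimal, self-contained sketch of that pattern, with hypothetical names and not the author's code, looks like this:

# Sketch of the "initialize globals, then Pool.map" pattern used above.
import numpy as np
from multiprocessing import Pool

data = None  # module-level global; fork()-started workers inherit it

def score(threshold):
    # Workers read the inherited global instead of receiving the array per task.
    kept = data[data <= threshold]
    return threshold, float(kept.mean())

def run_parallel():
    global data
    data = np.sort(np.random.lognormal(mean=1.0, sigma=1.0, size=10000))
    thresholds = np.linspace(data.min(), data.max(), 16)
    pool = Pool()
    try:
        return pool.map(score, thresholds)
    finally:
        pool.close()
        pool.join()

if __name__ == "__main__":
    for t, s in run_parallel():
        print(t, s)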
Example #3
def cath_ccdf():
	try:
		plt.show()
	except Exception:
		pass

	# These are precomputed values. See the notes for a discussion of
	# why and how they were chosen. The CDF and CCDF values for the
	# fitted distributions differ; the reasoning is also in the notes.

	m2 = ml.ModLav(0.7438, 657.1194, 0.0191)
	m3 = ml.ModLav(0.7294, 644.6747, 0.0104)
	
	x = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ses")
	cc = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ccdf")

	plt.loglog(cc[:,0],cc[:,1],'k-',label='Data',linestyle='steps')
	lx=util.gen_points(math.log10(x.min()),math.log10(x.max()),2000)
	ex=np.power(10,lx)
	m2cc=m2.ccdf(ex)
	m3cc=m3.ccdf(ex)
	plt.loglog(ex,m3cc,'k--',label='MLE,MT, No OPT')
	plt.loglog(ex,m2cc,'k-.',label='MLE, MT, OPT')
	plt.grid()
	plt.xlim((1.0,1e5))
	plt.ylim((1e-4,1.1))
	plt.ylabel("P(X > x)")
	plt.xlabel("Session length [seconds]")
	plt.legend(loc=3)
Example #4
def process_datasets(tag_file_map):

    tags = tag_file_map.keys()
    report_map = {}
    for tag in tags:
        data_file = tag_file_map[tag]
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)

        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)

        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])

        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)

        ccpts = np.power(
            10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        ecpts = ec[:, 0]

        lmme = fit_map["MME"][0]
        lmle = fit_map["MLE"][0]
        lfit = fit_map["FITMIN"][0]

        mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
        mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmme", mme_ec)
        util.write_data(data_file + "_ccdf.lognmme", mme_cc)

        mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
        mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmle", mle_ec)
        util.write_data(data_file + "_ccdf.lognmle", mle_cc)

        fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
        fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
        util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

    for k in report_map:
        print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

    return report_map
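
process_datasets writes every empirical and fitted curve as a two-column array, which is why each one is assembled with np.array([xs, ys]).transpose() before util.write_data. The helpers below are a self-contained sketch, under that assumption, of the [value, probability] layout for the ECDF and CCDF; they are illustrations, not the project's util module.

import numpy as np

def ecdf_two_col(x):
    # Column 0: sorted sample values; column 1: P(X <= value).
    xs = np.sort(np.asarray(x, dtype=float))
    p = np.arange(1, len(xs) + 1, dtype=float) / len(xs)
    return np.array([xs, p]).transpose()

def ccdf_two_col(x):
    # Column 0: sorted sample values; column 1: P(X > value).
    ec = ecdf_two_col(x)
    return np.array([ec[:, 0], 1.0 - ec[:, 1]]).transpose()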
Example #5
def process_datasets(tag_file_map):

	tags = tag_file_map.keys()
	report_map = {}
	for tag in tags:
		data_file = tag_file_map[tag]
		x = util.read_data(data_file)
		x.sort()
		ec = util.ecdf(x)
		cc = util.ccdf(x)

		fit_map = compute_fits(x)
		insert_db_record(tag, fit_map)

		## Figure out best fit
		bfit = best_fits(fit_map)
		report_map[tag] = (bfit["best_body"], bfit["best_tail"])
		
		## Write files out to the directory
		util.write_data(data_file + "_ecdf", ec)
		util.write_data(data_file + "_ccdf", cc)

		ccpts = np.power(10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))	
		ecpts = ec[:,0]

		lmme = fit_map["MME"][0]
		lmle = fit_map["MLE"][0]
		lfit = fit_map["FITMIN"][0]

		mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
		mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmme", mme_ec)
		util.write_data(data_file + "_ccdf.lognmme", mme_cc)

		mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
		mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognmle", mle_ec)
		util.write_data(data_file + "_ccdf.lognmle", mle_cc)

		fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
		fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
		util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
		util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)

	for k in report_map:
		print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]

	return report_map
Example #6
def paroptfit(x1, hi, lo, n, fit1, mt1):
    """
	x1: Sorted array of points
	hi: Max. xmax to estimate to
	lo: Min. xmax to estimate to
	n: Number of points from hi to lo for the estimation
	fit1: Type of fit - mlefit or mmefit
	mt1: Mirror xform - True or False
	"""

    global x, fit, mt, c, max_of_x

    ## Initialize globals prior to the parallel run
    ##

    x = x1
    max_of_x = max(x)
    fit = fit1
    mt = mt1
    c = util.ecdf(x)

    xmax_pts = util.gen_points(lo, hi, n)
    l_xmax_pts = xmax_pts.tolist()
    l_xmax_pts.append(max_of_x)
    l_xmax_pts.sort()
    xmax_pts = np.array(l_xmax_pts)

    ncpus = mp.cpu_count()
    proc_pool = Pool(ncpus)
    result = proc_pool.map(optfit, xmax_pts)

    # definitions to compare fit and k-s values
    # 2 = index of k-s metric in each tuple of the result
    # 3 = index of fit metric in each tuple of the result
    # !!! REMOVING KS COMP !!!
    FIT_COMP_IDX = 2
    # KS_COMP_IDX = 3
    DIFF_COMP_IDX = 3

    return {
        "fit": best_fit(result, FIT_COMP_IDX),
        "diff": best_fit(result, DIFF_COMP_IDX)
    }
Example #7
def paroptfit(x1, hi, lo, n, fit1, mt1):
	"""
	x1: Sorted array of points
	hi: Max. xmax to estimate to
	lo: Min. xmax to estimate to
	n: Number of points from hi to lo for the estimation
	fit1: Type of fit - mlefit or mmefit
	mt1: Mirror xform - True or False
	"""

	global x, fit, mt, c, max_of_x

	## Initialize globals prior to the parallel run
	##
	
	x = x1
	max_of_x = max(x)
	fit = fit1
	mt = mt1
	c = util.ecdf(x)
	
	xmax_pts = util.gen_points(lo, hi, n)
	l_xmax_pts = xmax_pts.tolist()
	l_xmax_pts.append(max_of_x)
	l_xmax_pts.sort()
	xmax_pts = np.array(l_xmax_pts)

	ncpus = mp.cpu_count()
	proc_pool = Pool(ncpus)
	result = proc_pool.map(optfit, xmax_pts)
	
	# definitions to compare fit and k-s values
	# 2 = index of k-s metric in each tuple of the result
	# 3 = index of fit metric in each tuple of the result
	# !!! REMOVING KS COMP !!!
	FIT_COMP_IDX = 2
	# KS_COMP_IDX = 3
	DIFF_COMP_IDX = 3

	return {"fit": best_fit(result, FIT_COMP_IDX), "diff": best_fit(result, DIFF_COMP_IDX)}
Example #8
def gen_cdf_ccdf():
	r = RunSQL("files_and_analysis.db")
	dsets = r.sqlq("select unique_id, filename from datasets")

	for dpair in dsets:
		dset_id = dpair[0]
		dset_file = dpair[1]

		print "Processing data set - ", dset_id

		x = read_data(dset_file)
		x.sort()
		ec = ecdf(x, issorted=True)
		pts = np.power(10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
		
		# dist_list = ["LOGLOGISTIC", "LOGN", "TPARETO", "TRUNCLL"]
		dist_list = ["LOGN"]
		fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn", "TPARETO": "tp", "TRUNCLL": "tll"}

		for distname in dist_list:
			print "\t Getting distribution - ", distname
			dist = get_dist(dset_id, distname)
			dec = dist.cdf(ec[:,0])
			dcc = dist.ccdf(pts)

			fdec = np.array([ec[:,0], dec]).transpose()
			fdcc = np.array([pts, dcc]).transpose()

			op_dir = os.path.dirname(dset_file)
			op_ec_file = op_dir + "/" + os.path.basename(dset_file)+"_ecdf" + "." + fext_map[distname]
			op_cc_file = op_dir + "/" + os.path.basename(dset_file)+"_ccdf" + "." + fext_map[distname]
			
			print "\t Writing CDF - ", op_ec_file
			write_data(op_ec_file, fdec)

			print "\t Writing CCDF - ", op_cc_file
			write_data(op_cc_file, fdcc)
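
gen_cdf_ccdf depends on a project-local RunSQL helper to pull dataset ids and filenames out of a SQLite file. Its real implementation is not shown on this page; the class below is only a plausible sqlite3-based sketch of the behaviour the call sites imply (multi-column queries return row tuples, single-column queries return plain values), and every name in it is an assumption.

import sqlite3

class RunSQL(object):
    """Plausible sketch of the query helper used above; not the original code."""

    def __init__(self, dbfile):
        self.dbfile = dbfile

    def sqlq(self, query):
        conn = sqlite3.connect(self.dbfile)
        try:
            rows = conn.execute(query).fetchall()
        finally:
            conn.close()
        # Unwrap single-column results so callers can feed them to np.array().
        if rows and len(rows[0]) == 1:
            return [r[0] for r in rows]
        return rows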
Example #9
0
def main(dt):
	slen = {"catm": "select seslen from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 order by seslen", \
			  "catd": "select seslen from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 order by seslen", \
			  "cath": "select seslen from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 order by seslen", \
			  "all": "select seslen from data_log where seslen>0 and seslen<18000 order by seslen"}

	inb = {"catm": "select bin from data_log where ucat_term='catm\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "catd": "select bin from data_log where ucat_term='catd\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "cath": "select bin from data_log where ucat_term='cath\n' and bin>0 and seslen>0 and seslen<18000 order by bin", \
			  "all": "select bin from data_log where bin>0 and seslen>0 and seslen<18000 order by bin"}
	
	outb = {"catm": "select bout from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "catd": "select bout from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "cath": "select bout from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 order by bout", \
			  "all": "select bout from data_log where bout>0 and seslen>0 and seslen<18000 order by bout"}

	to_inb = {"all": "select bin from data_log where seslen >= 18000 and bin>0 order by bin"}

	to_outb = {"all": "select bout from data_log where seslen >= 18000 and bout>0 order by bout"}
	
	tslen = {"catm": "select sum(seslen) t from data_log where ucat_term='catm\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(seslen) t from data_log where ucat_term='catd\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(seslen) t from data_log where ucat_term='cath\n' and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(seslen) t from data_log where seslen>0 and seslen<18000 group by user order by t"}

	tinb = {"catm": "select sum(bin) t from data_log where ucat_term='catm\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(bin) t from data_log where ucat_term='catd\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(bin) t from data_log where ucat_term='cath\n' and bin > 0 and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(bin) t from data_log where bin > 0 and seslen>0 and seslen<18000 group by user order by t"}
	
	toutb = {"catm": "select sum(bout) t from data_log where ucat_term='catm\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "catd": "select sum(bout) t from data_log where ucat_term='catd\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "cath": "select sum(bout) t from data_log where ucat_term='cath\n' and bout>0 and seslen>0 and seslen<18000 group by user order by t", \
			  "all": "select sum(bout) t from data_log where bout>0 and seslen>0 and seslen<18000 group by user order by t"}

	dtmap = {"slen": slen, "inb": inb, "outb": outb, "to_inb": to_inb, "to_outb": to_outb, "tslen": tslen, "tinb": tinb, "toutb": toutb}

	if dt not in dtmap:
		raise NotImplementedError("Type - " + dt + " - is not implemented")
	
	qmap = dtmap[dt]
	
	s = sql.RunSQL("azure.db")
	for i in qmap.items():
		q = i[1]
		y = s.sqlq(q)
		x = np.array(y)
		x.sort() # just making sure

		df = i[0] + "_" + dt
		ccf = i[0] + "_ccdf"
		ecf = i[0] + "_ecdf"

		cc = util.ccdf(x)
		ec = util.ecdf(x)

		util.write_data(df, x)
		util.write_data(ccf, cc)
		util.write_data(ecf, ec)


		mle = ml.ModLav.fromFit(x,fit="mlefit")
		mme = ml.ModLav.fromFit(x,fit="mmefit")
		mle_mt = ml.ModLav.fromFit(x,fit="mlefit",mt=True)
		mme_mt = ml.ModLav.fromFit(x,fit="mmefit",mt=True)

		omle = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=True,mt=False)
		omle_mt = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=True,mt=True)
		omme = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=False,mt=False)
		omme_mt = ml.optfit(x,0.1*x.max(),10*x.max(),500,mlefit=False,mt=True)

		mle_opt = omle["fit"][0]
		xm_mle_opt = omle["fit"][1]

		mle_opt_mt = omle_mt["fit"][0]
		xm_mle_opt_mt = omle_mt["fit"][1]

		mme_opt = omme["fit"][0]
		xm_mme_opt = omme["fit"][1]

		mme_opt_mt = omme_mt["fit"][0]
		xm_mme_opt_mt = omme_mt["fit"][1]

		yyy = [("MLE", mle, x.max()), \
				 ("MME", mme, x.max()), \
				 ("MLE-MT", mle_mt, x.max()), \
				 ("MME-MT", mme_mt, x.max()), \
				 ("MLE-OPT", mle_opt, xm_mle_opt), \
				 ("MLE-OPT-MT", mle_opt_mt, xm_mle_opt_mt), \
				 ("MME-OPT", mme_opt, xm_mme_opt), \
				 ("MME-OPT-MT", mme_opt_mt, xm_mme_opt_mt)]
		
		n,amin,amax,mu,sigma = len(x), x.min(), x.max(), x.mean(), x.std()
		cv = sigma/mu
		q = ms.mquantiles(x, [0.1, 0.5, 0.9])
		op_str = []
		op_str.append("BASIC STATISTICS")
		op_str.append("----------------------------------------------------------------------")
		op_str.append("Size: " + str(n))
		op_str.append("Range: " + str(amin) + " - " + str(amax))
		op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
		op_str.append("Mean: " + str(mu))
		op_str.append("Sigma: " + str(sigma))
		op_str.append("CV: " + str(cv))
		op_str.append("\n")

		for yy in yyy:
			typ = i[0]
			lbl = yy[0]
			m = yy[1]
			xmx = yy[2]

			op_str.append(lbl)
			op_str.append("----------------------------------------------------------------------")
			op_str.append("Modlav params: " + str(m))
			op_str.append("Xmax: " + str(xmx))
			op_str.append("Xmax/Max: " + str(xmx/amax))
			op_str.append("FIT metric: " + str(m.fitmetric(points=x)))
			op_str.append("K-S metric: " + str(m.ksmetric(points=x)))
			op_str.append("----------------------------------------------------------------------")
			op_str.append("\n")

			flbl = lbl.lower().replace("-", "_")
			fname_pfx = typ + "_" + flbl 

			lx = util.gen_points(math.log10(x.min()),math.log10(xmx),2000)
			ex = np.power(10, lx)

			mcc = m.ccdf(ex)
			mec = m.cdf(ec[:,0])

			fmcc = np.array([ex, mcc]).transpose()
			fmec = np.array([ec[:,0], mec]).transpose()

			util.write_data(fname_pfx+"_ccdf", fmcc)
			util.write_data(fname_pfx+"_ecdf", fmec)
	
		op1_str = []
		for s1 in op_str:
			op1_str.append(s1 + "\n")
		
		txf = open(typ+"_metric", "w+")
		txf.writelines(op1_str)
		txf.close()
Example #10
def pardd(fname):
	
	inpf = fname
	x1 = util.read_data(inpf)
	x1.sort()
	xmx = x1.max()

	n = 500
	lo = 0.1*xmx
	hi = 10*xmx

	ccf = inpf + "_ccdf"
	ecf = inpf + "_ecdf"

	cc = util.ccdf(x1)
	ec = util.ecdf(x1)

	util.write_data(ccf, cc)
	util.write_data(ecf, ec)

	mle = ml.ModLav.fromFit(x1, fit="mlefit")
	mme = ml.ModLav.fromFit(x1, fit="mmefit")
	mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
	mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

	no_mle = (mle, xmx, mle.fitmetric(cdf=ec), mle.ksmetric(cdf=ec), mle.difference(cdf=ec))
	no_mme = (mme, xmx, mme.fitmetric(cdf=ec), mme.ksmetric(cdf=ec), mme.difference(cdf=ec))
	no_mle_mt = (mle_mt, xmx, mle_mt.fitmetric(cdf=ec), mle_mt.ksmetric(cdf=ec), mle_mt.difference(cdf=ec))
	no_mme_mt = (mme_mt, xmx, mme_mt.fitmetric(cdf=ec), mme_mt.ksmetric(cdf=ec), mme_mt.difference(cdf=ec))

	omle = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", False)
	omle_mt = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", True)
	omme = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", False)
	omme_mt = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", True)

	mle_opt = omle["fit"]
	mle_opt_mt = omle_mt["fit"]
	mme_opt = omme["fit"]
	mme_opt_mt = omme_mt["fit"]

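	# NOTE (assumption): these lookups expect parmlfit.paroptfit to also return
	# a "ks" entry; the paroptfit variants shown above return only "fit" and
	# "diff", so this relies on a version that still computes the K-S optimum.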
	k_mle_opt = omle["ks"]
	k_mle_opt_mt = omle_mt["ks"]
	k_mme_opt = omme["ks"]
	k_mme_opt_mt = omme_mt["ks"]
	
	d_mle_opt = omle["diff"]
	d_mle_opt_mt = omle_mt["diff"]
	d_mme_opt = omme["diff"]
	d_mme_opt_mt = omme_mt["diff"]

	fitlist = [("MLE", no_mle), \
				  ("MME", no_mme), \
				  ("MLE-MT", no_mle_mt), \
				  ("MME-MT", no_mme_mt), \
				  ("MLE-OPT", mle_opt), \
				  ("MLE-OPT-MT", mle_opt_mt), \
				  ("MME-OPT", mme_opt), \
				  ("MME-OPT-MT", mme_opt_mt), \
				  ("KS-MLE-OPT", k_mle_opt), \
				  ("KS-MLE-OPT-MT", k_mle_opt_mt), \
				  ("KS-MME-OPT", k_mme_opt), \
				  ("KS-MME-OPT-MT", k_mme_opt_mt), \
				  ("D-MLE-OPT", d_mle_opt), \
				  ("D-MLE-OPT-MT", d_mle_opt_mt), \
				  ("D-MME-OPT", d_mme_opt), \
				  ("D-MME-OPT-MT", d_mme_opt_mt)]

	n,amin,amax,mu,sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
	cv = sigma/mu
	q = ms.mquantiles(x1, [0.1, 0.5, 0.9])
	
	op1_str = []
	op_str = []
	op_str.append("BASIC STATISTICS")
	op_str.append("--------------------------------------------------------------------------")
	op_str.append("Size: " + str(n))
	op_str.append("Range: " + str(amin) + " - " + str(amax))
	op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
	op_str.append("Mean: " + str(mu))
	op_str.append("Sigma: " + str(sigma))
	op_str.append("CV: " + str(cv))
	op_str.append("\n")
	
	best_fit_map = dict()

	for f in fitlist:
		lbl = f[0]
		m = f[1][0]
		mx = f[1][1]
		fitm = f[1][2]
		ksm = f[1][3]
		diffm = f[1][4]

		best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

		op_str.append(lbl)
		op_str.append("--------------------------------------------------------------------------")
		op_str.append("Modlav params: " + str(m))
		op_str.append("Xmax: " + str(mx))
		op_str.append("Xmax/Max: " + str(mx/xmx))
		## op_str.append("FIT Metric: " + str(m.fitmetric(points = x1)))
		## op_str.append("K-S Metric: " + str(m.ksmetric(points = x1)))
		op_str.append("FIT Metric: " + str(fitm))
		op_str.append("K-S Metric: " + str(ksm))
		op_str.append("DIFF Metric: " + str(diffm))
		op_str.append("--------------------------------------------------------------------------")
		op_str.append("\n")

		flbl = lbl.lower().replace("-", "_")
		fname_pfx = inpf + "_" + flbl

		lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
		ex = np.power(10, lx)

		mcc = m.ccdf(ex)
		mec = m.cdf(ec[:,0])

		fmcc = np.array([ex, mcc]).transpose()
		fmec = np.array([ec[:,0], mec]).transpose()

		util.write_data(fname_pfx + "_ccdf", fmcc)
		util.write_data(fname_pfx + "_ecdf", fmec)

	recom = best_fit(best_fit_map, xmx)
	for s1 in op_str:
		op1_str.append(s1 + "\n")
	op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

	txf = open(inpf + "_metric", "w+")
	txf.writelines(op1_str)
	txf.close()
Example #11
def pardd(fname):

    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()

    n = 500
    lo = 0.1 * xmx
    hi = 10 * xmx

    ccf = inpf + "_ccdf"
    ecf = inpf + "_ecdf"

    cc = util.ccdf(x1)
    ec = util.ecdf(x1)

    util.write_data(ccf, cc)
    util.write_data(ecf, ec)

    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)

    no_mle = (mle, xmx, mle.fitmetric(cdf=ec), mle.ksmetric(cdf=ec),
              mle.difference(cdf=ec))
    no_mme = (mme, xmx, mme.fitmetric(cdf=ec), mme.ksmetric(cdf=ec),
              mme.difference(cdf=ec))
    no_mle_mt = (mle_mt, xmx, mle_mt.fitmetric(cdf=ec),
                 mle_mt.ksmetric(cdf=ec), mle_mt.difference(cdf=ec))
    no_mme_mt = (mme_mt, xmx, mme_mt.fitmetric(cdf=ec),
                 mme_mt.ksmetric(cdf=ec), mme_mt.difference(cdf=ec))

    omle = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", False)
    omle_mt = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", True)
    omme = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", False)
    omme_mt = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", True)

    mle_opt = omle["fit"]
    mle_opt_mt = omle_mt["fit"]
    mme_opt = omme["fit"]
    mme_opt_mt = omme_mt["fit"]

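    # NOTE (assumption): these lookups expect parmlfit.paroptfit to also return
    # a "ks" entry; the paroptfit variants shown above return only "fit" and
    # "diff", so this relies on a version that still computes the K-S optimum.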
    k_mle_opt = omle["ks"]
    k_mle_opt_mt = omle_mt["ks"]
    k_mme_opt = omme["ks"]
    k_mme_opt_mt = omme_mt["ks"]

    d_mle_opt = omle["diff"]
    d_mle_opt_mt = omle_mt["diff"]
    d_mme_opt = omme["diff"]
    d_mme_opt_mt = omme_mt["diff"]

    fitlist = [("MLE", no_mle), \
         ("MME", no_mme), \
         ("MLE-MT", no_mle_mt), \
         ("MME-MT", no_mme_mt), \
         ("MLE-OPT", mle_opt), \
         ("MLE-OPT-MT", mle_opt_mt), \
         ("MME-OPT", mme_opt), \
         ("MME-OPT-MT", mme_opt_mt), \
         ("KS-MLE-OPT", k_mle_opt), \
         ("KS-MLE-OPT-MT", k_mle_opt_mt), \
         ("KS-MME-OPT", k_mme_opt), \
         ("KS-MME-OPT-MT", k_mme_opt_mt), \
         ("D-MLE-OPT", d_mle_opt), \
         ("D-MLE-OPT-MT", d_mle_opt_mt), \
         ("D-MME-OPT", d_mme_opt), \
         ("D-MME-OPT-MT", d_mme_opt_mt)]

    n, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])

    op1_str = []
    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append(
        "--------------------------------------------------------------------------"
    )
    op_str.append("Size: " + str(n))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) +
                  " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")

    best_fit_map = dict()

    for f in fitlist:
        lbl = f[0]
        m = f[1][0]
        mx = f[1][1]
        fitm = f[1][2]
        ksm = f[1][3]
        diffm = f[1][4]

        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)

        op_str.append(lbl)
        op_str.append(
            "--------------------------------------------------------------------------"
        )
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        ## op_str.append("FIT Metric: " + str(m.fitmetric(points = x1)))
        ## op_str.append("K-S Metric: " + str(m.ksmetric(points = x1)))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append(
            "--------------------------------------------------------------------------"
        )
        op_str.append("\n")

        flbl = lbl.lower().replace("-", "_")
        fname_pfx = inpf + "_" + flbl

        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)

        mcc = m.ccdf(ex)
        mec = m.cdf(ec[:, 0])

        fmcc = np.array([ex, mcc]).transpose()
        fmec = np.array([ec[:, 0], mec]).transpose()

        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)

    recom = best_fit(best_fit_map, xmx)
    for s1 in op_str:
        op1_str.append(s1 + "\n")
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")

    txf = open(inpf + "_metric", "w+")
    txf.writelines(op1_str)
    txf.close()