# Assumed imports; ml, util, pdata, rng, parmlfit and similar names are
# project-local modules.
import getopt
import math
import os
import sys

import numpy as np
import matplotlib.pyplot as plt


def cath_ccdf():
    try:
        plt.show()
    except:
        pass
    # These are already computed values. Please look at the notes
    # on discussion as to why and how they were chosen. The CDF
    # and CCDF values for the fitted distributions are different.
    # The reasoning for this is also in the notes.
    m2 = ml.ModLav(0.7438, 657.1194, 0.0191)
    m3 = ml.ModLav(0.7294, 644.6747, 0.0104)
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ses")
    cc = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ccdf")
    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')
    lx = util.gen_points(math.log10(x.min()), math.log10(x.max()), 2000)
    ex = np.power(10, lx)  # 10**lx: evaluation points spaced evenly on a log scale
    m2cc = m2.ccdf(ex)
    m3cc = m3.ccdf(ex)
    plt.loglog(ex, m3cc, 'k--', label='MLE, MT, No OPT')
    plt.loglog(ex, m2cc, 'k-.', label='MLE, MT, OPT')
    plt.grid()
    plt.xlim((1.0, 1e5))
    plt.ylim((1e-4, 1.1))
    plt.ylabel("P(X > x)")
    plt.xlabel("Session length [seconds]")
    plt.legend(loc=3)
def dice_matrix(mname, mtype):
    global CAT
    fa = str(mname) + "." + CAT[0]
    fm = str(mname) + "." + CAT[1]
    fd = str(mname) + "." + CAT[2]
    fh = str(mname) + "." + CAT[3]
    a = read_data(fa)
    m = read_data(fm)
    d = read_data(fd)
    h = read_data(fh)
    am = sim_mat(a, m, mtype)
    ad = sim_mat(a, d, mtype)
    ah = sim_mat(a, h, mtype)
    md = sim_mat(m, d, mtype)
    mh = sim_mat(m, h, mtype)
    dh = sim_mat(d, h, mtype)
    # Pack the six pairwise similarity scores into the upper-left corner of a
    # 4x4 matrix; the remaining entries stay zero.
    simmat = np.zeros([4, 4])
    simmat[0, 0] = ah
    simmat[0, 1] = ad
    simmat[0, 2] = am
    simmat[1, 0] = mh
    simmat[1, 1] = md
    simmat[2, 0] = dh
    return simmat
def computeFits(entry):
    tag = entry["tag"]
    fname = entry["filename"]
    x = util.read_data(fname)
    x_ecdf = util.read_data(fname + "_ecdf")
    x_ccdf = util.read_data(fname + "_ccdf")
    x.sort()
    computeLognormalFit(tag, x, x_ecdf, x_ccdf)
def plot_data(dist, fset, dset, qset):
    plt.interactive(False)
    plt.rcParams['font.size'] = 17.0
    dist_params = pdata.dist_map[dist]
    ext = dist_params["ext"]
    typ = dset["type"]
    emp_key = "emp_" + typ
    emp_file = fset[emp_key]
    dist_key = ext + "_" + typ
    dist_file = fset[dist_key]
    p1 = read_data(emp_file)
    p2 = read_data(dist_file)
    xlabel = dset["xlabel"]
    ylabel = dset["ylabel"]
    l1 = dset["legend1"]
    l2 = dset["legend2"].replace(pdata.dist_replace_string, dist_params["legend"])
    loc = dset["loc"]
    if typ == "cdf":
        plt.plot(p1[:, 0], p1[:, 1], 'k-', label=l1)
        plt.plot(p2[:, 0], p2[:, 1], 'k--', label=l2)
    else:
        plt.loglog(p1[:, 0], p1[:, 1], 'k-', label=l1)
        plt.loglog(p2[:, 0], p2[:, 1], 'k--', label=l2)
    if "xlim" in dset:
        plt.xlim(dset["xlim"])
    if "ylim" in dset:
        plt.ylim(dset["ylim"])
    if "xticks" in dset:
        plt.xticks(dset["xticks"])
    if "yticks" in dset:
        plt.yticks(dset["yticks"])
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc=loc, frameon=False)
    dir = qset["dir"]
    fname = qset["file"]
    eps_file = dset["tag"].lower() + "_" + typ + "_" + ext + ".eps"
    plot_dir = os.path.join(dir, pdata.plot_dir)
    image_file = os.path.join(plot_dir, eps_file)
    prepare_file(image_file)
    plt.savefig(image_file)
    print "\t Created plot", image_file
    plt.close()
def catm_ccdf():
    try:
        plt.show()
    except:
        pass
    # These are already computed values.
    # The script is only to plot the data.
    # Please refer to the notes on how to re-compute.
    # xmax is the value from the optimization run for mme
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/catm_ses")
    mle = ml.ModLav(0.9398, 3003.4797, 0.1488)
    mme = ml.ModLav(1.0, 3243.2566, 0.1894)
    xmax = 17123.2972
    cc = util.ccdf(x)
    lx = util.gen_points(math.log10(x.min()), math.log10(x.max()), 2000)
    ex = np.power(10, lx)  # 10**lx
    mlecc = mle.ccdf(ex)
    mme_lx = util.gen_points(math.log10(x.min()), math.log10(xmax), 2000)
    mme_ex = np.power(10, mme_lx)
    mmecc = mme.ccdf(mme_ex)
    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')
    plt.loglog(ex, mlecc, 'k--', label='MLE fit')
    plt.loglog(mme_ex, mmecc, 'k-.', label='MME fit')
    plt.xlim((x.min(), x.max() * 10))
    plt.ylim((1e-4, 1.1))
    plt.grid()
    plt.ylabel("P(X > x)")
    plt.xlabel("Session length [seconds]")
    plt.legend(loc=3)
def catm_cdf():
    try:
        plt.show()
    except:
        pass
    # These are already computed values.
    # The script is only to plot the data.
    # Please refer to the notes on how to re-compute.
    # xmax is the value from the optimization run for mme
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/catm_ses")
    mle = ml.ModLav(0.9398, 3003.4797, 0.1488)
    mme = ml.ModLav(1.0, 3243.2566, 0.1894)
    xmax = 17123.2972
    ec = util.ecdf(x)
    mleec = mle.cdf(ec[:, 0])
    mmeec = mme.cdf(ec[:, 0])
    plt.plot(ec[:, 0], ec[:, 1], 'k-', label='Data', linestyle='steps')
    plt.plot(ec[:, 0], mleec, 'k--', label='MLE fit')
    plt.plot(ec[:, 0], mmeec, 'k-.', label='MME fit')
    plt.grid()
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X <= x)")
    plt.ylim((0.0, 1.0))
    plt.legend(loc=4)
def get_root_from_data(data_file):
    """
    Get a random root value from the points specified by the data file
    """
    fp_df = os.path.join(os.environ["TRACE_DB_LOC"], data_file)
    vals = util.read_data(fp_df)
    n = len(vals)
    r = rng.get_random()
    i_rootv = r.randint(0, n - 1)
    rootv = vals[i_rootv]
    return rootv
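# Usage sketch for get_root_from_data(): the data file is resolved relative to
# the TRACE_DB_LOC environment variable, which must be set beforehand. The
# path and file name below are hypothetical.
#
#   os.environ["TRACE_DB_LOC"] = "/path/to/trace/db"
#   root_size = get_root_from_data("root_sizes.dat")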
def dice_matrix(mname):
    global CAT
    fa = str(mname) + "." + CAT[0]
    fm = str(mname) + "." + CAT[1]
    fd = str(mname) + "." + CAT[2]
    fh = str(mname) + "." + CAT[3]
    a = read_data(fa)
    m = read_data(fm)
    d = read_data(fd)
    h = read_data(fh)
    a.sort()
    m.sort()
    d.sort()
    h.sort()
    am = dice(a, m, issorted=True)
    ad = dice(a, d, issorted=True)
    ah = dice(a, h, issorted=True)
    md = dice(m, d, issorted=True)
    mh = dice(m, h, issorted=True)
    dh = dice(d, h, issorted=True)
    simmat = np.zeros([4, 4])
    simmat[0, 0] = ah
    simmat[0, 1] = ad
    simmat[0, 2] = am
    simmat[1, 0] = mh
    simmat[1, 1] = md
    simmat[2, 0] = dh
    return simmat
def all_ccdf():
    try:
        plt.show()
    except:
        pass
    m1 = ml.ModLav(1.0186, 1534.7651, 0.0892)
    m2 = ml.ModLav(1.0, 1539.8953, 0.0855)
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/all_ses")
    cc = util.read_data("/home/gautam/research/modlav-plots/seslen/all_ccdf")
    # Evaluate both fitted CCDFs at the empirical x-values (column 0);
    # column 1 holds the empirical probabilities.
    m1cc = m1.ccdf(cc[:, 0])
    m2cc = m2.ccdf(cc[:, 0])
    plt.loglog(cc[:, 0], cc[:, 1], 'k-', label='Data', linestyle='steps')
    plt.loglog(cc[:, 0], m1cc, 'k--', label='MLE fit')
    plt.loglog(cc[:, 0], m2cc, 'k-.', label='MME fit')
    plt.grid()
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X > x)")
    plt.ylim((1e-5, 1.1))
    plt.legend(loc=3)
def process_datasets(tag_file_map):
    tags = tag_file_map.keys()
    report_map = {}
    for tag in tags:
        data_file = tag_file_map[tag]
        x = util.read_data(data_file)
        x.sort()
        ec = util.ecdf(x)
        cc = util.ccdf(x)
        fit_map = compute_fits(x)
        insert_db_record(tag, fit_map)
        ## Figure out best fit
        bfit = best_fits(fit_map)
        report_map[tag] = (bfit["best_body"], bfit["best_tail"])
        ## Write files out to the directory
        util.write_data(data_file + "_ecdf", ec)
        util.write_data(data_file + "_ccdf", cc)
        ccpts = np.power(10, util.gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        ecpts = ec[:, 0]
        lmme = fit_map["MME"][0]
        lmle = fit_map["MLE"][0]
        lfit = fit_map["FITMIN"][0]
        mme_ec = np.array([ecpts, lmme.cdf(ecpts)]).transpose()
        mme_cc = np.array([ccpts, lmme.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmme", mme_ec)
        util.write_data(data_file + "_ccdf.lognmme", mme_cc)
        mle_ec = np.array([ecpts, lmle.cdf(ecpts)]).transpose()
        mle_cc = np.array([ccpts, lmle.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognmle", mle_ec)
        util.write_data(data_file + "_ccdf.lognmle", mle_cc)
        fit_ec = np.array([ecpts, lfit.cdf(ecpts)]).transpose()
        fit_cc = np.array([ccpts, lfit.ccdf(ccpts)]).transpose()
        util.write_data(data_file + "_ecdf.lognfitmin", fit_ec)
        util.write_data(data_file + "_ccdf.lognfitmin", fit_cc)
    for k in report_map:
        print k + " BODY: " + report_map[k][0] + " TAIL: " + report_map[k][1]
    return report_map
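# Usage sketch for process_datasets(), with hypothetical tags and paths. The
# function writes <data_file>_ecdf/_ccdf plus the lognormal MME/MLE/FITMIN
# curves next to each data file and returns {tag: (best_body, best_tail)}.
#
#   tag_file_map = {"CATM": "/path/to/catm_ses", "CATH": "/path/to/cath_ses"}
#   report_map = process_datasets(tag_file_map)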
def cath_ecdf():
    try:
        plt.show()
    except:
        pass
    m1 = ml.ModLav(0.9147, 659.8731, 0.0493)
    m3 = ml.ModLav(0.7294, 644.6747, 0.0104)
    x = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ses")
    ec = util.read_data("/home/gautam/research/modlav-plots/seslen/cath_ecdf")
    m1ec = m1.cdf(ec[:, 0])
    m3ec = m3.cdf(ec[:, 0])
    plt.plot(ec[:, 0], ec[:, 1], 'k-', label='Data', linestyle='steps')
    plt.plot(ec[:, 0], m3ec, 'k--', label='MLE, MT, No OPT')
    plt.plot(ec[:, 0], m1ec, 'k-.', label='MLE, No MT, OPT')
    plt.ylim((0.0, 1.0))
    plt.xlim((1.0, 10000))
    plt.xlabel("Session length [seconds]")
    plt.ylabel("P(X <= x)")
    plt.grid()
    plt.legend(loc=4)
def dice_matrix(mtup):
    print mtup
    global CAT
    mname = mtup[0]
    fa = str(mname) + "." + CAT[0]
    fm = str(mname) + "." + CAT[1]
    fd = str(mname) + "." + CAT[2]
    fh = str(mname) + "." + CAT[3]
    a = read_data(fa)
    m = read_data(fm)
    d = read_data(fd)
    h = read_data(fh)
    am = sim_mat(a, m)
    ad = sim_mat(a, d)
    ah = sim_mat(a, h)
    md = sim_mat(m, d)
    mh = sim_mat(m, h)
    dh = sim_mat(d, h)
    simmat = np.zeros([4, 4])
    simmat[0, 0] = ah
    simmat[0, 1] = ad
    simmat[0, 2] = am
    simmat[1, 0] = mh
    simmat[1, 1] = md
    simmat[2, 0] = dh
    print simmat
    return (mtup, simmat)
def computeBasicStats(entry):
    tag = entry["tag"]
    fname = entry["filename"]
    x = util.read_data(fname)
    x.sort()
    s = util.get_stats(x)
    t = (entry["tag"], s["size"], s["min"], s["max"], s["pct10"], s["pct50"],
         s["pct90"], s["mu"], s["sigma"], s["cv"])
    q = "insert into basic_stats(unique_id, size, min, max, pct10, pct50, pct90, mu, sdev, cv) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    c = getConnection()
    try:
        print "\t\t Inserting basic stats for " + tag + "[" + fname + "]"
        c.execute(q, t)
        c.commit()
        print "\t\t OK"
    except BaseException as be:
        c.rollback()
        print "\t\t FAILED"
def gen_cdf_ccdf():
    r = RunSQL("files_and_analysis.db")
    dsets = r.sqlq("select unique_id, filename from datasets")
    for dpair in dsets:
        dset_id = dpair[0]
        dset_file = dpair[1]
        print "Processing data set - ", dset_id
        x = read_data(dset_file)
        x.sort()
        ec = ecdf(x, issorted=True)
        pts = np.power(10, gen_points(math.log10(min(x)), math.log10(max(x)), 2000))
        # dist_list = ["LOGLOGISTIC", "LOGN", "TPARETO", "TRUNCLL"]
        dist_list = ["LOGN"]
        fext_map = {"LOGLOGISTIC": "ll", "LOGN": "lgn", "TPARETO": "tp", "TRUNCLL": "tll"}
        for distname in dist_list:
            print "\t Getting distribution - ", distname
            dist = get_dist(dset_id, distname)
            dec = dist.cdf(ec[:, 0])
            dcc = dist.ccdf(pts)
            fdec = np.array([ec[:, 0], dec]).transpose()
            fdcc = np.array([pts, dcc]).transpose()
            op_dir = os.path.dirname(dset_file)
            op_ec_file = op_dir + "/" + os.path.basename(dset_file) + "_ecdf" + "." + fext_map[distname]
            op_cc_file = op_dir + "/" + os.path.basename(dset_file) + "_ccdf" + "." + fext_map[distname]
            print "\t Writing CDF - ", op_ec_file
            write_data(op_ec_file, fdec)
            print "\t Writing CCDF - ", op_cc_file
            write_data(op_cc_file, fdcc)
def fitlogn(dataf):
    x = np.array(read_data(dataf))
    x.sort()
    l1 = Lognormal.fromFit(x)
    l2 = Lognormal.fromFit(x, mmefit=False)
    ec = ecdf(x)
    cc = ccdf(x)
    q1 = l1.fitmetric(cdf=ec)
    q2 = l2.fitmetric(cdf=ec)
    print "File: " + dataf
    if q1 <= q2:
        print "Type: MME"
        print "Lognormal: " + str(l1)
        print "FIT: ", q1
        print "K-S: ", l1.ksmetric(cdf=ec)
    else:
        print "Type: MLE"
        print "Lognormal: " + str(l2)
        print "FIT: ", q2
        print "K-S: ", l2.ksmetric(cdf=ec)
# Tail of the command-line option handling; the opening try/getopt loop is
# not part of this fragment.
        else:
            usage()
            sys.exit(2)
    else:
        usage()
        sys.exit(2)
except getopt.GetoptError, opt_err:
    print str(opt_err)
    usage()
    sys.exit(2)

if input_file == None or lo == None or hi == None or n == None or fit1 == None or mt1 == None:
    usage()
    sys.exit(2)

x1 = util.read_data(input_file)
x1.sort()
rs = paroptfit(x1, hi, lo, n, fit1, mt1)
print rs

## xmax_pts = util.gen_points(lo, hi, n)
## ncpus = mp.cpu_count()
## proc_pool = Pool(ncpus)
## result = proc_pool.map(optfit, xmax_pts)

# definitions to compare fit and k-s values
# 2 = index of k-s metric in each tuple of the result
# 3 = index of fit metric in each tuple of the result
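# Sketch of how the paroptfit() result dict is consumed (see pardd() below):
# each of the "fit", "ks" and "diff" keys holds the best candidate under that
# criterion as a (model, xmax, fit_metric, ks_metric, diff_metric) tuple.
#
#   best_by_fit = rs["fit"]
#   model, xmax, fitm, ksm, diffm = best_by_fit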
def recursive_model(**kwargs):
    """
    Recursive forest file model simulator
    Params:
        file = data file to pick initial roots
        nroots = number of roots
        d1 = model for roots
        d1params = (param1, param2,...) parameters for d1
        d2 = model for children nodes
        d2params = (param1, param2, ...) parameters for d2
        g = probability of a new file
        nu = probability of a deletion
        n = number of iterations
        minsize = minimum file size
    """
    sroots = None
    d1f = None
    d2f = None
    g = None
    nu = None
    minsize = 0.0
    nroots = kwargs["nroots"]
    if "minsize" in kwargs:
        minsize = kwargs["minsize"]
    if "file" in kwargs:
        fname = kwargs["file"]
        fl = util.read_data(fname)
        d1f = DFile(flist=fl, minsize=0.0)
    else:
        d1model = kwargs["d1"]
        d1params = kwargs["d1params"]
        d1f = DFile(d=d1model, plist=d1params, minsize=0.0)
    d2model = kwargs["d2"]
    d2params = kwargs["d2params"]
    d2f = DFile(d=d2model, plist=d2params)
    gf = rng.get_random()  # random.Random()
    nf = rng.get_random()  # random.Random()
    fpick = rng.get_random()  # random.Random()
    g = kwargs["g"]
    nu = kwargs["nu"]
    n = kwargs["n"]
    sroots = d1f.random(nroots)
    simvs = []
    for sroot in sroots:
        s = {"size": sroot, "deleted": False, "mult_factor": 0.0, "depth": 0}
        simvs.append(s)
    for i in xrange(nroots, n + 1):
        gvar = gf.random()
        if gvar <= g:
            a = d1f.random(1)
            sa = {"size": a, "deleted": False, "mult_factor": 0.0, "depth": 0}
            simvs.append(sa)
        else:
            idx = fpick.randint(0, len(simvs) - 1)
            nvar = nf.random()
            if nvar <= nu:
                simvs.pop(idx)
            else:
                mf = d2f.random(1)
                idx = fpick.randint(0, len(simvs) - 1)
                pick = simvs[idx]
                ns = max(pick["size"] * mf[0], minsize)
                ndel = False
                nmf = pick["mult_factor"] * mf[0]
                ndepth = pick["depth"] + 1
                simvs.append({"size": ns, "deleted": ndel, "mult_factor": nmf, "depth": ndepth})
    sizev = []
    mfv = []
    depthv = []
    for simv in simvs:
        sizev.append(simv["size"])
        mfv.append(simv["mult_factor"])
        depthv.append(simv["depth"])
    return (np.array(sizev), np.array(mfv), np.array(depthv))
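# Usage sketch for recursive_model(), with illustrative values only. The
# keyword names follow the docstring above; the d1/d2 model identifiers and
# parameter tuples are hypothetical and depend on what DFile accepts. Passing
# file="..." instead of d1/d1params seeds the roots from an empirical data
# file.
#
#   sizes, mult_factors, depths = recursive_model(
#       nroots=100,                            # number of initial roots
#       d1="lognormal", d1params=(8.0, 2.0),   # assumed root-size model
#       d2="lognormal", d2params=(0.0, 0.5),   # assumed multiplier model
#       g=0.3,                                 # probability of a new file
#       nu=0.1,                                # probability of a deletion
#       n=100000,                              # number of iterations
#       minsize=1.0)                           # minimum file size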
def create_single_plot(tag_file_map, report_map, dset):
    plt.interactive(False)
    plt.rcParams['font.size'] = 17.0
    dist_params = pdata.dist_map["LOGN"]
    ext = dist_params["ext"]
    typ = dset["type"]
    tag = dset["tag"]
    bfit = report_map[tag][0]
    tfit = report_map[tag][1]
    if bfit == tfit:
        l1 = dset["legend1"]
        l2 = "Body & tail: LOGN-" + bfit
        if typ == "cdf":
            p1 = util.read_data(tag_file_map[tag] + "_ecdf")
            p2 = util.read_data(tag_file_map[tag] + "_ecdf.logn" + bfit.lower())
            plt.plot(p1[:, 0], p1[:, 1], 'k-', label=l1)
            plt.plot(p2[:, 0], p2[:, 1], 'k--', label=l2)
        else:
            p1 = util.read_data(tag_file_map[tag] + "_ccdf")
            p2 = util.read_data(tag_file_map[tag] + "_ccdf.logn" + bfit.lower())
            plt.loglog(p1[:, 0], p1[:, 1], 'k-', label=l1)
            plt.loglog(p2[:, 0], p2[:, 1], 'k--', label=l2)
    else:
        l1 = dset["legend1"]
        l2 = "Body: LOGN-" + bfit
        l3 = "Tail: LOGN-" + tfit
        if typ == "cdf":
            p1 = util.read_data(tag_file_map[tag] + "_ecdf")
            p2 = util.read_data(tag_file_map[tag] + "_ecdf.logn" + bfit.lower())
            p3 = util.read_data(tag_file_map[tag] + "_ecdf.logn" + tfit.lower())
            plt.plot(p1[:, 0], p1[:, 1], 'k-', label=l1)
            plt.plot(p2[:, 0], p2[:, 1], 'k-.', label=l2)
            plt.plot(p3[:, 0], p3[:, 1], 'k--', label=l3)
        else:
            p1 = util.read_data(tag_file_map[tag] + "_ccdf")
            p2 = util.read_data(tag_file_map[tag] + "_ccdf.logn" + bfit.lower())
            p3 = util.read_data(tag_file_map[tag] + "_ccdf.logn" + tfit.lower())
            plt.loglog(p1[:, 0], p1[:, 1], 'k-', label=l1)
            plt.loglog(p2[:, 0], p2[:, 1], 'k-.', label=l2)
            plt.loglog(p3[:, 0], p3[:, 1], 'k--', label=l3)
    loc = dset["loc"]
    if "xlim" in dset:
        plt.xlim(dset["xlim"])
    if "ylim" in dset:
        plt.ylim(dset["ylim"])
    if "xticks" in dset:
        plt.xticks(dset["xticks"])
    if "yticks" in dset:
        plt.yticks(dset["yticks"])
    plt.grid()
    plt.xlabel(dset["xlabel"])
    plt.ylabel(dset["ylabel"])
    plt.legend(loc=loc, frameon=False)
    eps_file = tag.lower() + "_" + typ + "_" + ext + ".eps"
    plot_path = os.path.join(os.getenv("HOME"), main_dir)
    plot_file = os.path.join(plot_path, eps_file)
    if os.access(plot_file, os.R_OK):
        os.remove(plot_file)
    plt.savefig(plot_file)
    plt.close()
def pardd(fname):
    inpf = fname
    x1 = util.read_data(inpf)
    x1.sort()
    xmx = x1.max()
    n = 500
    lo = 0.1 * xmx
    hi = 10 * xmx
    ccf = inpf + "_ccdf"
    ecf = inpf + "_ecdf"
    cc = util.ccdf(x1)
    ec = util.ecdf(x1)
    util.write_data(ccf, cc)
    util.write_data(ecf, ec)
    mle = ml.ModLav.fromFit(x1, fit="mlefit")
    mme = ml.ModLav.fromFit(x1, fit="mmefit")
    mle_mt = ml.ModLav.fromFit(x1, fit="mlefit", mt=True)
    mme_mt = ml.ModLav.fromFit(x1, fit="mmefit", mt=True)
    no_mle = (mle, xmx, mle.fitmetric(cdf=ec), mle.ksmetric(cdf=ec), mle.difference(cdf=ec))
    no_mme = (mme, xmx, mme.fitmetric(cdf=ec), mme.ksmetric(cdf=ec), mme.difference(cdf=ec))
    no_mle_mt = (mle_mt, xmx, mle_mt.fitmetric(cdf=ec), mle_mt.ksmetric(cdf=ec), mle_mt.difference(cdf=ec))
    no_mme_mt = (mme_mt, xmx, mme_mt.fitmetric(cdf=ec), mme_mt.ksmetric(cdf=ec), mme_mt.difference(cdf=ec))
    omle = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", False)
    omle_mt = parmlfit.paroptfit(x1, hi, lo, n, "mlefit", True)
    omme = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", False)
    omme_mt = parmlfit.paroptfit(x1, hi, lo, n, "mmefit", True)
    mle_opt = omle["fit"]
    mle_opt_mt = omle_mt["fit"]
    mme_opt = omme["fit"]
    mme_opt_mt = omme_mt["fit"]
    k_mle_opt = omle["ks"]
    k_mle_opt_mt = omle_mt["ks"]
    k_mme_opt = omme["ks"]
    k_mme_opt_mt = omme_mt["ks"]
    d_mle_opt = omle["diff"]
    d_mle_opt_mt = omle_mt["diff"]
    d_mme_opt = omme["diff"]
    d_mme_opt_mt = omme_mt["diff"]
    fitlist = [("MLE", no_mle),
               ("MME", no_mme),
               ("MLE-MT", no_mle_mt),
               ("MME-MT", no_mme_mt),
               ("MLE-OPT", mle_opt),
               ("MLE-OPT-MT", mle_opt_mt),
               ("MME-OPT", mme_opt),
               ("MME-OPT-MT", mme_opt_mt),
               ("KS-MLE-OPT", k_mle_opt),
               ("KS-MLE-OPT-MT", k_mle_opt_mt),
               ("KS-MME-OPT", k_mme_opt),
               ("KS-MME-OPT-MT", k_mme_opt_mt),
               ("D-MLE-OPT", d_mle_opt),
               ("D-MLE-OPT-MT", d_mle_opt_mt),
               ("D-MME-OPT", d_mme_opt),
               ("D-MME-OPT-MT", d_mme_opt_mt)]
    n, amin, amax, mu, sigma = len(x1), x1.min(), xmx, x1.mean(), x1.std()
    cv = sigma / mu
    q = ms.mquantiles(x1, [0.1, 0.5, 0.9])
    op1_str = []
    op_str = []
    op_str.append("BASIC STATISTICS")
    op_str.append("--------------------------------------------------------------------------")
    op_str.append("Size: " + str(n))
    op_str.append("Range: " + str(amin) + " - " + str(amax))
    op_str.append("Quantiles: 10% - " + str(q[0]) + " 50% - " + str(q[1]) + " 90% - " + str(q[2]))
    op_str.append("Mean: " + str(mu))
    op_str.append("Sigma: " + str(sigma))
    op_str.append("CV: " + str(cv))
    op_str.append("\n")
    best_fit_map = dict()
    for f in fitlist:
        lbl = f[0]
        m = f[1][0]
        mx = f[1][1]
        fitm = f[1][2]
        ksm = f[1][3]
        diffm = f[1][4]
        best_fit_map[lbl] = (m, mx, fitm, ksm, diffm)
        op_str.append(lbl)
        op_str.append("--------------------------------------------------------------------------")
        op_str.append("Modlav params: " + str(m))
        op_str.append("Xmax: " + str(mx))
        op_str.append("Xmax/Max: " + str(mx / xmx))
        ## op_str.append("FIT Metric: " + str(m.fitmetric(points = x1)))
        ## op_str.append("K-S Metric: " + str(m.ksmetric(points = x1)))
        op_str.append("FIT Metric: " + str(fitm))
        op_str.append("K-S Metric: " + str(ksm))
        op_str.append("DIFF Metric: " + str(diffm))
        op_str.append("--------------------------------------------------------------------------")
        op_str.append("\n")
        flbl = lbl.lower().replace("-", "_")
        fname_pfx = inpf + "_" + flbl
        lx = util.gen_points(math.log10(x1.min()), math.log10(mx), 2000)
        ex = np.power(10, lx)
        mcc = m.ccdf(ex)
        mec = m.cdf(ec[:, 0])
        fmcc = np.array([ex, mcc]).transpose()
        fmec = np.array([ec[:, 0], mec]).transpose()
        util.write_data(fname_pfx + "_ccdf", fmcc)
        util.write_data(fname_pfx + "_ecdf", fmec)
    recom = best_fit(best_fit_map, xmx)
    for s1 in op_str:
        op1_str.append(s1 + "\n")
    op1_str.append("RECOMMENDATIONS: " + str(recom) + "\n")
    txf = open(inpf + "_metric", "w+")
    txf.writelines(op1_str)
    txf.close()
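# Usage sketch for pardd(): it takes the path of a one-column data file (the
# path below is hypothetical), writes the empirical and fitted CDF/CCDF files
# next to it, and writes a "<file>_metric" report ending with the recommended
# fit.
#
#   pardd("/path/to/seslen/catm_ses")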