def get_number_records_pareto(b=3, power=6, discretize=True):
    size = 10**power
    if discretize:
        sample = np.ceil(pareto.rvs(b=b, size=size))
    else:
        sample = pareto.rvs(b=b, size=size)
    fig = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig.add_subplot(111)
    ax.scatter(range(0, size), sample, c='blue', alpha=0.25, s=0.5,
               label="realization")
    record_indexes = GeneralUtils.get_record_indexes(sample)
    # for c_p in record_indexes:
    ax.scatter(record_indexes, sample[record_indexes], c='green', marker='*',
               label="records = {0}".format(len(record_indexes)))
    ax.legend()
    if discretize:
        message = 'Discretized Pareto distribution, alpha={alpha}, n=10^{power}'.format(
            alpha=b, power=power)
    else:
        message = 'Continuous Pareto distribution, alpha={alpha}, n=10^{power}'.format(
            alpha=b, power=power)
    plt.title(message)
    plt.show()
    return len(record_indexes)
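# A small, self-contained sketch of the record-counting idea used by
# get_number_records_pareto above. GeneralUtils.get_record_indexes is assumed to
# return the indexes of running maxima ("records"); the helper below is a
# hypothetical stand-in for illustration, not the project's implementation.
import numpy as np
from scipy.stats import pareto

def _record_indexes_sketch(sample):
    """Return the indexes i where sample[i] exceeds every earlier value."""
    running_max = -np.inf
    records = []
    for i, x in enumerate(sample):
        if x > running_max:
            records.append(i)
            running_max = x
    return np.array(records)

# Example: number of records in a discretized Pareto(alpha=3) sample of size 10^4.
_sample = np.ceil(pareto.rvs(b=3, size=10**4))
print(len(_record_indexes_sketch(_sample)))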
def make_population(self, n_people):
    age_class = pm.draw_ac(n_people)
    profession = pm.draw_prof(n_people)
    health_status = pm.draw_hs(n_people)
    education_level = pm.draw_el(n_people)
    income = pareto.rvs(
        b=1,
        scale=pm.person_params['income'],
        size=n_people,
    )
    cobb_c = [pm.person_params['cobb_c']] * n_people
    cobb_d = [pm.person_params['cobb_d']] * n_people
    population = pd.DataFrame(
        list(zip(age_class, profession, health_status, education_level,
                 income, cobb_c, cobb_d)),
        columns=[
            'age_class', 'profession', 'health_status', 'education_level',
            'income', 'cobb_c', 'cobb_d'
        ])
    population.to_sql('person', self.connection, index=False, if_exists='append')
def draw_rdn_powerlaw(self):
    drawed = pareto.rvs(self.alpha)
    drawed /= (self.dt**(-self.alpha) -
               self.tau_c**(-self.alpha))**(1 / (self.alpha + 1))
    if drawed <= self.tau_c:
        return drawed
    else:
        return self.draw_rdn_powerlaw()
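# A standalone sketch of the same draw-and-reject pattern as draw_rdn_powerlaw
# above, written iteratively instead of recursively. Here alpha, dt and tau_c
# stand in for the instance attributes, and the example values are arbitrary.
import numpy as np
from scipy.stats import pareto

def draw_rdn_powerlaw_sketch(alpha, dt, tau_c):
    while True:
        drawn = pareto.rvs(alpha)
        drawn /= (dt**(-alpha) - tau_c**(-alpha))**(1 / (alpha + 1))
        if drawn <= tau_c:
            return drawn

# print(draw_rdn_powerlaw_sketch(alpha=1.5, dt=0.01, tau_c=10.0))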
def random_sample(self, count):
    sequence = []
    i = 1
    while len(sequence) < count:
        sequence = np.array(pareto.rvs(
            self.__alpha, size=i * count)) + (self.__low_bound - 1)
        sequence = sequence[sequence < self.__up_bound]
        i += 1
    return sequence[:count]
def estimate_alpha_pareto_sample(size):
    sample = np.ceil(pareto.rvs(b=1.5, size=size))
    fit_estimating_discrete = pw.Fit(data=sample, discrete=True,
                                     estimate_discrete=True)
    print(fit_estimating_discrete.power_law.alpha)
    print(fit_estimating_discrete.power_law.sigma)
    print(fit_estimating_discrete.power_law.xmin)
    return fit_estimating_discrete.power_law.alpha
def get_pareto_data(shape, location, number_of_data):
    """
    Generate data from a Pareto distribution specified by shape (alpha) and
    location (gamma) parameters.

    :param shape: alpha parameter of the Pareto distribution
    :param location: gamma parameter of the Pareto distribution
    :param number_of_data: number of data samples to generate
    :return: numpy.ndarray of samples from the Pareto distribution specified
        by ``shape`` and ``location``
    """
    data = pareto.rvs(shape, scale=location, size=number_of_data)
    return data
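# Hedged usage example for get_pareto_data: 1000 draws with shape (alpha) 2.5
# and location (gamma) 1.0. Because the location is passed as scipy's scale,
# all samples are bounded below by it.
samples = get_pareto_data(shape=2.5, location=1.0, number_of_data=1000)
print(samples.min() >= 1.0, samples.mean())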
def generate_pareto(self, n):
    b = 1.5
    to_return = np.zeros(n, dtype=float)
    for i in range(n):
        while True:
            potential_value = pareto.rvs(b, size=1)
            if potential_value[0] <= 10.0:
                to_return[i] = potential_value[0] / 10.0
                break
    return to_return
def generate_synth_data(anom_mode, noise_percentage):
    if anom_mode == 1:
        n_anom1 = 200
        n_anom2 = 200
        anom_views = []
        for i in range(n_anom1):
            nViews = int(np.random.uniform(low=100, high=1000))
            falseEntries = np.ones((nViews, ))
            anom_views.append(falseEntries)  # Dirac Delta
        for i in range(n_anom2):
            nViews = int(np.random.uniform(low=100, high=1000))
            complete_prob = np.random.uniform(low=0.8, high=0.9)
            falseEntries = np.random.uniform(low=0.8, high=0.9, size=nViews)
            compl_indexes = np.random.choice(range(falseEntries.shape[0]),
                                             size=int(complete_prob * nViews))
            falseEntries[compl_indexes] = 1.0
            anom_views.append(falseEntries)
    elif anom_mode == 2:  # EXPON, param 0.05
        n_anom1 = 400
        anom_views = []
        for i in range(n_anom1):
            nViews = int(np.random.uniform(low=100, high=1000))
            views = expon.rvs(0.05, size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 3:  # Pareto
        n_anom1 = 400
        anom_views = []
        for i in range(n_anom1):
            nViews = int(np.random.uniform(low=100, high=1000))
            views = pareto.rvs(50.0, size=nViews) - 1.0
            views[views > 1.0] = 1.0
            anom_views.append(views)
    anom_views = np.array(anom_views)
    pbar = ProgressBar()
    synth_fisk_params = []
    for i in pbar(range(len(anom_views))):
        synth_single = fit_fisk(anom_views[i], 0)
        synth_fisk_params.append(
            [synth_single[0], synth_single[1][0], synth_single[1][2]])
    print(len(synth_fisk_params))
    synth_fisk_params = np.array(synth_fisk_params)
    return synth_fisk_params
def KS_MC(a, n_events, n_draws=10000):
    """
    Run MC trials of computing KS D values for data drawn from a power law
    with cumulative index a.
    """
    D = []
    for _ in range(n_draws):
        rvs = pareto.rvs(a, size=n_events)
        aML = ML_index_analytic(rvs, 1.)
        cdf = lambda x: pareto.cdf(x, aML)
        D.append(kstest(rvs, cdf)[0])
    return np.sort(D)
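# Sketch of how KS_MC above can calibrate a KS p-value. ML_index_analytic is
# defined elsewhere in that project; the closed-form Pareto ML estimate
# (n / sum(log(x / xmin)) for a known xmin) is reproduced here only so that the
# example runs on its own, and may differ from the project's version.
import numpy as np
from scipy.stats import pareto, kstest

def ML_index_analytic(rvs, xmin):
    return len(rvs) / np.sum(np.log(rvs / xmin))

data = pareto.rvs(1.7, size=50)                        # "observed" sample
a_hat = ML_index_analytic(data, 1.)
d_obs = kstest(data, lambda x: pareto.cdf(x, a_hat))[0]
D = KS_MC(a_hat, n_events=len(data), n_draws=2000)     # null distribution of D
p_value = np.mean(D >= d_obs)                          # fraction of trials at least as extreme
print(a_hat, d_obs, p_value)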
def sample_service_time(self, time):
    """Sample the next availability time for the service unit."""
    if self.service_time_dist == 'expon':
        service_avail_time = time + self.exp_dist()
    elif self.service_time_dist == 'pareto':
        service_avail_time = time + pareto.rvs(self.mean_service_time)
    self.next_service = service_avail_time
def GetBoulders(self, pos, R, distr='exp', alpha=1.1):
    """
    We choose to base all this on the volume.
    Possible distributions: 'exp' or 'pareto'.
    alpha is the shape parameter for pareto.
    """
    # new, based on volume
    boul = []
    area = R**2 * pi
    uniform = random.uniform
    nboul = int(self.boulderFreq * area)
    thresh = 0.125 / 1000.0  # minimum volume
    meanR = pow(self.meanBoulderV / pi * 3.0 / 4.0, 1.0 / 3.0)  # from mean volume
    meanV = self.meanBoulderV
    Vols = []  # randomly dist.
    if distr == 'exp':
        lambd = 1. / (meanV - thresh)  # for exponential
        for i in range(nboul):
            vol = thresh + random.expovariate(lambd)
            Vols.append(vol)
    elif distr == 'pareto':
        xm = (alpha - 1) / alpha * (meanV - thresh)  # pareto
        for i in range(nboul):
            vol = thresh + pareto.rvs(alpha, scale=xm)
            Vols.append(vol)
    else:
        raise Exception('unknown distribution %s' % str(distr))
    Vols = sorted(Vols, key=lambda vol: -vol)  # sort, biggest first to get it in there..
    for vol in Vols:
        placed = False
        radius = pow(vol / pi * 3.0 / 4.0, 1.0 / 3.0)
        # end of the new stuff
        # print "A: %f n:%f %f %f %f" % (area, nboul, i, nboul, radius)
        i = 0
        while not placed:
            placed = True
            z = -uniform(0 + radius, 0.2 + radius)
            x = uniform(pos[0] - R - radius / 2., pos[0] + R + radius / 2.)
            y = uniform(pos[1] - R - radius / 2., pos[1] + R + radius / 2.)
            for o in boul:
                if sqrt(pow(x - o.pos[0], 2) + pow(y - o.pos[1], 2) + pow(z - o.z, 2)) < (radius + o.radius):
                    placed = False
                    # print "i=%f nboul=%f could not place boulder b.x:%f b.y:%f b.z:%f x:%f y:%f z:%f" % (i, nboul, o.pos[0], o.pos[1], o.z, x, y, z)
                    break
            if placed:
                boul.append(Boulder([x, y], radius, z))
            i += 1
            if i > 50:
                print("could not place all stones.. frequency/area is obviously too big.")
                break
    return boul
def oracle():
    rs = pareto.rvs(2. / 3, size=BATCH_SIZE)
    # mask for which of the points to sample from [0,1] versus from pareto
    signs = np.random.random(BATCH_SIZE) < 1 / 2.
    # choose a random subset of the batch of samples to come from the
    # "augmented" part of the pareto
    unit_interval_points = np.random.random(BATCH_SIZE)
    rs = rs * signs + unit_interval_points * (1 - signs)
    ids = np.random.choice(len(weights), BATCH_SIZE, p=weights)
    v = random_unit_vectors()
    xs = v[:, 0] * rs + np.array(centers)[ids]
    ys = v[:, 1] * rs
    out = np.vstack((xs, ys)).T
    return out
def get_number_records_pareto_discrete_vs_continuous(b=4, power=6):
    size = 10**power
    sample = pareto.rvs(b=b, size=size)
    fig = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig.add_subplot(111)
    ax.scatter(range(0, size), sample, c='blue', alpha=0.25, s=0.5,
               label="realization")
    record_indexes = GeneralUtils.get_record_indexes(sample)
    ax.scatter(record_indexes, sample[record_indexes], c='green', marker='*',
               label="records = {0}".format(len(record_indexes)))
    ax.legend()
    message = 'Continuous Pareto distribution, alpha={alpha}, n=10^{power}'.format(
        alpha=b, power=power)
    plt.title(message)
    plt.show()
    # hill_estimator(sample)

    _integer_sample = np.ceil(sample)
    fig2 = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig2.add_subplot(111)
    ax.scatter(range(0, size), _integer_sample, c='blue', alpha=0.25, s=0.5,
               label="realization")
    integer_record_indexes = GeneralUtils.get_record_indexes(_integer_sample)
    ax.scatter(integer_record_indexes, _integer_sample[integer_record_indexes],
               c='green', marker='*',
               label="records = {0}".format(len(integer_record_indexes)))
    ax.legend()
    message = 'Discretized Pareto distribution, alpha={alpha}, n=10^{power}'.format(
        alpha=b, power=power)
    plt.title(message)
    plt.show()
    # hill_estimator(_integer_sample)
    return sample, len(record_indexes), _integer_sample, len(integer_record_indexes)
def grant_wealth(self, person_ids, bank: Bank, transaction_date):
    """Assign an initial amount of starting wealth per person."""
    accounts = query_accounts_by_person_id(person_ids, bank.name, 'cash')
    accounts['transaction_amount'] = pareto.rvs(
        b=1,
        scale=pm.person_params['income'],
        size=accounts.shape[0],
    )
    accounts = accounts[['account_id', 'transaction_amount']]
    accounts = accounts.rename(columns={'account_id': 'debit_account'})
    accounts['credit_account'] = bank.liability_account
    accounts['transaction_date'] = transaction_date
    bank.make_transactions(accounts)
def _generate_KS_cube():
    """Generate a grid of D values for KS tests of power-law behavior."""
    a_grid = np.arange(0.2, 2, 0.05)
    n_grid = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20, 25, 30, 40, 50,
              75, 100, 125, 150, 200]
    m = 1000
    Dcube = np.zeros([len(a_grid), len(n_grid), m], dtype='f4')
    for i, a in enumerate(a_grid):
        for j, n in enumerate(n_grid):
            D = []
            for k in range(m):
                rvs = pareto.rvs(a, size=n)
                aML = ML_index_analytic(rvs, 1.)
                cdf = lambda x: pareto.cdf(x, aML)
                D.append(kstest(rvs, cdf)[0])
            Dcube[i, j] = np.sort(D)
    np.save(_path_ks_grid, np.array((a_grid, n_grid, Dcube)))
def distribution_generator(flag, para_pow, para_normal, para_zip, t):
    if flag == "power_law":
        # dist = np.random.pareto(para_pow, t)
        dist = pareto.rvs(para_pow, size=t) - 1
        dist = dist / max(dist)
        # dist = 1 - np.random.power(para_pow, t)  # R^{k}
        # dist = np.random.uniform(0, 1, t)
        # dist = (dist ** para_pow)
    elif flag == "uniform":
        dist = np.random.uniform(0, 1, t)
    elif flag == "normal":
        dist = np.random.normal(0.5, para_normal, t)
    elif flag == "zipfian":
        dist = np.random.zipf(para_zip, t)
    return dist
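# Hedged usage example for distribution_generator: the "power_law" branch draws
# Pareto variates, shifts them to start near 0 and rescales by the sample
# maximum, so the largest value is exactly 1.
w = distribution_generator("power_law", para_pow=2.0, para_normal=0.1, para_zip=2.0, t=1000)
print(w.min(), w.max())   # max is 1.0 by construction; min is close to 0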
def generate_flows(num_flows):
    print("Generating flows...")
    all_flows = queue.Queue()
    inter_arrivals = np.random.exponential(1.0 / lamb, num_flows)
    flow_lengths = pareto.rvs(shape, scale=scale, size=num_flows)
    if debug_flag:
        print("average flow length:", sum(flow_lengths) / num_flows)
    prev_time = 0
    for i in range(num_flows):
        curr_time = prev_time + inter_arrivals[i]
        flow = Flow(curr_time, int(8 * packet_size * flow_lengths[i]),
                    inter_arrivals[i])
        all_flows.put(flow)
        prev_time = curr_time
        if debug_flag:
            print("flow created: (%.8f, %d)." % (flow.arrival, flow.packet_length))
    max_packets = int(max(flow_lengths))
    print("Finished flow generation.")
    print()
    return inter_arrivals, all_flows, max_packets
def simulate(algorithms, a, alpha, T, trials):
    cum_regret = np.zeros((len(algorithms), T + 1))
    for trial in range(trials):
        inst_regret = np.zeros((len(algorithms), T + 1))
        for alg in algorithms:
            alg.initialize()
        for t in range(1, T + 1):
            for i, alg in enumerate(algorithms):
                idx = alg.output()
                arm = alg.active_arms[idx]
                inst_regret[i, t] = min(abs(arm - 0.4), abs(arm - 0.8))
                y = a - min(abs(arm - 0.4), abs(arm - 0.8)) + \
                    pareto.rvs(alpha) - alpha / (alpha - 1)
                alg.observe(t, y)
        cum_regret += np.cumsum(inst_regret, axis=-1)
    return cum_regret / trials
def paretoF(sizeSamples, Ex, Dx):
    n = sizeSamples
    # generate the samples
    values = np.array([pareto.rvs(k, size=n) for x in range(1000)])
    # compute the sample means
    meanVal = values.mean(axis=1)
    plt.hist(meanVal, density=True, alpha=0.5, label='hist mean n ' + str(n))
    # expected value and sigma of the normal distribution
    mu = Ex
    sigma = math.sqrt(Dx / n)
    print('expected value =', mu)
    print('sigma =', sigma)
    # define the normal distribution
    norm_rv = sts.norm(loc=mu, scale=sigma)
    x = np.linspace(0.5, 2, 100)
    pdf = norm_rv.pdf(x)
    plt.plot(x, pdf, 'r-', lw=3, alpha=0.7, label='pareto pdf n ' + str(n))
    plt.ylabel('samples')
    plt.xlabel('$x$')
    plt.legend(loc='best')
def dispatch_rvs(alpha, xmin, xmax, discrete, size=1, random_state=None):
    if discrete:
        if np.isinf(xmax):
            ll = genzipf.rvs(alpha, xmin, size=size, random_state=random_state)
        else:
            ll = truncated_zipf.rvs(alpha, xmin, xmax, size=size,
                                    random_state=random_state)
    else:
        if np.isinf(xmax):
            ll = pareto.rvs(alpha - 1, scale=xmin, size=size,
                            random_state=random_state)
        else:
            ll = truncated_pareto.rvs(alpha - 1, float(xmax) / xmin,
                                      scale=xmin, size=size,
                                      random_state=random_state)
    return ll
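# Hedged usage example for dispatch_rvs: with discrete=False and xmax=inf only
# the plain scipy.stats.pareto branch runs, so the genzipf / truncated_zipf /
# truncated_pareto helpers defined elsewhere in the project are not needed.
samples = dispatch_rvs(alpha=2.5, xmin=1.0, xmax=np.inf, discrete=False, size=5, random_state=0)
print(samples)   # five continuous draws whose density decays like x**(-2.5)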
def iperfFM(self, poisson_mean=100, r_int=0.5, r_tcp=0.8, last_time=1.7,
            t_threshold=8, size_1=4, size_2=1, base_port=5001):
    '''Use iperf to generate the flow model.'''
    generate_flows = poisson.rvs(poisson_mean, size=30)
    for n in range(len(generate_flows)):
        flows_num = generate_flows[n]
        print(flows_num)
        for i in range(flows_num):
            # is_interior = True if bernoulli.rvs(r_int, size=1)[0] == 1 else False
            client = random.choice(self.hosts[0:5])
            server = client
            # if not is_interior:
            while server == client:
                server = random.choice(self.hosts)
            is_tcp = True if bernoulli.rvs(r_tcp, size=1)[0] == 1 else False
            flow_t = pareto.rvs(b=last_time, scale=1, size=1)[0]  # b: shape parameter
            if flow_t < t_threshold:
                flow_s = weibull_min.rvs(c=size_1, scale=5, size=1)[0]  # c: shape parameter
            else:
                flow_s = weibull_min.rvs(c=size_2, scale=1, size=1)[0]
            if is_tcp:
                # self._iperfSingleTCP(hosts=[client, server], )
                if flow_t < t_threshold:
                    self._iperfSingleTCPN(hosts=[client, server],
                                          bytes=str(flow_s) + 'K',
                                          port=base_port)  # hosts=None, bytes='10K', port=5001
                else:
                    self._iperfSingleTCPN(hosts=[client, server],
                                          bytes=str(flow_s) + 'M',
                                          port=base_port)
            else:
                # self._iperfSingleUDP(hosts=[client, server], )
                if flow_t < t_threshold:
                    self._iperfSingleUDPN(hosts=[client, server],
                                          bytes=str(flow_s) + 'K',
                                          port=base_port)
                else:
                    self._iperfSingleUDPN(hosts=[client, server],
                                          bytes=str(flow_s) + 'M',
                                          port=base_port)
            base_port = random.randint(base_port, base_port + 500)
            sleep(0.1)
    print('iperfFM test is done')
def sim(n, r, k, f, d):
    '''
    Function for creating values for the Hill estimator.
    n = sample size
    r = number of estimates
    k = parameter for the Hill estimator
    f = file path for the estimates
    d = specifies the distribution
    '''
    hill_est = np.array([])
    if d == "pareto":
        for _ in range(0, r):
            X = pareto.rvs(3, size=n)
            hill_est = np.append(hill_est, hill(X, k))
    if d == "cauchy":
        for _ in range(0, r):
            X = cauchy.rvs(1, size=n)
            hill_est = np.append(hill_est, hill(X, k))
    estimates = pd.DataFrame(hill_est, columns=np.array([d]))
    estimates.to_csv("%s/%s-%s-%s-%s.csv" % (f, n, r, k, d),
                     sep=",", encoding="utf-8")
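# Minimal sketch of the Hill estimator interface assumed by sim() above
# (hill(X, k) is defined elsewhere in that project and may differ); this
# version returns the tail-index estimate built from the k largest order
# statistics.
import numpy as np

def hill(X, k):
    x = np.sort(X)[::-1]                          # descending order statistics
    return 1.0 / np.mean(np.log(x[:k] / x[k]))    # Hill estimate of the tail index

# e.g. sim(n=1000, r=50, k=100, f=".", d="pareto") would then write estimates near 3 to CSV.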
def efr(data):  # header inferred from the call `efr(data.tolist())` below; the original snippet starts mid-function
    yList = []
    step = 0
    xList = sorted(set(data))
    dataLen = len(data)
    for elem in xList:
        count = data.count(elem)
        step += count
        yList.append(step / dataLen)
    return xList, yList


def print_quantile(n):
    quan = [0.1, 0.5, 0.7]
    for qq in quan:
        print("level =", qq, "n =", n, end=": ")
        res = []
        for i in range(5):
            data = np.zeros(n)
            for iteration in range(n):
                xi = np.random.rand()
                r = xm / xi ** (1 / a)
                data[iteration] = r
            x, y = efr(data.tolist())
            res.append(quantile(x, y, qq))
        print(res)


[a, xm] = [5, 1]
N = [5, 10, 100, 1000, 10000]
for n in N:
    print_quantile(n)
print("Theoretical quantiles: ",
      np.quantile(pareto.rvs(a, scale=xm, size=10**5), [0.1, 0.5, 0.7]))
# theoretical probability density of the random variable
left = pareto.ppf(0.01, k)
right = pareto.ppf(0.99, k)
x = np.linspace(left, right, 100)
plt.plot(x, pareto.pdf(x, k), 'r-', lw=5, alpha=0.7, label='pareto pdf')
plt.legend(loc='best')

# In[150]:

# values = np.array([pareto.rvs(k, size=10) for x in range(10)])
# print(values)
# plt.hist(values.mean(axis=1), density=True)
m = []
for _ in range(20):
    m.append(np.mean(pareto.rvs(k, size=1000)))
# plt.hist(m, density=True, alpha=0.5, label='hist samples')

mean = pareto.mean(k)
EX = mean
print(mean)
std = pareto.std(k)
print(std)
DX = std**2
print(DX)

n = 50
values = np.array([pareto.rvs(k, size=n) for x in range(1000)])
# print('values ', values)
# print('mean ', values.mean(axis=1))
meanAr = values.mean(axis=1)
def tracking_example4():
    '''
    Shows the RLS algorithm tracking a process with fat tailed noise.
    We compare performance with and without a clamped input range.
    Obviously, simply clipping the input range is a pretty naive method
    for dealing with fat tailed noise.
    '''
    np.random.seed(2718)
    N = 1000       # Length of data
    lmbda = 0.98   # Forgetting factor
    p = 6          # Filter order

    # Filter for generating d(n)
    b = [1, -0.5, .3]
    a = [1, 0.2, 0.16, -0.21, -0.0225]
    sv2 = .25      # Innovations noise variance
    beta = 1.25
    psv = 0.025

    t = np.linspace(0, 1, N)
    f = 2
    v = 2*np.sin(2*np.pi*f*t) + \
        gaussian.rvs(size=N, scale=math.sqrt(sv2))  # Innovations
    d = lfilter(b, a, v)  # Desired process
    d = d + pareto.rvs(beta, size=N, scale=math.sqrt(psv))  # fat tailed noise

    def clamp(x, M, m):
        return M*(x >= M) + m*(x <= m) + x*(x < M and x > m)

    M = 4.
    m = -4.
    d_clamp = np.array([clamp(di, M, m) for di in d])

    # Initialize RLS filter and then
    # get function closure implementing 1 step prediction

    # -------CLAMPED INPUT-----------
    F = RLS(p=p, lmbda=lmbda)
    ff_fb = one_step_pred_setup(F)
    d_hat_clamp = np.array([0] + [ff_fb(di) for di in d_clamp])[:-1]
    err_clamp = (d - d_hat_clamp)
    MSE_avg_clamp = np.average(abs(err_clamp)**2)

    # --------UNCLAMPED INPUT---------
    F = RLS(p=p, lmbda=lmbda)
    ff_fb = one_step_pred_setup(F)
    # Run it through the filter and get the error
    d_hat = np.array([0] + [ff_fb(di) for di in d])[:-1]
    err = (d - d_hat)
    MSE_avg = np.average(abs(err)**2)

    plt.subplot(2, 1, 1)
    plt.plot(range(N), d, linewidth=1, linestyle=':', label='True Process')
    plt.plot(range(N), d_clamp, linewidth=1, linestyle='--', label='Clamped Process')
    plt.plot(range(N), d_hat, linewidth=1, label='Prediction')
    plt.plot(range(N), d_hat_clamp, linewidth=1, label='Prediction (clamped)')
    plt.legend()
    plt.xlabel('$n$')
    plt.ylabel('Process Value')
    plt.title('RLS tracking a process '
              '$\\lambda = %s$, $p = %d$' % (lmbda, p))

    plt.subplot(2, 1, 2)
    plt.plot(range(N), err, linewidth=2, label='err')
    plt.plot(range(N), err_clamp, linewidth=2, label='err (clamped)')
    plt.hlines(MSE_avg, 0, N, linestyle='--', label='MSE',
               linewidth=3, color='r')
    plt.hlines(MSE_avg_clamp, 0, N, linestyle='--', label='MSE (clamped)',
               linewidth=3, color='y')
    plt.legend()
    plt.xlabel('$n$')
    plt.ylabel('Error')
    plt.title('Prediction Error')
    plt.show()
    return
def normal_and_pareto():
    return [norm.rvs(size=1)[0], pareto.rvs(b, size=1)[0]]
import butools
import butools.fitting as bfit
import butools.trace as btrace
import butools.ph as bph
from scipy.stats import erlang
from scipy.stats import pareto
import numpy as np
import matplotlib.pyplot as plt

butools.verbose = True

# generate 10000 samples from a Pareto distribution
tr = pareto.rvs(3, size=10000)

# plot the empirical CDF of the samples
(xt, yt) = btrace.CdfFromTrace(tr)
plt.plot(xt, yt, label='Trace')

# try fitting using 3, 5, and 7 states
nstates = [3, 5, 7]
for ns in nstates:
    # use phase type distributions to fit this trace
    # tr: the trace (samples) to be fitted
    # ns: number of transient states to use
    alpha, A, logli = bfit.PHFromTrace(tr, ns)
    # alpha: initial probability of the Markov chain
    # A: transition rate matrix between transient states
    # plot the fitted CDF
    intBounds = np.linspace(0, 20, 1000)
def generateMonteCarlo(rand_seed, N_MC=1e8, d=2, grid_size=4, alpha=4,
                       independence=False, tau=0, verbose=False, pickle_=True):
    """
    Approximate the angular measure with a Monte Carlo procedure.

    Params:
        @N_MC (int): MC sample size
        @d (int): dimension
        @grid_size (int): paving size of the L_inf sphere
        @alpha (float > 1): Dirichlet concentration param
        @independence (bool): central or axis concentration param
        @tau (float): min angular region from the axis to avoid on the L_inf sphere
        @verbose (bool): bool to print output
        @pickle_ (bool): dump the generated samples
    Returns:
        None if pickle_ is True, or rectangle (dict) containing the angular measure
    """
    if independence:
        # extreme features may be large independently
        if alpha:
            alpha_ = np.ones(d) / alpha
        else:
            alpha_ = np.ones(d) / d
    else:
        # extreme features are large simultaneously
        if alpha:
            alpha_ = np.ones(d) * alpha
        else:
            alpha_ = np.ones(d) * d

    # initialize paving of the L_inf sphere
    grid = np.linspace(tau, 1, num=grid_size + 1)
    # initialize rectangle dictionary
    rectangle = dict()

    # sample radius and angle for MC estimation
    theta = dirichlet.rvs(alpha=alpha_, size=N_MC, random_state=rand_seed)
    R = pareto.rvs(b=1, size=N_MC, random_state=rand_seed)
    # polar decomposition X = R * theta
    X_MC = R.reshape(-1, 1) * theta
    # norms of all the generated samples
    norm_X_MC = np.linalg.norm(X_MC, axis=1, ord=np.inf)

    # display information
    if verbose:
        print("MC sample size :", N_MC)
        print("dimension :", d)
        print("grid_size of cube:", grid_size)
        print("-------------------------------------------------------")

    # loop over the extreme generated samples
    for idx, theta_i in enumerate(theta[norm_X_MC >= 1]):
        # display information
        if idx % int(1e6) == 0:
            if verbose:
                print(idx, time.ctime())
        # current face
        key = str(theta_i.argmax()) + '-'
        # loop over the faces of the current sample
        for l in range(len(theta_i)):
            if l != theta_i.argmax():
                key += str(np.min(np.where(
                    theta_i[l] / np.linalg.norm(theta_i, ord=np.inf) <= grid)))
        # update value of the MC estimate
        if key in rectangle.keys():
            rectangle[key] += d / N_MC
        else:
            rectangle[key] = d / N_MC

    if pickle_:  # pickle_ avoids shadowing the stdlib pickle module used below
        # save result
        with open('rectangle.' + 'N_MC=' + str(np.format_float_scientific(N_MC)) +
                  '.d=' + str(d) + '.alpha=' + str(alpha_) + '.tau=' + str(tau) +
                  '.grid_size=' + str(grid_size) + '.pickle', 'wb') as handle:
            pickle.dump(rectangle, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # return rectangle dictionary
        return rectangle
def make_data(rand_seed, n_train, n_test, d, tau, quantile=95, kappa=1.,
              alpha_plus=None, alpha_minus=None, verbose=False):
    r"""
    Generate data of three different types:
    - Theta (with known margins)
    - hat_Theta (based on the Pareto standardization)
    - hat_Theta_M (based on the Pareto standardization + truncation)

    Params:
        @rand_seed (int): random seed for reproducibility
        @n (int): number of samples to generate
        @d (int): dimension of the samples
        @tau (float): angular region to avoid
        @quantile (int): [0, 100] quantile of the ||\widehat T(X_i)|| to remove samples
        @kappa (float): multiplicative factor
        @alpha_plus (float): Dirichlet concentration coef for the data labeled +1
        @alpha_minus (float): Dirichlet concentration coef for the data labeled -1
    Returns:
        @Theta_train: angle of train extreme samples based on V = d R * Theta such that R * Theta > 1
        @hat_Theta_train: angle of train extreme samples based on \hat V = \hat T(X)
        @hat_Theta_train_M:
        @Theta_test: angle of test extreme samples based on V = d R * Theta such that R * Theta > 1
        @hat_Theta_test: angle of test extreme samples based on \hat V = \hat T(X)
        @y_train: label corresponding to extreme train samples V
        @y_hat_train: label corresponding to extreme train samples \hat V
        @y_hat_train_M:
        @y_test: label of extreme test samples V
        @y_hat_test: label of extreme test samples \hat V
    """
    # threshold for selecting extremes
    k = np.sqrt(n_train)
    # sanity check to have samples on the sphere
    if not (n_train / k) * tau > d or verbose:
        print("Condition (n_train / k) * tau > d is ", (n_train / k) * tau > d)

    n_train = int(n_train / 2)  # because we generate data labeled +1 and data labeled -1
    n_test = int(n_test / 2)    # because we generate data labeled +1 and data labeled -1

    # weights of Dirichlet for data labeled +1
    if alpha_plus:
        alpha_plus_ = np.ones(d) * alpha_plus
    else:
        alpha_plus_ = np.ones(d) / d
    # weights of Dirichlet for data labeled -1
    if alpha_minus:
        alpha_minus_ = np.ones(d) * alpha_minus
    else:
        alpha_minus_ = np.ones(d) * d

    # Generate angular vectors: train data
    theta_plus_train = dirichlet.rvs(alpha=alpha_plus_, size=n_train, random_state=rand_seed)
    theta_minus_train = dirichlet.rvs(alpha=alpha_minus_, size=n_train, random_state=rand_seed)
    # Generate angular vectors: test data
    theta_plus_test = dirichlet.rvs(alpha=alpha_plus_, size=n_test,
                                    random_state=rand_seed + 700)  # + 700 for test samples
    theta_minus_test = dirichlet.rvs(alpha=alpha_minus_, size=n_test,
                                     random_state=rand_seed + 700)

    # Generate radius of train data
    R_plus_train = pareto.rvs(b=1, size=n_train, random_state=rand_seed)
    R_minus_train = pareto.rvs(b=1, size=n_train, random_state=rand_seed + 123)  # + 123 to change radius
    # Generate radius of test data
    R_plus_test = pareto.rvs(b=1, size=n_test, random_state=rand_seed + 700)
    R_minus_test = pareto.rvs(b=1, size=n_test, random_state=rand_seed + 123 + 700)

    # Build X = R * theta for train data
    X_plus_train = R_plus_train.reshape(-1, 1) * theta_plus_train
    X_minus_train = R_minus_train.reshape(-1, 1) * theta_minus_train
    # Build X = R * theta for test data
    X_plus_test = R_plus_test.reshape(-1, 1) * theta_plus_test
    X_minus_test = R_minus_test.reshape(-1, 1) * theta_minus_test

    # Build V = d * X for train data
    V_plus_train = d * X_plus_train    # [np.min(X_plus_train, axis=1) > 1]
    V_minus_train = d * X_minus_train  # [np.min(X_minus_train, axis=1) > 1]
    V_train = np.vstack((V_plus_train, V_minus_train))
    is_min_V_train_g_d = np.min(V_train, axis=1) > d
    # Build V = d * X for test data
    V_plus_test = d * X_plus_test      # [np.min(X_plus_test, axis=1) > 1]
    V_minus_test = d * X_minus_test    # [np.min(X_minus_test, axis=1) > 1]
    V_test = np.vstack((V_plus_test, V_minus_test))
    # sanity check
    is_min_V_test_g_d = np.min(V_test, axis=1) > d

    # Labels for both V_train and V_test
    y_train = np.hstack((np.ones(n_train), np.zeros(n_train)))
    y_test = np.hstack((np.ones(n_test), np.zeros(n_test)))
    # labels for both train and test for all data
    y_hat_train = np.hstack((np.ones(n_train), np.zeros(n_train)))
    y_hat_test = np.hstack((np.ones(n_test), np.zeros(n_test)))

    # building X_train and X_test
    X_train = np.vstack((X_plus_train, X_minus_train))
    X_test = np.vstack((X_plus_test, X_minus_test))

    # Pareto standardization
    order_X = order(X_train)
    hat_V_train = transform(order_X, X_train)
    hat_V_test = transform(order_X, X_test)

    # Computing norms for train and test
    norm_V_train = np.linalg.norm(V_train, axis=1, ord=np.inf)
    norm_hat_V_train = np.linalg.norm(hat_V_train, axis=1, ord=np.inf)
    norm_V_test = np.linalg.norm(V_test, axis=1, ord=np.inf)
    norm_hat_V_test = np.linalg.norm(hat_V_test, axis=1, ord=np.inf)

    # Assessing train samples with norms greater than n / k
    is_extreme_V_train = norm_V_train > (2 * n_train) / k  # we multiply by 2 because we divided by 2 before
    is_extreme_hat_V_train = norm_hat_V_train > (2 * n_train) / k

    # M value
    M = np.percentile(norm_hat_V_train[is_extreme_hat_V_train], q=quantile)
    if M < 1:
        print("Warning M < 1:", M)
    # Assessing train samples with norms smaller than M
    # (M being a quantile of ||\hat V||[||\hat V|| > n/k])
    is_smaller = norm_hat_V_train <= M

    # Computing angular vectors
    Theta_train = V_train / norm_V_train.reshape(-1, 1)
    Theta_test = V_test / norm_V_test.reshape(-1, 1)
    hat_Theta_train = hat_V_train / norm_hat_V_train.reshape(-1, 1)
    hat_Theta_test = hat_V_test / norm_hat_V_test.reshape(-1, 1)

    # Assessing train samples which are tau far from the axes
    is_tau_valid_V_train = np.min(Theta_train, axis=1) > tau
    is_tau_valid_hat_V_train = np.min(hat_Theta_train, axis=1) > tau
    # Assessing test samples which are tau far from the axes
    is_tau_valid_V_test = np.min(Theta_test, axis=1) > tau
    is_tau_valid_hat_V_test = np.min(hat_Theta_test, axis=1) > tau

    # Finding the samples which verify both conditions for F known and F unknown
    # bool_condition_train = (is_extreme_V_train * is_extreme_hat_V_train) * (is_tau_valid_V_train * is_tau_valid_hat_V_train)

    # defining the angular train samples on the truncated subspace
    hat_Theta_train_M = hat_Theta_train[is_extreme_hat_V_train *
                                        is_tau_valid_hat_V_train * is_smaller]
    y_hat_train_M = y_hat_train[is_extreme_hat_V_train *
                                is_tau_valid_hat_V_train * is_smaller]

    # defining the angular train samples verifying all conditions
    Theta_train = Theta_train[is_extreme_V_train * is_tau_valid_V_train]  # [bool_condition_train]
    hat_Theta_train = hat_Theta_train[is_extreme_hat_V_train * is_tau_valid_hat_V_train]

    is_extreme_V_test = norm_V_test >= kappa * (2 * n_train) / k  # we multiply by 2 because we divided by 2 before
    is_extreme_hat_V_test = norm_hat_V_test >= kappa * (2 * n_train) / k
    # bool_condition_test = (is_extreme_V_test * is_extreme_hat_V_test) * (is_tau_valid_V_test * is_tau_valid_hat_V_test)
    hat_Theta_test = hat_Theta_test[is_extreme_hat_V_test * is_tau_valid_hat_V_test]
    Theta_test = Theta_test[is_extreme_V_test * is_tau_valid_V_test]

    if verbose:
        # consequence of (n_train / k) * tau > d
        print((is_min_V_train_g_d[is_extreme_V_train * is_tau_valid_V_train]).mean())
        print((is_tau_valid_V_train * is_extreme_V_train ==
               is_tau_valid_hat_V_train * is_extreme_hat_V_train).mean())
        print("shapes")
        print("Theta_train.shape:", Theta_train.shape)
        print("hat_Theta_train.shape:", hat_Theta_train.shape)
        print("Theta_test.shape:", Theta_test.shape)
        print("hat_Theta_test.shape:", hat_Theta_test.shape)

    # Focusing on extreme samples
    y_train = y_train[is_extreme_V_train * is_tau_valid_V_train]
    y_hat_train = y_hat_train[is_extreme_hat_V_train * is_tau_valid_hat_V_train]
    y_test = y_test[is_extreme_V_test * is_tau_valid_V_test]
    y_hat_test = y_hat_test[is_extreme_hat_V_test * is_tau_valid_hat_V_test]

    return (Theta_train, hat_Theta_train, hat_Theta_train_M, Theta_test,
            hat_Theta_test, y_train, y_hat_train, y_hat_train_M, y_test,
            y_hat_test)
def generateParetoStandardization(rand_seed, n=100, d=2, grid_size=4, alpha=4,
                                  independence=False, tau=0, quantile=66,
                                  verbose=False, pickle_=True):
    """
    Estimate the influence of the Pareto standardization.
    - Phi (with known margins)
    - hat_Phi (based on the Pareto standardization)
    - hat_Phi_M (based on the Pareto standardization + truncation)

    Params:
        @rand_seed (int): random seed for reproducibility
        @n (int): number of generated data
        @d (int): sample size
        @grid_size (int): the paving size of the L_inf sphere
        @alpha (float > 1): Dirichlet concentration param
        @independence (bool): central or axis concentration param
        @tau (float): min angular region from the axis to avoid on the L_inf sphere
        @quantile ([0, 100]): build M such that a ratio quantile of extreme points are kept
        @verbose (bool): bool to print output
        @pickle_ (bool): dump the generated samples
    Returns:
        None if pickle_ is True, or rectangle (dict) containing the angular measure
    """
    ####################################
    # sanity check to make sure that the given n is an int
    n = int(n)
    # setting k to define the extreme region threshold
    k = np.sqrt(n)
    # sanity check
    if np.sqrt(n) * tau <= d:
        print("error n/k * tau > d is false:", np.sqrt(n) * tau, d)

    if independence:
        # extreme features may be large independently
        if alpha:
            alpha_ = np.ones(d) * 1 / alpha
        else:
            alpha_ = np.ones(d) * 1 / d
    else:
        # extreme features are large simultaneously
        if alpha:
            alpha_ = np.ones(d) * alpha
        else:
            alpha_ = np.ones(d) * d

    grid = np.linspace(tau, 1, num=grid_size + 1)  # np.arange(tau, 1 + 1/grid_size, 1/grid_size)

    # initialize all rectangle dictionaries
    # true input data
    rectangle_V = dict()
    # Pareto standardized data
    rectangle_hat_V = dict()
    # Pareto standardized data + truncation
    rectange_hat_V_M = dict()

    # sample radius and angular components for the simulation study
    theta = dirichlet.rvs(alpha=alpha_, size=n, random_state=rand_seed)
    R = pareto.rvs(b=1, size=n, random_state=rand_seed)
    X = R.reshape(-1, 1) * theta
    V = d * X  # [np.min(X, axis=1) > 1]
    hat_V = transform(order(X), X)

    norm_V = np.linalg.norm(V, axis=1, ord=np.inf)
    norm_hat_V = np.linalg.norm(hat_V, axis=1, ord=np.inf)
    is_extreme_V = norm_V >= n / k
    is_extreme_hatV = norm_hat_V >= n / k

    # all samples with norms smaller than a given quantile
    M = np.percentile(norm_hat_V[is_extreme_hatV], q=quantile)
    if M < 1:
        print("Warning M < 1:", M)
    is_smaller = norm_hat_V <= M

    is_V_tau_valide = np.min(V / norm_V.reshape(-1, 1), axis=1) >= tau
    is_hat_V_tau_valide = np.min(hat_V / norm_hat_V.reshape(-1, 1), axis=1) >= tau

    N_removed = np.sum(is_extreme_hatV * is_hat_V_tau_valide) - \
        np.sum(is_extreme_hatV * is_hat_V_tau_valide * is_smaller)

    if verbose:
        print("The dimension of the problem :", d)
        print("The size of the grid for the cube :", grid_size)
        print("The number of points to compute phi by MC :", n)
        print("-------------------------------------------------------")

    # Computing the mass on V = d * X[X > 1]
    for idx, V_i in enumerate(V[is_extreme_V * is_V_tau_valide]):
        key = str(V_i.argmax()) + '-'
        for l in range(len(V_i)):
            if l != V_i.argmax():
                key += str(np.min(np.where(
                    V_i[l] / np.linalg.norm(V_i, ord=np.inf) <= grid)))
        if key in rectangle_V.keys():
            rectangle_V[key] += 1 / k  # * is_tau_valide
        else:
            rectangle_V[key] = 1 / k   # * is_tau_valide

    # -----------------------------------------------------------------------
    # Computing the mass on hat V with the regular estimator
    for idx, V_i in enumerate(hat_V[is_extreme_hatV * is_hat_V_tau_valide]):
        key = str(V_i.argmax()) + '-'
        for l in range(len(V_i)):
            if l != V_i.argmax():
                key += str(np.min(np.where(
                    V_i[l] / np.linalg.norm(V_i, ord=np.inf) <= grid)))
        if key in rectangle_hat_V.keys():
            rectangle_hat_V[key] += 1 / k
        else:
            rectangle_hat_V[key] = 1 / k

    # -----------------------------------------------------------------------
    # Computing the mass on hat V with the truncated estimator
    for idx, V_i in enumerate(hat_V[is_extreme_hatV * is_hat_V_tau_valide * is_smaller]):
        key = str(V_i.argmax()) + '-'
        for l in range(len(V_i)):
            if l != V_i.argmax():
                key += str(np.min(np.where(
                    V_i[l] / np.linalg.norm(V_i, ord=np.inf) <= grid)))
        if key in rectange_hat_V_M.keys():
            rectange_hat_V_M[key] += (M / (M - 1)) * (1 / k)
        else:
            rectange_hat_V_M[key] = (M / (M - 1)) * (1 / k)

    if pickle_:
        with open('rectangleV.' + 'n=' + str(np.format_float_scientific(n)) +
                  '.d=' + str(d) + '.alpha=' + str(alpha_) + '.tau=' + str(tau) +
                  '.grid_size=' + str(grid_size) + '.pickle', 'wb') as handle:
            pickle.dump(rectangle_V, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        return rectangle_hat_V, rectange_hat_V_M, rectangle_V, N_removed
# In[42]:

import math
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pareto
import scipy.stats as sts

# get_ipython().magic(u'matplotlib inline')

# In[43]:

# Choose the parameter for the distribution
k = 10

# Generate a sample of size 1000 from it
sampleRange = pareto.rvs(k, size=1000)

# Plot a histogram of the sample and draw the theoretical density of the
# random variable on top of it
plt.hist(sampleRange, density=True, bins=20, alpha=0.5, label='hist samples pareto')
plt.ylabel('number of samples')
plt.xlabel('$x$')

# theoretical probability density of the random variable
left = pareto.ppf(0.01, k)
right = pareto.ppf(0.99, k)
x = np.linspace(left, right, 100)
plt.plot(x, pareto.pdf(x, k), 'r-', lw=5, alpha=0.7, label='pareto pdf')
plt.legend(loc='best')

# In[57]:
# Display the probability density function (``pdf``):

x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
ax.plot(x, pareto.pdf(x, b), 'r-', lw=5, alpha=0.6, label='pareto pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = pareto(b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = pareto.ppf([0.001, 0.5, 0.999], b)
np.allclose([0.001, 0.5, 0.999], pareto.cdf(vals, b))
# True

# Generate random numbers:

r = pareto.rvs(b, size=1000)

# And compare the histogram:

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
# m = []
# for _ in range(20):
#     m.append(np.mean(pareto.rvs(k, size=1000)))
# plt.hist(m, density=True, alpha=0.5, label='hist samples')

EX = pareto.mean(k)
print(EX)
std = pareto.std(k)
print(std)
DX = std**2
print(DX)
print()

# For several values of n (e.g. 5, 10, 50) generate 1000 samples of size n and
# plot histograms of the distributions of their sample means.
n = 100
values = np.array([pareto.rvs(k, size=n) for x in range(1000)])
meanVal = values.mean(axis=1)
plt.hist(meanVal, density=True, alpha=0.5, label='hist mean n ' + str(n))

mu = EX
sigma = math.sqrt(DX / n)
# define a normally distributed random variable
norm_rv = sts.norm(loc=mu, scale=sigma)
x = np.linspace(0.5, 2, 100)
# print(x)
pdf = norm_rv.pdf(x)
plt.plot(x, pdf, 'r-', lw=3, alpha=0.7, label='pareto pdf')
plt.show()
# hill = HillEstimator(reg_sizes, "Reflexive pareto RW", n)
# hill.plot_estimator()
# qi = QiEstimator(sizes=reg_sizes, name="Reflexive pareto RW", step_n=n, r=r)
# qi.plot_estimator()


def powerlaw_usage():
    n = 100000
    pareto_rw = ParetoReflectiveRandomWalk(step_n=n, name="Reflexive pareto RW")
    pareto_rw.generate_path()
    reg_sizes = pareto_rw.get_regeneration_block_sizes()
    fit = powerlaw.Fit(reg_sizes)
    return fit.power_law.alpha


# print(powerlaw_usage())

steps = pareto.rvs(1, size=1000)
fit = powerlaw.Fit(steps)
print(fit.power_law.alpha)
# from scipy.stats.rv_continuous import rvs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statistics
from scipy.stats import pareto
from matplotlib import colors as mcolors

colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)

b = 0.9
np.random.seed(seed=200)
case01 = pareto.rvs(b, loc=0, scale=1, size=5)
np.random.seed(seed=200)
case02 = pareto.rvs(b, loc=0, scale=1, size=5)
assert any(case01 == case02)

'''
rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
r <- rnorm2(100,4,1)
mean(r)  ## 4
sd(r)    ## 1
'''

'''
# Define real pars mu and sigma, sample 100x
trueMu <- 5
trueSig <- 2
def nuevo_enlace2(i):
    # A node is selected and connected to another nearby one. The position of
    # the first node is perturbed.
    global g, delta_omega, b, delta_c
    max_intentos = 1000
    exito = False
    n_nodes = len(g.nodes())
    n_enlaces = len(g.edges())
    max_enalces = 0.5 * n_nodes * (n_nodes - 1.0)
    x0 = g.node[i]['x']
    y0 = g.node[i]['y']
    next_neighbors = []
    # J: first collect the second neighbours
    for ii in nx.neighbors(g, i):
        for nnb in nx.neighbors(g, ii):
            # print('i=', i, 'ii=', ii)
            next_neighbors.append(nnb)  # J: we only consider edges to second neighbours
    # print('next_neigh=', next_neighbors)
    if (n_enlaces < max_enalces) and (len(next_neighbors) > 0):
        # neighbs = nx.neighbors(g, i)
        intentos = 0
        while (exito != True) and (intentos < max_intentos):
            intentos += 1  # J: always leave an escape route in a while loop
            # i = rd.choice(g.nodes())  # the agent is chosen in update2()
            rad = pareto.rvs(b, size=1)  # generate random numbers from a Pareto density b/(x^(1+b))
            rad = rad[0]  # the previous call returns a 1-element array; extract the value here
            # candidates0 = [nb for nb in g.nodes() if (((g.node[nb]['x']-x0)**2 + (g.node[nb]['y']-y0)**2) < rad**2) and (nb != i)]
            candidates0 = [
                nb for nb in next_neighbors
                if (((g.node[nb]['x'] - x0)**2 + (g.node[nb]['y'] - y0)**2) < rad**2)
                and (nb != i)
            ]
            candidates = [
                nb for nb in candidates0 if (nb in nx.non_neighbors(g, i))
            ]  # J: before choosing a neighbour, check that no edge exists yet
            n_candidates = len(candidates)  # limited to its own species and the other one
            # print('i:', i, 'n_candidates= ', n_candidates)
            if n_candidates > 0:
                j = rd.choice(candidates)
                if j in nx.non_neighbors(g, i):  # nx.non_neighbors(g, i) gives the list of non-neighbours of i
                    g.add_edge(i, j)
                    exito = True
                    # angulo = rd.random()*2*math.pi
                    # modulo = rd.random()*delta_c
                    # g.node[i]['x'] += modulo*np.cos(angulo)
                    # g.node[i]['y'] += modulo*np.sin(angulo)
        return i  # return the node to which the newly created node has been connected
    else:
        # If the graph is complete, we add a new node
        # i = rd.choice(g.nodes())  # J: if no edge can be added, just RETURN
        # nuevo_nodo2(i)
        return i
def generate_Anom_data(anom_mode, num_noise_samples, params):
    anom_views = []
    if anom_mode == 1:
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=1000))
            falseEntries = np.ones((nViews, ))
            anom_views.append(falseEntries)
    elif anom_mode == 2:
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=1000))
            complete_prob = np.random.uniform(low=0.8, high=0.9)
            falseEntries = np.random.uniform(low=0.8, high=0.9, size=nViews)
            compl_indexes = np.random.choice(range(falseEntries.shape[0]),
                                             size=int(complete_prob * nViews))
            falseEntries[compl_indexes] = 1.0
            anom_views.append(falseEntries)
    elif anom_mode == 3:  # EXPON
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=10000))
            views = expon.rvs(loc=params[0], scale=params[1], size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 4:  # Pareto
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=1000))
            views = pareto.rvs(params[0], loc=params[1], size=nViews) - 1.0
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 5:  # LogNorm
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=1000))
            views = lognorm.rvs(s=params[0], scale=params[1], size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 6:  # Weibull_Min
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=1000))
            views = weibull_min.rvs(params[0], scale=params[1], size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 7:  # Uniform
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=10000))
            views = np.random.uniform(low=params[0], high=params[1], size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 8:  # Uniform SHORT
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=10000))
            views = np.random.uniform(low=params[0], high=params[1], size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 9:  # Gamma Short
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=10000))
            views = gamma.rvs(params[0], scale=params[1], loc=0.0, size=nViews)
            views[views > 1.0] = 1.0
            anom_views.append(views)
    elif anom_mode == 10:
        for i in range(num_noise_samples):
            nViews = int(np.random.uniform(low=100, high=10000))
            views = 1 - gamma.rvs(params[0], scale=params[1], loc=0.0, size=nViews)
            views[views > 1.0] = 1.0
            views[views < 0.0] = 0.0
            anom_views.append(views)
    anom_views = np.array(anom_views)
    pbar = ProgressBar()
    synth_fisk_params = []
    for i in pbar(range(len(anom_views))):
        synth_single = fit_fisk(anom_views[i], 0)
        synth_fisk_params.append(
            [synth_single[0], synth_single[1][0], synth_single[1][2]])
    print(len(synth_fisk_params))
    synth_fisk_params = np.array(synth_fisk_params)
    return synth_fisk_params
def account_balance():
    """Generate account balances according to a Pareto distribution.

    We should expect balances to be distributed as with other income
    distributions. The power exponent is chosen here to replicate the
    80-20 rule."""
    return pareto.rvs(1.161)
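# Hedged check of the 80-20 intuition behind the 1.161 exponent: for a Pareto
# tail index alpha, the share of the total held by the richest fraction p is
# p**(1 - 1/alpha), which is about 0.8 for p = 0.2 and alpha = 1.161. The
# simulation below should land near that value, up to heavy-tail sampling noise.
import numpy as np
from scipy.stats import pareto

balances = np.sort(pareto.rvs(1.161, size=100_000))[::-1]
top20_share = balances[:20_000].sum() / balances.sum()
print(round(top20_share, 3))   # roughly 0.8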