def sample_exposure_to_death():
    exposure_to_onset = lognorm.rvs(s=eto_shape, loc=eto_loc, scale=eto_scale)
    onset_to_death = 1000
    truncate = 40
    while onset_to_death > truncate:
        onset_to_death = lognorm.rvs(s=otd_shape, loc=otd_loc, scale=otd_scale)
    return exposure_to_onset + onset_to_death
def test_PDF_lognormal_distance():
    '''
    Test the lognormal width based distance measure.
    '''
    from scipy.stats import lognorm
    from numpy.random import seed
    seed(13493099)

    data1 = lognorm.rvs(0.4, loc=0.0, scale=1.0, size=5000)
    data2 = lognorm.rvs(0.5, loc=0.0, scale=1.0, size=5000)

    test_dist = \
        PDF_Distance(data1, data2, do_fit=True,
                     normalization_type='normalize_by_mean')
    test_dist.distance_metric()

    # Based on the samples, these are the expected stderrs.
    actual_dist = (0.5 - 0.4) / np.sqrt(0.004**2 + 0.005**2)

    # The distance value can scatter by a couple based on small variations.
    # With the seed set, this should always be true.
    assert np.abs(test_dist.lognormal_distance - actual_dist) < 2.
class TestQDM(NumpyTestCase.NumpyTestCase):

    badinput = 0.5
    nanarray = np.array([1, 2, 3, 4, np.nan])
    obsdist = lognorm.rvs(0.57, size=100)
    obsp = lognorm.fit(obsdist)
    refdist = lognorm.rvs(0.45, size=100)
    refp = lognorm.fit(refdist)
    futdist = lognorm.rvs(0.55, size=100)
    futp = lognorm.fit(futdist)
    x = np.linspace(0, 1, 101)
    qobs = np.quantile(obsdist, x)
    qref = np.quantile(refdist, x)
    qfut = np.quantile(futdist, x)

    def testQDMInput(self):
        """Test input is array-like"""
        self.assertRaises(TypeError, qdm, 0.5, 0.5, 0.5)

    def testQDMNanInput(self):
        """Test input array has no nan values"""
        self.assertRaises(ValueError, qdm,
                          self.nanarray, self.nanarray, self.nanarray)

    def testRefInput(self):
        """Test using reference data as future returns obs dist params"""
        testqfut = qdm(self.obsdist, self.refdist, self.refdist)
        testp = lognorm.fit(testqfut)
        self.assertAlmostEqual(self.obsp[0], testp[0], places=2)
        self.assertAlmostEqual(self.obsp[1], testp[1], places=2)
        self.assertAlmostEqual(self.obsp[2], testp[2], places=2)
def estimating_val_with_log(mu, theta_2):
    try:
        Value = lognorm.rvs(s=theta_2**0.5, scale=np.exp(mu))
    except ValueError:
        try:
            Value = lognorm.rvs(s=10**-9, scale=np.exp(mu))
        except ValueError:
            Value = 0.0
    return Value
def main(mean=0.5, sd=1.2):
    for x in np.linspace(1, 100000, num=16):
        max_sizes = [0.00001, 5, 10, 20, 50, 100, 250, 10**10]
        titles = ['0-4', '5-9', '10-19', '20-49', '50-99', '100-249', '250+']
        binned_sample_exp = {
            titles[i]: lognorm.cdf(max_sizes[i + 1], sd, scale=np.exp(mean))
            - lognorm.cdf(max_sizes[i], sd, scale=np.exp(mean))
            for i in range(len(max_sizes) - 1)
        }
        binned_sample_gen = analysis.sort_sample(
            lognorm.rvs(sd, scale=np.exp(mean), size=int(x)))
        binned_sample_gen = {s: v / int(x) for s, v in binned_sample_gen.items()}
        print(binned_sample_gen, binned_sample_exp)

    with Pool() as p:
        data = p.starmap(simulation_one_parameter_set.parameter_expectation,
                         [(int(x), mean, sd) for x in np.linspace(0, 100000, num=16)])

    mean_with = []
    sd_with = []
    mean_without = []
    sd_without = []
    for d in data:
        mean_with.append(d[0] - mean)
        sd_with.append(d[1] - sd)
        mean_without.append(d[2])
        sd_without.append(d[3])

    plt.plot(np.linspace(0, 100000, num=16), mean_with)
    plt.plot(np.linspace(0, 100000, num=16), sd_with)
    plt.show()
def Generate_d_rs_out_sample(mus, cov_matrix, cv, out_sample_size) -> List[List[float]]:
    n = len(mus)
    mus, cov_matrix = np.asarray(mus), np.asarray(cov_matrix)
    stds = np.sqrt([cov_matrix[i, i] for i in range(n)])
    component = 4
    component_size = int(out_sample_size / component)

    # 1: two_points
    d_rs_1 = np.asarray([
        mus - stds + np.random.randint(0, 2) * stds * 2
        for _ in range(component_size)
    ])
    # 2: independent normal
    d_rs_2 = [np.random.normal(mus, stds) for _ in range(component_size)]
    # 3: uniform
    d_rs_3 = np.asarray([
        np.random.uniform(low=mus - np.sqrt(3) * stds, high=mus + np.sqrt(3) * stds)
        for _ in range(component_size)
    ])
    # 4: log normal
    d_rs_4 = np.asarray([
        lognorm.rvs(s=0.31 * (cv / 0.33), scale=mu, size=component_size)
        for mu in mus
    ]).T

    d_rs = np.concatenate([d_rs_1, d_rs_2, d_rs_3, d_rs_4])
    # clip
    d_rs[d_rs <= 0] = 0
    d_rs = d_rs.tolist()
    return d_rs
def lognorm_rvs(ln_params, size):
    # Parameters
    # [loop_prob, ln_trunc_fit, ln_trueloop_fit, nll]
    num_falseLoop = int(round(ln_params[0] * size))
    num_trueLoop = int((1 - ln_params[0]) * size)
    falseEntries = np.array([])
    trueEntries = np.array([])
    if ~np.isnan(ln_params[1][1]) and (ln_params[1][1] != 0.0):
        falseEntries = np.zeros((0, ))
        falseEntries = trunclognormprior_rvs(ln_params[1][0], ln_params[1][1],
                                             ln_params[1][2], size=num_falseLoop)
    if ~np.isnan(ln_params[2][0]) and (ln_params[2][0] != 0.0):
        trueEntries = lognorm.rvs(ln_params[2][0], ln_params[2][1],
                                  ln_params[2][2], size=num_trueLoop)
    if (~np.isnan(ln_params[1][1])) and (~np.isnan(ln_params[2][0])):
        entries = np.concatenate((falseEntries, -trueEntries), axis=0)
    elif ~np.isnan(ln_params[1][1]):
        entries = falseEntries
    elif ~np.isnan(ln_params[2][0]):
        entries = -trueEntries
    else:
        entries = np.array([1.0] * size)
    print(entries.shape[0])
    return entries
def Probability_establishments_within_cluster(naics, establishment, df):
    values = {'Sector': 2, 'Subsector': 3, 'Industry Group': 4, 'NAICS Industry': 5}
    df_interest = df.loc[df['NAICS code'] == naics]
    if df_interest.empty:
        PAU_class = PAU_DB(2008)
        df['NAICS structure'] = df['NAICS code'].apply(
            lambda x: PAU_class._searching_naics(x, naics))
        df['NAICS structure'] = df['NAICS structure'].map(values)
        Max = df['NAICS structure'].max()
        df_interest = df[df['NAICS structure'] == Max]
    mean = df_interest['Mean value of shipments ($1,000)'].iloc[0]
    sd = df_interest['SD value of shipments ($1,000)'].iloc[0]
    # measure-of-size (MOS) (e.g., value of shipments, number of employees, etc.),
    # which was highly correlated with pollution abatement operating costs
    # Method of moments
    mu = np.log(mean**2 / (sd**2 + mean**2)**0.5)
    theta_2 = np.log(sd**2 / mean**2 + 1)
    MOS = lognorm.rvs(s=theta_2**0.5, scale=np.exp(mu), size=int(establishment))
    Best = max(MOS)
    Worst = min(MOS) - 10**(np.log10(min(MOS)) - 2)  # For avoiding 0 probability
    MOS.sort()
    # High values of MOS represent a possible high value of PAA. Establishments with
    # high values of PAOC and PACE had a probability of 1 of being selected.
    MOS_std = {str(idx + 1): [(val - Worst) / (Best - Worst), val * 10**3]
               for idx, val in enumerate(MOS)}
    return MOS_std
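# A minimal, self-contained sanity check (not part of the module above) for the
# method-of-moments mapping used in Probability_establishments_within_cluster:
# for a lognormal, mean = exp(mu + sigma^2/2) and sd = mean * sqrt(exp(sigma^2) - 1),
# so sampling with s=sqrt(theta_2) and scale=exp(mu) should roughly reproduce the
# target mean/sd. The numbers below are illustrative, not taken from the Census data.
import numpy as np
from scipy.stats import lognorm

def _check_lognorm_moments(mean=250.0, sd=400.0, n=200_000):
    mu = np.log(mean**2 / (sd**2 + mean**2)**0.5)
    theta_2 = np.log(sd**2 / mean**2 + 1)
    sample = lognorm.rvs(s=theta_2**0.5, scale=np.exp(mu), size=n, random_state=0)
    return sample.mean(), sample.std()   # should be close to (mean, sd)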
def create_random_variables(no_calls):
    # generate random times in an hour for each call to start
    random_call_start_times = np.random.random_sample(size=no_calls) * 3600

    # get random length of calls - keep either the lognormal line or the
    # decaying-exponential lines below, depending on which distribution you want

    # lognormal distribution
    random_call_length2 = lognorm.rvs(s=skewness, loc=average_no_calls, size=no_calls)

    # decaying exponential distribution
    # random_call_length2 = np.random.random_sample(size=no_calls)
    # random_call_length2 = [math.log(1 - random_call_length2[c]) / -(1 / 900)
    #                        for c in range(0, len(random_call_length2))]

    ###############
    # If using the decaying exponential distribution, comment out this
    # normalisation block (it scales the data to 0-3600 seconds)
    np.seterr(all='raise')
    try:
        random_call_length2 = random_call_length2 - min(random_call_length2)
        random_call_length2 = random_call_length2 / max(random_call_length2)
        random_call_length2 = random_call_length2 * maxValue
    except FloatingPointError:
        print("Invalid value caught and programme continues")
    ###############

    # assign call length to call start time
    call_dict = {}
    call_dict = {
        int(random_call_start_times[p]): int(random_call_length2[p])
        for p in range(0, no_calls)
    }
    return call_dict
def lognormal(dim, loc, mean, sigma, seed=5007):
    """ """
    # ==========================================================
    # Set the random number seed
    # ==========================================================
    np.random.seed(seed)
    # ==========================================================
    # Determine the number of cells that need to be filled
    # ==========================================================
    numCells = len(loc[0])
    # ==========================================================
    # Determine the <scale> parameter: <mean = scale exp(s^2/2)>
    # ==========================================================
    scale = mean * np.exp(-0.5 * sigma**2)
    # ==========================================================
    # Sample values
    # ==========================================================
    values = lognorm.rvs(sigma, scale=scale, size=numCells)
    # ==========================================================
    # Make cube and fill
    # ==========================================================
    cube = makeCube(dim)
    cube[loc] = values
    # ==========================================================
    # Return the cube
    # ==========================================================
    return cube
def __init__(self, syn_strength, n_synapses=100, chan_density=1.0):
    # average per-synapse strength
    self.syn_strength = syn_strength

    # number of synapses to model
    self.n_synapses = n_synapses

    # Light power is unevenly distributed across synapses
    self.stim_power_scale = np.random.uniform(0.5, 1.0, size=n_synapses)

    # fraction of open channels per synapse needed for 50% release probability
    self.release_threshold = lognorm.rvs(0.3, size=n_synapses) * 0.5 / chan_density

    # per-synapse strength scaling
    self.synapse_strength_scale = lognorm.rvs(0.5, size=n_synapses)
def biased_regresser(size, ty, beta=1.0):
    value = []

    if dataset_name == "SS":
        ## Best fit for citation dataset
        value = lognorm.rvs(1.604389429520587, 48.91174576443938,
                            77.36426476362374, size=size)
    elif dataset_name == "JEE":
        # Best fit for JEE scores
        if ty == 0:
            value = johnsonsu.rvs(-1.3358254338685507, 1.228621987785165,
                                  -16.10471198333935, 25.658144591068066,
                                  size=size)  ## Men
        if ty == 1:
            value = johnsonsu.rvs(-1.1504808824385124, 1.3649066883190795,
                                  -12.879957294149737, 27.482272133428403,
                                  size=size)  ## Women
    else:
        print("Unknown dataset_name=%s" % dataset_name)
        exit()

    if ty == 1:
        value *= (beta + 1e-4)

    return [{'val': val, 'real_type': ty} for val in value]
def generate(max_time, n_sequences, filename='stationary_renewal'):
    times, nll = [], []
    for _ in range(n_sequences):
        s = np.sqrt(np.log(6 * 6 + 1))
        mu = -s * s / 2
        tau = lognorm.rvs(s=s, scale=np.exp(mu), size=1000)
        lpdf = lognorm.logpdf(tau, s=s, scale=np.exp(mu))
        T = tau.cumsum()
        T = T[T < max_time]
        lpdf = lpdf[:len(T)]
        score = -np.sum(lpdf)
        times.append(T)
        nll.append(score)

    if filename is not None:
        mean_number_items = sum(len(t) for t in times) / len(times)
        nll = [n / mean_number_items for n in nll]
        np.savez(f'{dataset_dir}/{filename}.npz', arrival_times=times, nll=nll,
                 t_max=max_time, mean_number_items=mean_number_items)
    else:
        return times
def draw_new_params(self, param_names, heterogeneity):
    for param in param_names:
        mean = self.parameters[param]
        std = mean * (heterogeneity / 100.)
        sigma, scale = lognorm_params(mean, std)
        sample = log_pdf.rvs(sigma, 0, scale, size=1)
        self.parameters[param] = sample
def plot_bias():
    for sampl in np.arange(10, 45, 5):
        errs = []
        ests = []
        real_val = lognorm.ppf(0.5, 1, 0)
        for _ in range(100000):
            x = lognorm.rvs(1, 0, size=sampl)
            #est_val = estimate_median(x)
            est_val = np.median(x)
            err = (real_val - est_val) / real_val
            errs.append(err)
            ests.append(est_val)
        print(np.mean(errs))
        plt.hist(ests, bins=np.arange(0, 4, .1))
        plt.axvline(real_val, label="actual median", color="black")
        plt.axvline(np.mean(ests),
                    label="avg estimated value of median on sample size: " + str(sampl),
                    color="purple")
        plt.axvline(np.median(ests),
                    label="median estimated value of median on sample size: " + str(sampl),
                    color="orange")
        plt.legend()
        plt.title("Sample size = " + str(sampl))
        plt.savefig('plots/sample_' + str(sampl) + '.png')
        plt.close()
        print('processed sample size ' + str(sampl))
def FUNC_norm_gen(p1, p2, num):
    """
    FUNC_norm_gen
    Generate S1 events using the effective marginalized pdf in S1.
    inputs: shape, scale, number of events
    """
    return lognorm.rvs(p1, loc=0, scale=p2, size=num)
def _make_dark_frame(self, temperature, alpha=0.0488 / u.Kelvin, beta=-12.772,
                     shape=0.4, seed=None):
    """
    Function to create a dark current 'image' in electrons per second per pixel
    given an image sensor temperature and a set of coefficients for a simple dark
    current model. Modal dark current for the image sensor as a whole is modelled
    as D.C. = 10**(alpha * T + beta), where T is the temperature in Kelvin.
    Individual pixel dark currents are random uncorrelated values from a log normal
    distribution so that there is a semi-realistic tail of 'hot pixels'. For
    reproducible dark frames the random number generator seed can optionally be
    specified.
    """
    temperature = temperature.to(u.Kelvin,
                                 equivalencies=u.equivalencies.temperature())
    mode = 10**(alpha * temperature + beta) * u.electron / (u.second)
    scale = mode * np.exp(shape**2)
    if seed:
        np.random.seed(seed)
    dark_frame = lognorm.rvs(shape, scale=scale,
                             size=(self.wcs._naxis2, self.wcs._naxis1))
    return mode, dark_frame
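# Quick standalone check (assumed values, not the camera model above) that
# scale = mode * exp(shape**2) makes `mode` the modal value of the scipy lognormal:
# the density of lognorm(s, scale) peaks at scale * exp(-s**2).
import numpy as np
from scipy.stats import lognorm

shape = 0.4
target_mode = 0.05                       # hypothetical modal dark current, e-/s
scale = target_mode * np.exp(shape**2)
x = np.linspace(1e-4, 5 * target_mode, 10000)
print(x[np.argmax(lognorm.pdf(x, shape, scale=scale))])   # ~0.05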
def PoS_portfolio(self, PoS, size, scale=1):
    PoS_list = [PoS for i in range(size)]
    self.success_rates = [np.random.random() <= PoS for PoS in PoS_list]
    # NOTE: sigma and scale_factor are expected to be defined in the enclosing scope
    self.portfolio = [lognorm.rvs(sigma, scale=scale_factor)
                      for prod in self.success_rates if prod == True]
    return (sum(self.portfolio), sum(self.portfolio) / size)
def generate_random_data_from_dist(param, shape, nrows, ncols):

    if shape == 'normal':
        data = norm.rvs(0, param, size=(nrows, ncols))
        # link the two sliders and make the param for t dfs (yolked to sample size in other slider)
        # elif shape=='t':
        #     data = t.rvs(df=ncols-1)

    elif shape == 'lognormal':
        data = lognorm.rvs(param, size=(nrows, ncols))

    elif shape == 'contaminated chi-squared':
        # data = chi2.rvs(4, 0, param, size=size)
        data = chi2.rvs(4, size=(nrows, ncols))
        contam_inds = np.random.randint(ncols, size=int(param * ncols))
        data[:, contam_inds] *= 10

    elif shape == 'contaminated normal':
        sub_size = round(param * ncols)
        norm_size = int(ncols - sub_size)
        standard_norm_values = norm.rvs(0, 1, size=(nrows, norm_size))
        contam_values = norm.rvs(0, 10, size=(nrows, sub_size))
        #print(standard_norm_values.shape)
        #print(contam_values.shape)
        data = np.concatenate([standard_norm_values, contam_values], axis=1)
        #print(data.shape)

    elif shape == 'exponential':
        data = expon.rvs(0, param, size=(nrows, ncols))

    return data
def get_gini(size):
    y = lognorm.rvs(s=1, size=size)

    # comparison
    gini_ineqpy = ineqpy.gini(income=y)
    gini_pysal = Gini(y).g
    gini_diff = abs(gini_ineqpy - gini_pysal)

    return size, gini_ineqpy, gini_pysal, gini_diff
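# Hedged cross-check (not part of the original benchmark): for a lognormal with
# shape parameter s, the Gini coefficient has the closed form erf(s / 2), so both
# estimators above can also be compared against the analytic value.
import numpy as np
from scipy.special import erf
from scipy.stats import lognorm

s = 1.0
analytic_gini = erf(s / 2)                                   # ~0.52 for s = 1
y = np.sort(lognorm.rvs(s=s, size=100_000, random_state=0))
k = np.arange(1, y.size + 1)
empirical_gini = (2 * np.sum(k * y)) / (y.size * y.sum()) - (y.size + 1) / y.size
print(analytic_gini, empirical_gini)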
def sample(self, E):
    """
    Sample a reco/true energy given a true/reco energy.
    """
    mu, sigma = self._get_lognormal_params(E)
    return lognorm.rvs(sigma, loc=0, scale=mu)
def _boots(self, df, newx, shape, scale, dist=lognorm):
    xr = lognorm.rvs(size=len(df['Prediction']), s=shape, loc=0, scale=scale)
    this_shape, this_loc, this_scale = lognorm.fit(xr, floc=0)
    this_fit = dist.cdf(newx, s=this_shape, loc=0, scale=this_scale)
    return list(this_fit)
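# Standalone sketch of the parametric-bootstrap idea behind _boots above (the
# function name, grid and 95% band are illustrative assumptions, not part of the
# original class): refit the lognormal to repeated synthetic draws and collect the
# CDF on a fixed grid to obtain pointwise confidence bands.
import numpy as np
from scipy.stats import lognorm

def bootstrap_cdf_band(n_obs, shape, scale, newx, n_boot=200, seed=0):
    rng = np.random.default_rng(seed)
    fits = []
    for _ in range(n_boot):
        xr = lognorm.rvs(s=shape, loc=0, scale=scale, size=n_obs, random_state=rng)
        b_shape, _, b_scale = lognorm.fit(xr, floc=0)
        fits.append(lognorm.cdf(newx, s=b_shape, loc=0, scale=b_scale))
    return np.percentile(np.vstack(fits), [2.5, 97.5], axis=0)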
def random_doc2vec(min_count_scale=10, **kwargs):
    return ('doc2vec', {
        # Dimensionality of the feature vectors.
        'vector_size': int(np.floor(beta.rvs(loc=2, a=2, b=3, scale=100, size=1)).item()),
        # The maximum distance between the current and predicted word within a sentence.
        'window': max(1, int(norm.rvs(loc=5, scale=1, size=1).item())),
        # Ignores all words with total frequency lower than this.
        'min_count': int(np.floor(beta.rvs(loc=2, a=2, b=2.5, scale=min_count_scale, size=1)).item()),
        # Limits the vocabulary; if there are more unique words than this, prune the infrequent ones.
        'max_vocab_size': int(np.floor(np.exp(lognorm.rvs(loc=9, s=0.1, scale=4, size=1)))),
        # The threshold for configuring which higher-frequency words are randomly
        # downsampled; useful range is (0, 1e-5).
        'sample': uniform.rvs(loc=0, scale=1e-5, size=1).item(),
        # Whether to use the sum of the context word vectors instead of the mean.
        'dm_mean': bernoulli.rvs(0.4, size=1).item(),
        # Whether to use concatenation of context vectors rather than sum/average.
        'dm_concat': bernoulli.rvs(0.05, size=1).item(),
    })
def gen_lognorm_params(self, pname, std, n=20):
    """
    generate lognormally distributed parameters for given SD and parameter name
    """
    mean = self.parameters[pname]
    sigma, scale = lognorm_params(mean, std)
    sample = log_pdf.rvs(sigma, 0, scale, size=n)
    return sample
def sample_topic_lambda_sigma(self):
    while True:
        _lambda = lognorm.rvs(s=self.lambda_0_sigma, loc=0,
                              scale=self.lambda_0_scale, size=1)[0]
        if _lambda > 0.05 and _lambda < 0.10:
            break
    while True:
        _sigma = lognorm.rvs(s=self.sigma_siama, loc=0,
                             scale=self.sigma_scale, size=1)[0]
        if _sigma > 0.7 and _sigma < 1.1:
            break
    return _lambda, _sigma
def get_value(self):
    """
    Get a random value following the distribution

    Returns
    -----------
    value
        Value obtained following the distribution
    """
    from scipy.stats import lognorm
    return lognorm.rvs(self.s, self.loc, self.scale)
def sample(n_samples, std=6):
    """Draw samples from the distribution.

    Args:
        n_samples: Number of samples to generate.
        std: Standard deviation of f*(t).
    """
    s = np.sqrt(np.log(std**2 + 1))
    mu = -0.5 * s * s
    inter_times = lognorm.rvs(s=s, scale=np.exp(mu), size=n_samples)
    arrival_times = inter_times.cumsum()
    return arrival_times
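# Small numerical check (illustrative only) of the parameterization above: with
# s = sqrt(log(std**2 + 1)) and mu = -s**2/2, the lognormal inter-event times have
# mean exp(mu + s**2/2) = 1 and variance (exp(s**2) - 1) * exp(2*mu + s**2) = std**2.
import numpy as np
from scipy.stats import lognorm

std = 6
s = np.sqrt(np.log(std**2 + 1))
mu = -0.5 * s * s
inter_times = lognorm.rvs(s=s, scale=np.exp(mu), size=1_000_000, random_state=1)
print(inter_times.mean(), inter_times.std())   # approximately 1.0 and 6.0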
def sample(self, samples, random_seed=None):
    """
    :param samples: int
    :param random_seed: int
    :return: np.array(samples, self.dimension)
    """
    if random_seed is not None:
        np.random.seed(random_seed)
    points = lognorm.rvs(s=self.scale, scale=self.param, size=samples)
    points = points.reshape([samples, self.dimension])
    return points**2
def knowledge_gain_coef(self, topics, model):
    if model == 'ST':
        # lambda_list = self._topic_lambda_dis['ST'][0]
        # lambda_probs = self._topic_lambda_dis['ST'][1]
        # return np.random.choice(lambda_list,size=len(topics),p=lambda_probs,replace=True)
        _scale, _sigma = self._topic_lambda_dis['ST']
        return lognorm.rvs(s=_sigma, loc=0, scale=_scale, size=len(topics))
    elif model == 'MT':
        ## group positions by topic
        t_num = defaultdict(list)
        for i, t in enumerate(topics):
            t_num[t].append(i)
        ## for each topic
        lambdas = []
        indexes = []
        for t in t_num.keys():
            ins = t_num[t]
            # lambda_list = self._topic_lambda_dis[t][0]
            # lambda_probs = self._topic_lambda_dis[t][1]
            # t_ls = np.random.choice(lambda_list,size=len(ins),p=lambda_probs,replace=True)
            _scale, _sigma = self._topic_lambda_dis[t]
            t_ls = lognorm.rvs(s=_sigma, loc=0, scale=_scale, size=len(ins))
            lambdas.extend(t_ls)
            indexes.extend(ins)
        ## sort the lambdas back into the original index order
        return [
            lambdas[i]
            for i in sorted(range(len(indexes)), key=lambda x: indexes[x])
        ]
def sampledist(DistributionName, Mean, Std):
    # This function is used to generate a random variable (loss and downtime)
    # given the distribution of the variable
    if DistributionName == 'Normal':
        temp = norm.rvs(loc=Mean, scale=Std, size=1, random_state=None)
        return temp[0]
        # else: return np.exp(norm.rvs(loc=np.log(Mean), scale=Std, size=1, random_state=None))
    elif DistributionName == 'LogNormal':
        p = np.poly1d([1, -1, 0, 0, -(Std / Mean)**2])
        r = p.roots
        sol = r[(r.imag == 0) & (r.real > 0)].real
        shape = np.sqrt(np.log(sol))
        scale = Mean * sol
        return lognorm.rvs(shape, 0, scale, size=1)[0]
def test_PDF_fitting():
    '''
    Test distribution fitting for PDFs

    By default, we use the lognormal distribution, and only test it here.
    '''
    from scipy.stats import lognorm
    from numpy.random import seed
    seed(13493099)

    data1 = lognorm.rvs(0.4, loc=0.0, scale=1.0, size=50000)

    test = PDF(data1).run()

    npt.assert_almost_equal(0.40, test.model_params[0], decimal=2)
    npt.assert_almost_equal(1.0, test.model_params[1], decimal=1)
def make_csd(shape, scale, npart, show_plot=False):
    """Create cell size distribution and save it to file."""
    if shape == 0:
        rads = [scale + 0 * x for x in range(npart)]
    else:
        rads = lognorm.rvs(shape, scale=scale, size=npart)
    with open('diameters.txt', 'w') as fout:
        for rad in rads:
            fout.write('{0}\n'.format(rad))
    if shape == 0:
        xpos = linspace(scale / 2, scale * 2, 100)
    else:
        xpos = linspace(lognorm.ppf(0.01, shape, scale=scale),
                        lognorm.ppf(0.99, shape, scale=scale), 100)
    plt.plot(xpos, lognorm.pdf(xpos, shape, scale=scale))
    plt.hist(rads, density=True)
    plt.savefig('packing_histogram.png')
    plt.savefig('packing_histogram.pdf')
    if show_plot:
        plt.show()
def hpml(xs, ys, l0=1, noise=0.001, K=K_SE):
    xs = asarray(xs)
    ys = ascolumn(ys)

    def nll(l):  # negative log likelihood
        #if l < 0.001: return 1e10
        Kxx = K(xs, l=l)
        Kxx += (noise**2) * eye_like(Kxx)
        res = (ys.T).dot(pinvh(Kxx)).dot(ys) + slogdet(Kxx)[1]
        res = squeeze(res)
        #print(l, res)
        return res

    def nll_prime(l):
        Kxx, Kps = K(xs, l=l, deriv=True)
        Kxx += (noise**2) * eye_like(Kxx)
        KxxI = pinvh(Kxx)
        a = KxxI.dot(ys)
        aaT = outer(a, a)      # a . a.T
        KI_aaT = KxxI - aaT    # K^-1 - aaT
        res = []
        for Kp in Kps:
            grad = trace_prod(KI_aaT, Kp)
            res.append(grad)
        return asarray(res)

    #l = fmin_cg(nll, l0, maxiter=10, disp=False, epsilon=.001)
    #l = fmin_cg(nll, l0, disp=False, epsilon=.001)
    l = fmin_cg(nll, l0, fprime=nll_prime, disp=False)  #, maxiter=10, disp=False)
    best_nll = nll(l)
    nlls = set([int(best_nll / noise)])
    for i in range(20):
        cur_l0 = lognorm.rvs(1, size=size(l0))
        cur_l = fmin_cg(nll, cur_l0, fprime=nll_prime, disp=False)
        cur_nll = nll(cur_l)
        nlls.add(int(cur_nll / noise))
        if cur_nll < best_nll:
            #print('LL up by', best_nll - cur_nll)
            best_nll = cur_nll
            l = cur_l
    #print(len(nlls), 'suff. uniq. LL optima:', sorted([x*noise for x in nlls]))
    return absolute(l), len(nlls)
def mcprices(S0, K, T, r, sigma, N=5000):
    """
    Call and put option prices using log-normal Monte-Carlo method

    Parameters
    ----------
    S0 : Current price of the underlying stock
    K : Strike price of the option
    T : Time to maturity of the option
    r : Risk-free rate of return (continuously-compounded)
    sigma : Stock price volatility
    N : Number of stock prices to simulate

    Returns
    -------
    c : Call option price
    p : Put option price

    Notes
    -----
    r, T, and sigma must be expressed in consistent units of time
    """
    scale = S0 * exp((r - sigma**2 / 2) * T)
    shape = sigma * sqrt(T)
    ST_sim = lognorm.rvs(shape, scale=scale, size=N)
    call_pay_off = maximum(ST_sim - K, 0)
    put_pay_off = maximum(K - ST_sim, 0)
    discount = exp(-r * T)
    return (call_pay_off.mean() * discount, put_pay_off.mean() * discount)
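# Optional cross-check (a sketch, not part of mcprices): the Monte-Carlo estimates
# should converge to the Black-Scholes closed-form prices as N grows.
from numpy import exp, log, sqrt
from scipy.stats import norm

def bsprices(S0, K, T, r, sigma):
    """Black-Scholes call and put prices for a non-dividend-paying stock."""
    d1 = (log(S0 / K) + (r + sigma**2 / 2) * T) / (sigma * sqrt(T))
    d2 = d1 - sigma * sqrt(T)
    c = S0 * norm.cdf(d1) - K * exp(-r * T) * norm.cdf(d2)
    p = c - S0 + K * exp(-r * T)        # put-call parity
    return c, p

# e.g. compare mcprices(100, 100, 1.0, 0.05, 0.2, N=200000)
#      against bsprices(100, 100, 1.0, 0.05, 0.2)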
def draw(self):
    return lognorm.rvs(self.shape, self.location, self.scale) * self.multiplier
log_a = (log_a - np.mean(log_a)) / np.std(log_a)
log_b = (log_b - np.mean(log_b)) / np.std(log_b)
log_c = (log_c - np.mean(log_c)) / np.std(log_c)

print(kstest(log_a, 'norm'))
print(kstest(log_b, 'norm'))
print(kstest(log_c, 'norm'))

plb.hist(b)
plb.hist(log_b, bins=20)
plb.hist(a, bins=100)
plb.hist(log_a, bins=10)

shape, loc, scale = lognorm.fit(a)
rnd_a = lognorm.rvs(shape, scale=scale, loc=loc, size=len(a))
plb.hist(rnd_a, bins=20, alpha=0.5)
plb.hist(a, bins=20, color='r', alpha=0.5)

shape, loc, scale = lognorm.fit(c)
rnd_c = lognorm.rvs(shape, scale=scale, loc=loc, size=len(c))
plb.hist(rnd_c, bins=30, alpha=0.5)
plb.hist(c, bins=30, color='r', alpha=0.5)

shape, loc, scale = lognorm.fit(b)
rnd_b = lognorm.rvs(shape, scale=scale, loc=loc, size=len(b))
plb.hist(rnd_b, bins=20, alpha=0.5)
plb.hist(b, bins=20, color='r', alpha=0.5)

np.mean(b)
shape = np.std(b)
import numpy as np
from scipy.stats import uniform, lognorm
import pystan

# Data
np.random.seed(1056)                 # set seed to replicate example
nobs = 5000                          # number of obs in model
x1 = uniform.rvs(size=nobs)          # random uniform variable

beta0 = 2.0                          # intercept
beta1 = 3.0                          # linear predictor
sigma = 1.0                          # dispersion
xb = beta0 + beta1 * x1              # linear predictor, xb
exb = np.exp(xb)

y = lognorm.rvs(sigma, scale=exb, size=nobs)   # create y as adjusted
                                               # random normal variate

# Fit
mydata = {}
mydata['N'] = nobs
mydata['x1'] = x1
mydata['y'] = y

stan_lognormal = """
data{
    int<lower=0> N;
    vector[N] x1;
    vector[N] y;
}
parameters{
import powerlaw
# 5.22.1 Continuous distributions p681 scipy manual
from scipy.stats import lognorm
from numpy import rint

sdln = lognorm.rvs(1.3, loc=0, scale=10, size=10)
print("lognormal float data", sdln[1:5])
lnresults = powerlaw.distribution_fit(sdln, distribution='lognormal', discrete=False)
print(lnresults)

sdlnint = rint(sdln).astype(int)
print("lognormal int data", sdlnint[1:5])
lnintresults = powerlaw.distribution_fit(sdlnint, distribution='lognormal')
#lnintresults = powerlaw.distribution_fit(sdlnint, distribution='lognormal', discrete=True)
print(lnintresults)
def simple_packing(shape, scale, number_of_cells):
    "Simple and fast algorithm for packing"
    Rad = lognorm.rvs(shape, scale=scale, size=number_of_cells)
    print(Rad)
    Rad /= 2
    Rads1 = list(range(number_of_cells))
    t = 0
    for i in range(number_of_cells):
        c = abs(Rad[t])
        Rads1[t] = float(c)
        t = t + 1
    Rads1 = sorted(Rads1)
    v = 0.00
    for i in range(number_of_cells):
        v = v + ((2.00 * Rads1[i])**3.00)
    centers = [[0 for i in range(3)] for j in range(number_of_cells)]
    v = v * 1.40
    lc = v**(1.00 / 3.00)
    K = 0
    while K == 0:
        j = -1
        h = 0
        timeout = time.time() + 10
        while number_of_cells >= j and h == 0:
            if time.time() > timeout:
                h = 1
                break
            j = j + 1
            if j == number_of_cells:
                K = 1
                break
            PickCenterX, PickCenterY, PickCenterZ =\
                lc * random.random(),\
                lc * random.random(),\
                lc * random.random()
            while (lc - Rads1[j] >= PickCenterX and lc - Rads1[j] >= PickCenterY
                   and lc - Rads1[j] >= PickCenterZ and Rads1[j] < PickCenterX
                   and Rads1[j] < PickCenterY and Rads1[j] < PickCenterZ):
                PickCenterX, PickCenterY, PickCenterZ =\
                    lc * random.random(),\
                    lc * random.random(),\
                    lc * random.random()
            centers[j][0], centers[j][1], centers[j][2] =\
                PickCenterX, PickCenterY, PickCenterZ
            KeepCentreX, KeepCentreY, KeepCentreZ, KeepR =\
                PickCenterX, PickCenterY, PickCenterZ, Rads1[j]
            if j > 0:
                for t in range(0, j):
                    if ((((((KeepCentreX - centers[t][0])**2.00) +
                           ((KeepCentreY - centers[t][1])**2.00) +
                           ((KeepCentreZ - centers[t][2])**2.00))**0.50) -
                         (KeepR + Rads1[t])) < 0.000) and t != j:
                        centers[j][0], centers[j][0], centers[j][0] = 0, 0, 0
                        j = j - 1
                        break
    data = list(zip(*centers))
    data.append(Rads1)
    data = np.array(list(zip(*data)))
    data[:, 3] = 2 * data[:, 3]
    return data
counter = 0
for sig in sig_grid:
    # Compute optimal redistribution scheme of the government
    policy_grid.append(fsolve(
        lambda policies: foc(policies, psi, sig, start=0, end=10), x0=x0
    ))
    opt_tax.append(policy_grid[counter][0])
    opt_trans.append(policy_grid[counter][1])

    ## Simulate distribution of wages and compute the distribution of
    ## consumption and hours worked given the optimal redistribution scheme
    ## calculated above
    wage_grid.append(
        lognorm.rvs(s=sig, scale=np.exp(- sig**2 / 2), size=n_obs)
    )
    print("Check the mean ", np.mean(wage_grid[counter]), " approx. 1???")
    hours_grid.append(hours(wage_grid[counter], opt_tax[counter], psi))
    cons_grid.append(cons(wage_grid[counter], opt_tax[counter], psi,
                          opt_trans[counter]))
    counter += 1

## Plot distributions of optimal wages, consumption and hours worked
fig = plt.figure()
plt.subplot(3, 3, 1)
plt.hist(wage_grid[0], bins=100)
# coding:utf-8
import numpy as np
from scipy.stats import lognorm
import matplotlib.pyplot as plt

r = lognorm.rvs(1, loc=10, scale=1, size=10000)

plt.subplot(211)
plt.hist(r, bins=100)

plt.subplot(212)
plt.xscale("log")
plt.hist(np.log(r), bins=100)

plt.show()
def main():
    usage = 'usage: %prog [options] <gtf> <fasta>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='bam_length',
                      help='Obtain read length via sampling a distribution from a BAM file [Default: %default]')
    parser.add_option('-e', dest='error_rate', type='float', default=0,
                      help='Error rate (uniform on reads) [Default: %default]')
    parser.add_option('-f', dest='fpkm_file',
                      help='Cufflinks .fpkm_tracking file to use for FPKMs [Default: %default]')
    parser.add_option('-l', dest='read_length', type='int', default=30,
                      help='Read length [Default: %default]')
    parser.add_option('-n', dest='num_reads', type='int', default=100000,
                      help='Number of reads [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='reads',
                      help='Output files prefix [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide GTF file and fasta file')
    else:
        gtf_file = args[0]
        fasta_file = args[1]

    if options.bam_length:
        read_length_distribution = bam_length_distribution(options.bam_length)
    else:
        read_length_distribution = {options.read_length: 1}

    # read GTF gene_id to transcript_id's mapping
    g2t = gff.g2t(gtf_file)

    # get transcript lengths
    transcript_lengths = {}
    for line in open(gtf_file):
        a = line.split('\t')
        if a[2] == 'exon':
            transcript_id = gff.gtf_kv(a[8])['transcript_id']
            transcript_lengths[transcript_id] = transcript_lengths.get(transcript_id, 0) + int(a[4]) - int(a[3]) + 1

    if options.fpkm_file:
        transcript_copies = {}
        fpkm_in = open(options.fpkm_file)
        line = fpkm_in.readline()
        for line in fpkm_in:
            a = line.split('\t')
            transcript_copies[a[0]] = float(a[9])
        fpkm_in.close()
        if sum(transcript_copies.values()) == 0:
            print('FPKM file shows no expression. Exiting.', file=sys.stderr)
            exit(1)
    else:
        # sample gene copies
        gene_copies_raw = lognorm.rvs(1, size=len(g2t))
        gene_copies_raw_sum = sum(gene_copies_raw)
        gene_copies = dict(zip(g2t.keys(), [gcr / gene_copies_raw_sum for gcr in gene_copies_raw]))

        # sample transcript copies
        transcript_copies = {}
        for gene_id in g2t:
            relative_copies = dict(zip(g2t[gene_id], lognorm.rvs(1, size=len(g2t[gene_id]))))
            relative_sum = sum(relative_copies.values())
            for transcript_id in g2t[gene_id]:
                transcript_copies[transcript_id] = gene_copies[gene_id] * relative_copies[transcript_id] / relative_sum

    # determine transcript probabilities as a function of copy and length
    transcript_weights = {}
    for transcript_id in transcript_copies:
        if transcript_lengths[transcript_id] >= min(read_length_distribution.keys()):
            weight = 0
            for read_length in read_length_distribution:
                weight += read_length_distribution[read_length] * transcript_copies[transcript_id] * (transcript_lengths[transcript_id] - read_length + 1)
            if weight > 0:
                transcript_weights[transcript_id] = weight
    weights_sum = sum(transcript_weights.values())
    transcript_probs = dict([(tid, transcript_weights[tid] / weights_sum) for tid in transcript_weights])

    # open fasta file
    fasta = pysam.Fastafile(fasta_file)

    # open output files
    fastq_out = open('%s.fastq' % options.output_prefix, 'w')
    gff_out = open('%s_txome.gff' % options.output_prefix, 'w')

    # for each transcript
    read_index = 1
    for transcript_id in transcript_probs:
        expected_reads = transcript_probs[transcript_id] * options.num_reads
        if expected_reads == 0:
            sampled_reads = 0
        else:
            sampled_reads = poisson.rvs(expected_reads)

        for s in range(sampled_reads):
            read_length = sample_read_length(read_length_distribution)
            if transcript_lengths[transcript_id] > read_length:
                pos = random.randint(0, transcript_lengths[transcript_id] - read_length)
                seq = fasta.fetch(transcript_id, pos, pos + read_length).upper()
                if seq:
                    eseq = inject_errors(seq, options.error_rate)
                    print('@read%d\n%s\n+\n%s' % (read_index, eseq, 'I' * read_length), file=fastq_out)
                    print('\t'.join([transcript_id, 'sim', 'read', str(pos + 1),
                                     str(pos + read_length), '.', '+', '.',
                                     'read%d' % read_index]), file=gff_out)
                    read_index += 1
                else:
                    print('Missing fasta sequence %s:%d-%d' % (transcript_id, pos, (pos + read_length)), file=sys.stderr)

    fastq_out.close()
    gff_out.close()

    # map back to genome
    subprocess.call('tgff_cgff.py -c %s %s_txome.gff > %s_genome.gff' %
                    (gtf_file, options.output_prefix, options.output_prefix), shell=True)
    return baseArray


###############################################################
###############################################################
#                                                             #
#                        Build some                           #
#                     Random Dataframes                       #
#                    For mass building                        #
#                                                             #
###############################################################
###############################################################

failSeries = expon.rvs(scale=20, size=100)

# Calculate lognorm parameters
muLog = np.log(15 / np.sqrt(1 + (10 / 15**2)))
sigLog = np.sqrt(np.log(1 + 10 / 15**2))
# scipy parameterization: sigma of the underlying normal is the shape, exp(mu) the scale
recoverSeries = lognorm.rvs(sigLog, scale=np.exp(muLog), size=100)

failPerf = 0.1 * uniform.rvs(size=100)
recoveryPerf = 0.9 + 0.2 * uniform.rvs(size=100)

paramArray = pd.DataFrame({'FailTime': failSeries,
                           'RecoverTime': failSeries + recoverSeries,
                           'FailPerformance': failPerf,
                           'RecoveryPerformance': recoveryPerf})

paramArray2 = pd.DataFrame({'FailTime': 15,
                            'RecoverTime': 15 + recoverSeries,
                            'FailPerformance': failPerf,
                            'RecoveryPerformance': recoveryPerf})
from scipy import stats
from scipy.stats import lognorm

rrr = lognorm.rvs(10, loc=0, scale=2, size=1000)
print(rrr[1:10])
print("log normal fit", lognorm.fit(rrr, 5, loc=0, scale=3))
rrr[1:10]

from numpy import rint
from numpy import around

ppp = around(rrr)
print(ppp[1:10])
print(lognorm.fit(ppp, 5, loc=0, scale=3))
# Trying out different broad distributions with linear and logarithmic PDFs:
n_points = 100000

# power law:
# slope = -2!
one_over_rands = 1 / np.random.rand(n_points)  # http://en.wikipedia.org/wiki/Power_law

# exponential distribution
exps = expon.rvs(size=1000)  # http://en.wikipedia.org/wiki/Exponential_distribution

# lognormal (looks like a normal distribution in a log-log scale!)
lognorms = lognorm.rvs(1.0, size=1000)  # http://en.wikipedia.org/wiki/Log-normal_distribution

fig = plt.figure(figsize=(15, 15))
fig.suptitle('Different broad distribution PDFs in lin-lin, log-log, and lin-log axes')
n_bins = 30
for i, (rands, name) in enumerate(zip([one_over_rands, exps, lognorms],
                                      ["power law", "exponential", "lognormal"])):
    # linear-linear scale
    ax = fig.add_subplot(4, 3, i + 1)
    ax.hist(rands, n_bins, density=True)
    ax.text(0.5, 0.9, "PDF, lin-lin: " + name, transform=ax.transAxes)

    # log-log scale
    ax = fig.add_subplot(4, 3, i + 4)
    bins = np.logspace(np.log10(np.min(rands)), np.log10(np.max(rands)), num=n_bins)
def __init__(self, a, b, n, name, pa=0.1, pb=0.9, lognormal=False, Plot=True):
    mscale.register_scale(ProbitScale)

    if Plot:
        fig = plt.figure(facecolor="white")
        ax1 = fig.add_subplot(121, axisbelow=True)
        ax2 = fig.add_subplot(122, axisbelow=True)
        ax1.set_xlabel(name)
        ax1.set_ylabel("ECDF and Best Fit CDF")
        prop = matplotlib.font_manager.FontProperties(size=8)

    if lognormal:
        sigma = (log(b) - log(a)) / ((erfinv(2 * pb - 1) - erfinv(2 * pa - 1)) * (2 ** 0.5))
        mu = log(a) - erfinv(2 * pa - 1) * sigma * (2 ** 0.5)
        cdf = arange(0.001, 1.000, 0.001)
        ppf = [lognorm.ppf(v, sigma, scale=exp(mu)) for v in cdf]
        x = lognorm.rvs(sigma, scale=exp(mu), size=n)
        x.sort()
        print("generating lognormal %s, p50 %0.3f, size %s" % (name, exp(mu), n))
        x_s, ecdf_x = ecdf(x)
        best_fit = lognorm.cdf(x, sigma, scale=exp(mu))
        if Plot:
            ax1.set_xscale("log")
            ax2.set_xscale("log")
        hist_y = lognorm.pdf(x_s, std(log(x)), scale=exp(mu))
    else:
        sigma = (b - a) / ((erfinv(2 * pb - 1) - erfinv(2 * pa - 1)) * (2 ** 0.5))
        mu = a - erfinv(2 * pa - 1) * sigma * (2 ** 0.5)
        cdf = arange(0.001, 1.000, 0.001)
        ppf = [norm.ppf(v, mu, scale=sigma) for v in cdf]
        print("generating normal %s, p50 %0.3f, size %s" % (name, mu, n))
        x = norm.rvs(mu, scale=sigma, size=n)
        x.sort()
        x_s, ecdf_x = ecdf(x)
        best_fit = norm.cdf((x - mean(x)) / std(x))
        hist_y = norm.pdf(x_s, loc=mean(x), scale=std(x))

    if Plot:
        ax1.plot(ppf, cdf, "r-", linewidth=2)
        ax1.set_yscale("probit")
        ax1.plot(x_s, ecdf_x, "o")
        ax1.plot(x, best_fit, "r--", linewidth=2)
        n, bins, patches = ax2.hist(x, density=True, facecolor="green", alpha=0.75)
        bincenters = 0.5 * (bins[1:] + bins[:-1])
        ax2.plot(x_s, hist_y, "r--", linewidth=2)
        ax2.set_xlabel(name)
        ax2.set_ylabel("Histogram and Best Fit PDF")
        ax1.grid(b=True, which="both", color="black", linestyle="-", linewidth=1)
        # ax1.grid(b=True, which='major', color='black', linestyle='--')
        ax2.grid(True)
    return
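# Quick standalone verification (values are assumptions) of the quantile-matching
# step in __init__ above: the p-quantile of a lognormal is
# exp(mu + sigma * sqrt(2) * erfinv(2*p - 1)), so the derived (mu, sigma) should
# reproduce a at probability pa and b at probability pb.
import numpy as np
from scipy.special import erfinv
from scipy.stats import lognorm

a, b, pa, pb = 10.0, 100.0, 0.1, 0.9
sigma = (np.log(b) - np.log(a)) / ((erfinv(2 * pb - 1) - erfinv(2 * pa - 1)) * 2**0.5)
mu = np.log(a) - erfinv(2 * pa - 1) * sigma * 2**0.5
print(lognorm.ppf(pa, sigma, scale=np.exp(mu)),   # ~10
      lognorm.ppf(pb, sigma, scale=np.exp(mu)))   # ~100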