def get_percentiles(src_dict):
    from operator import itemgetter
    from scipy import stats as spst

    # unzip the src_dict ordered by value
    K, V = zip(*sorted(src_dict.items(), key=itemgetter(1)))
    # upper percentile list: percentage of scores below OR EQUAL to each value
    u_pct = [spst.percentileofscore(V, v, kind='weak') for v in V]
    # lower percentile list: percentage of scores strictly below each value
    l_pct = [spst.percentileofscore(V, v, kind='strict') for v in V]
    u_pct_dict = dict(zip(K, map(float, u_pct)))
    l_pct_dict = dict(zip(K, map(float, l_pct)))
    return u_pct_dict, l_pct_dict
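# Minimal usage sketch for get_percentiles; the keys and values below are made up
# purely for illustration.
scores = {'a': 10, 'b': 20, 'c': 20, 'd': 30}
upper, lower = get_percentiles(scores)
print(upper['b'], lower['b'])  # 75.0 25.0: three of four values are <= 20, one of four is < 20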
def bayesian_random_effects(data, labels, group, n_samples=2000, n_burnin=500):
    import pymc as pm
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import percentileofscore

    # preparing the data
    donors = data[group].unique()
    donors_lookup = dict(zip(donors, range(len(donors))))
    data['donor_code'] = data[group].replace(donors_lookup).values
    n_donors = len(data[group].unique())
    donor_idx = data['donor_code'].values

    # setting up the model
    with pm.Model() as hierarchical_model:
        # Hyperpriors for group nodes
        group_intercept_mean = pm.Normal('group intercept (mean)', mu=0., sd=100**2)
        group_intercept_variance = pm.Uniform('group intercept (variance)', lower=0, upper=100)
        group_slope_mean = pm.Normal('group slope (mean)', mu=0., sd=100**2)
        group_slope_variance = pm.Uniform('group slope (variance)', lower=0, upper=100)

        # per-donor intercepts and slopes drawn from the group-level distributions
        individual_intercepts = pm.Normal('individual intercepts', mu=group_intercept_mean,
                                          sd=group_intercept_variance, shape=n_donors)
        individual_slopes = pm.Normal('individual slopes', mu=group_slope_mean,
                                      sd=group_slope_variance, shape=n_donors)

        # Model error
        residuals = pm.Uniform('residuals', lower=0, upper=100)

        expression_est = individual_slopes[donor_idx] * data[labels[0]].values + individual_intercepts[donor_idx]

        # Data likelihood
        expression_like = pm.Normal('expression_like', mu=expression_est, sd=residuals,
                                    observed=data[labels[1]])

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hierarchical_trace = pm.sample(n_samples, step, start=start, progressbar=True)

    mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
    zero_percentile = percentileofscore(hierarchical_trace['group slope (mean)'][n_burnin:], 0)
    print("Mean group level slope was %g (zero was %g percentile of the posterior distribution)"
          % (mean_slope, zero_percentile))

    pm.summary(hierarchical_trace[n_burnin:], vars=['group slope (mean)'])
    pm.traceplot(hierarchical_trace[n_burnin:])

    # plot the per-donor regression lines over the data
    selection = donors
    fig, axis = plt.subplots(2, 3, figsize=(12, 6), sharey=True, sharex=True)
    axis = axis.ravel()
    xvals = np.linspace(data[labels[0]].min(), data[labels[0]].max())
    for i, c in enumerate(selection):
        c_data = data.loc[data[group] == c]
        c_data = c_data.reset_index(drop=True)
        z = list(c_data['donor_code'])[0]
        # thinned posterior samples of this donor's intercept and slope
        for a_val, b_val in zip(hierarchical_trace['individual intercepts'][n_burnin::10, z],
                                hierarchical_trace['individual slopes'][n_burnin::10, z]):
            axis[i].plot(xvals, a_val + b_val * xvals, 'g', alpha=.1)
        axis[i].plot(xvals,
                     hierarchical_trace['individual intercepts'][n_burnin:, z].mean() +
                     hierarchical_trace['individual slopes'][n_burnin:, z].mean() * xvals,
                     'g', alpha=1, lw=2.)
        axis[i].hexbin(c_data[labels[0]], c_data[labels[1]], mincnt=1, cmap=plt.cm.YlOrRd_r)
        axis[i].set_title(c)
        axis[i].set_xlabel(labels[0])
        axis[i].set_ylabel(labels[1])
    plt.show()

    return mean_slope, zero_percentile
def bayesian_random_effects(data, labels, group, n_samples=2000, n_burnin=500):
    import pymc as pm
    from scipy.stats import percentileofscore

    # preparing the data
    donors = data[group].unique()
    donors_lookup = dict(zip(donors, range(len(donors))))
    data['donor_code'] = data[group].replace(donors_lookup).values
    n_donors = len(data[group].unique())
    donor_idx = data['donor_code'].values

    # setting up the model
    with pm.Model() as hierarchical_model:
        # Hyperpriors for group nodes
        group_intercept_mean = pm.Normal('group intercept (mean)', mu=0., sd=100**2)
        group_intercept_variance = pm.Uniform('group intercept (variance)', lower=0, upper=100)
        group_slope_mean = pm.Normal('group slope (mean)', mu=0., sd=100**2)
        group_slope_variance = pm.Uniform('group slope (variance)', lower=0, upper=100)

        individual_intercepts = pm.Normal('individual intercepts', mu=group_intercept_mean,
                                          sd=group_intercept_variance, shape=n_donors)
        individual_slopes = pm.Normal('individual slopes', mu=group_slope_mean,
                                      sd=group_slope_variance, shape=n_donors)

        # Model error
        residuals = pm.Uniform('residuals', lower=0, upper=100)

        expression_est = individual_slopes[donor_idx] * data[labels[0]].values + individual_intercepts[donor_idx]

        # Data likelihood
        expression_like = pm.Normal('expression_like', mu=expression_est, sd=residuals,
                                    observed=data[labels[1]])

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hierarchical_trace = pm.sample(n_samples, step, start=start, progressbar=True)

    mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
    zero_percentile = percentileofscore(hierarchical_trace['group slope (mean)'][n_burnin:], 0)

    # The summary printout, traceplot, and per-donor regression plots (identical to the
    # previous example) are disabled in this variant.

    return mean_slope, zero_percentile
from scipy.stats import percentileofscore, scoreatpercentile


def createFeldmanRanking(protectedCandidates, nonProtectedCandidates, k):
    """
    Creates a ranking that promotes the protected candidates by adjusting the distribution of the
    qualifications of the protected and non-protected group.

    IMPORTANT: THIS METHOD MODIFIES THE ORIGINAL LIST OF PROTECTED CANDIDATES!
    I.e. it modifies the qualification of the protected candidates. If the original list has to be
    preserved, it has to be deep-copied into a new data structure before being handed over to this
    method.

    Steps:
        1. take a protected candidate x
        2. determine the percentile of that candidate within their group, percentile(x)
        3. find a non-protected candidate y that has the same percentile, percentile(y) == percentile(x)
        4. assign the score of y to x
        5. go to 1

    Parameters
    ----------
    :param protectedCandidates: array of protected candidates
    :param nonProtectedCandidates: array of non-protected candidates
    :param k: length of the ranking to return

    Return
    ------
    a ranking of protected and non-protected candidates, which tries to have a better share of
    protected and non-protected candidates
    """

    # ensure candidates are sorted by descending qualifications
    protectedCandidates.sort(key=lambda candidate: candidate.qualification, reverse=True)
    nonProtectedCandidates.sort(key=lambda candidate: candidate.qualification, reverse=True)

    protectedQualifications = [
        protectedCandidates[i].qualification for i in range(len(protectedCandidates))
    ]
    nonProtectedQualifications = [
        nonProtectedCandidates[i].qualification for i in range(len(nonProtectedCandidates))
    ]

    # create the same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to the required length;
            # the rest will not be considered anyway
            break
        # find the percentile of the protected candidate
        p = percentileofscore(protectedQualifications, candidate.qualification)
        # find the score of a non-protected candidate at the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score

    # create a colorblind ranking
    # (createFairRanking and ESSENTIALLY_ZERO are defined elsewhere in this module)
    return createFairRanking(k, protectedCandidates, nonProtectedCandidates, ESSENTIALLY_ZERO, 0.1)
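# Minimal, self-contained sketch of the percentile-matching step used above, shown on plain
# score lists instead of candidate objects; the numbers are made up for illustration only.
from scipy.stats import percentileofscore, scoreatpercentile

protected_scores = [0.2, 0.4, 0.5, 0.7]        # qualifications of the protected group
non_protected_scores = [0.3, 0.6, 0.8, 0.9]    # qualifications of the non-protected group

repaired = []
for score in protected_scores:
    # percentile of this candidate within the protected group ...
    p = percentileofscore(protected_scores, score)
    # ... mapped onto the non-protected distribution at the same percentile
    repaired.append(scoreatpercentile(non_protected_scores, p))
print(repaired)  # each protected score is replaced by the non-protected score at the same percentile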
from array import array
from datetime import timedelta
import numpy as np
from scipy import stats as st
from scipy.interpolate import UnivariateSpline


def fitspline_tilt(date, tilt, days, date_end):
    if len(tilt) < 10:
        return [], [], [], [], [], []

    date_start = date_end - timedelta(days=days)
    subdays = array('f')
    subtilt = array('f')
    d = 0
    # collect the samples that fall within the last `days` days before date_end,
    # expressed as (negative) days before date_end
    while d <= len(date) - 1:
        if date[d] > date_end:
            break
        days_before = date_end - date[d]
        days_before = 0 - (days_before.days + (days_before.seconds / (60 * 60 * 24.)))
        if days_before < -days:
            d = d + 1
            continue
        elif days_before > 0:
            break
        else:
            subdays.append(days_before)
            subtilt.append(tilt[d])
            d = d + 1

    if len(subtilt) < 10:
        return [], [], [], [], [], []

    subtilt = np.asarray(subtilt)
    # interpolating spline, used only to estimate the second derivative at each point
    s0 = UnivariateSpline(subdays, subtilt, s=0)
    d2s0 = abs(s0(subdays, 2))
    # down-weight points with large curvature: weight = 100 - percentile rank of |d2|
    weight = [
        100 - st.percentileofscore(d2s0, score, kind='rank') for score in d2s0
    ]
    sf = round(np.var(d2s0) / 100., 1)
    s = UnivariateSpline(subdays, subtilt, w=weight, s=sf)
    if np.sum([a for a in np.isnan(s(subdays))]) > 0:
        s = UnivariateSpline(subdays, subtilt, w=weight)

    # evaluation grid: finer for shorter windows
    if days <= 30:
        subdays_fine = np.linspace(min(subdays), max(subdays), int((max(subdays) - min(subdays)) * 50))
    elif days <= 90:
        subdays_fine = np.linspace(min(subdays), max(subdays), int((max(subdays) - min(subdays)) * 3))
    else:
        subdays_fine = np.linspace(min(subdays), max(subdays), 100)

    subtilt_fine = s(subdays_fine)
    subdtilt_fine = s(subdays_fine, 1)
    return subtilt, subdays, subtilt_fine, subdtilt_fine, subdays_fine, s
import numpy as np
import pyprind
from past.utils import old_div
from scipy.stats import percentileofscore


def permutation_test(X_scaled, Y_scaled, X_saliences, Y_saliences, singular_values,
                     inertia, n_perm, verbose=False, algorithm="randomized"):
    # _permute_and_calc_singular_values is defined elsewhere in this module
    n_components = X_saliences.shape[1]
    singular_values_samples = np.zeros((n_components, n_perm))

    if verbose:
        my_perc = pyprind.ProgBar(n_perm, stream=1, title='running permutations', monitor=True)

    # import warnings
    # warnings.filterwarnings("ignore")

    for perm_i in range(n_perm):
        _permute_and_calc_singular_values(X_scaled, Y_scaled, X_saliences, Y_saliences,
                                          singular_values_samples, perm_i, n_components,
                                          algorithm=algorithm)
        if verbose:
            my_perc.update()

    if verbose:
        print(my_perc)
        print("calculating p values")

    # p-value per component: fraction of permuted singular values that reach or exceed the observed one
    saliences_p_vals = np.zeros((n_components, ))
    for component_i in range(n_components):
        saliences_p_vals[component_i] = old_div(
            (100.0 - percentileofscore(singular_values_samples[component_i, :],
                                       singular_values[component_i])), 100.0)

    # inertia (total explained variance): compare against the per-permutation totals
    inertia_p_val = old_div(
        (100.0 - percentileofscore(singular_values_samples.sum(axis=0), inertia)), 100.0)

    return saliences_p_vals, inertia_p_val
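# Minimal, self-contained sketch of the p-value step used above: an observed statistic is
# compared against a null distribution of permuted statistics, and
# p = (100 - percentileofscore(null, observed)) / 100 is the fraction of permuted values
# that reach or exceed the observed one. The data below are synthetic, for illustration only.
import numpy as np
from scipy.stats import percentileofscore

rng = np.random.default_rng(0)
null_singular_values = rng.normal(loc=10.0, scale=1.0, size=1000)  # statistics from permuted data
observed_singular_value = 12.5                                     # statistic from the unpermuted data
p_value = (100.0 - percentileofscore(null_singular_values, observed_singular_value)) / 100.0
print(p_value)  # small p-value: the observed value exceeds most of the null distribution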
from scipy import stats


def month_wise_company_relative_score(self, data, company_val):
    # fetch_data_list and IntervalData are defined elsewhere in this project
    company_val = int(company_val)
    interval_dict = dict(fetch_data_list(IntervalData))
    crs_dataset = {}
    for e_id, e_name in interval_dict.items():
        percentile_of_score = 0.0
        # keep (company_id, rating) pairs for rows belonging to this interval
        filtered_data_dict = dict(
            (row[0], float(row[2])) for row in data if row[1] == e_id)
        if company_val in filtered_data_dict:
            company_rating = filtered_data_dict[company_val]
            # rank the company's rating against its peers, excluding the company itself
            del filtered_data_dict[company_val]
            percentile_of_score = stats.percentileofscore(
                list(filtered_data_dict.values()), company_rating, kind='mean')
        crs_dataset[e_name] = round(percentile_of_score, 2)
    return crs_dataset
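# Minimal sketch of the per-interval step above, with hypothetical ratings for illustration only:
from scipy import stats

ratings = {101: 3.2, 102: 4.1, 103: 2.8, 104: 3.9}   # company_id -> rating within one interval
company_val = 102
company_rating = ratings.pop(company_val)             # remove the company from its peer group
relative_score = stats.percentileofscore(list(ratings.values()), company_rating, kind='mean')
print(round(relative_score, 2))  # 100.0: the company outrates all remaining peers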
from scipy.stats import percentileofscore, scoreatpercentile


def feldmanRanking(protectedCandidates, nonProtectedCandidates, k, dataSetName):
    # ensure candidates are sorted by descending qualifications
    nonProtectedCandidates.sort(key=lambda candidate: candidate.learnedScores, reverse=True)
    nonProtectedQualifications = [
        nonProtectedCandidates[i].learnedScores for i in range(len(nonProtectedCandidates))
    ]

    protectedCandidates.sort(key=lambda candidate: candidate.learnedScores, reverse=True)
    protectedQualifications = [
        protectedCandidates[i].learnedScores for i in range(len(protectedCandidates))
    ]

    ranking = []
    # create the same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to the required length;
            # the rest will not be considered anyway
            break
        # find the percentile of the protected candidate
        p = percentileofscore(protectedQualifications, candidate.learnedScores)
        # find the score of a non-protected candidate at the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score
        ranking.append(candidate)

    ranking += nonProtectedCandidates

    # create a colorblind ranking
    ranking.sort(key=lambda candidate: candidate.qualification, reverse=True)

    rankingResultsPath = "FeldmanEtAl/" + dataSetName + "ranking.csv"
    return ranking, rankingResultsPath
print "\nRESULTS\n" for FS in FS_list: if a==0: analysis='seepage, ru' else: print "\nEarthquake analysis not yet available." break analysis='earthquake, kh' a=a+1 #plotting histogram of FS plt.figure() plt.hist(FS,bins=25,normed=True, cumulative=False, histtype='bar',) stats1_0=st.percentileofscore(FS,1.0) stats1_2=st.percentileofscore(FS,1.2) plt.title('Factor of safety analysis (with '+analysis+')\n%FS<1: '+str(round(stats1_0))+' %FS<1.2: '+str(round(stats1_2))) plt.xlabel("Factor of safety") plt.ylabel("normed frequency") print analysis+" analysis" print '%FS<1.0: ',round(stats1_0,1) print '%FS<1.2: ',round(stats1_2,1), if round(stats1_2,1)>0.05: print "; Failure by infinite slope mechanism is admissible." else: print "; Failure by infinite slope mechanism is NOT admissible." print 'mean FS: ', round(np.mean(FS),1) #parameters where FS~1:
from functools import partial
from multiprocessing import Pool
import numpy as np
from scipy.stats import percentileofscore


def permutation_test(X_scaled, Y_scaled, X_saliences, Y_saliences, singular_values,
                     inertia, n_perm, verbose=True, algorithm="randomized"):
    # _permute_and_calc_singular_values_pool is defined elsewhere in this module
    n_components = X_saliences.shape[1]
    print("Starting permutations")

    # if verbose:
    #     my_perc = pyprind.ProgBar(n_perm, stream=1, title='running permutations', monitor=True)
    # import warnings
    # warnings.filterwarnings("ignore")

    # run the permutations in parallel with a worker pool
    iterable = np.arange(n_perm)
    P = Pool(processes=20)
    func = partial(_permute_and_calc_singular_values_pool, X_scaled, Y_scaled,
                   X_saliences, Y_saliences, n_components, True, algorithm)
    results = P.map(func, iterable)
    P.close()
    P.join()
    # cpu-count: multiprocessing.cpu_count()

    # alternative (commented out): spawn explicit Processes and collect results from a Queue
    # output = Queue()
    # processes = [Process(target=_permute_and_calc_singular_values,
    #                      args=(X_scaled, Y_scaled, a, b, n_components, algorithm, output, x))
    #              for x in range(4)]
    # for p in processes:
    #     p.start()
    # for p in processes:
    #     p.join()
    # results = [output.get() for p in processes]
    # print(results)

    print("end permutations")

    # reshape the list of results into an (n_perm, n_components) array of permuted singular values
    singular_values_samples = np.array(results).reshape(n_perm, n_components)

    singvals_p_vals = np.zeros((n_components))
    for component_i in range(n_components):
        # percentileofscore compares the observed singular value of each component to the
        # permuted (null) distribution of singular values
        singvals_p_vals[component_i] = (
            100.0 - percentileofscore(singular_values_samples[:, component_i],
                                      singular_values[component_i])) / 100.0

    # inertia describes the explained variance; compare it against the per-permutation totals
    # (sum over components, axis=1; an earlier version summed over axis=0, which was incorrect)
    inertia_p_val = (100.0 - percentileofscore(
        singular_values_samples.sum(axis=1), inertia)) / 100.0

    return singvals_p_vals, inertia_p_val, singular_values_samples
# Data likelihood
model_like = pm.Normal('model_like', mu=model_est, sd=residuals,
                       observed=labels['loss_avg_dec_thr'])

start = pm.find_MAP()
step = pm.NUTS(scaling=start)
hierarchical_trace = pm.sample(n_samples, step, start=start, tune=1000, progressbar=True)

mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
zero_percentile = percentileofscore(
    hierarchical_trace['group slope (mean)'][n_burnin:], 0)
print("Mean group level slope was %g (zero was %g percentile of the posterior distribution)" % (
    mean_slope, zero_percentile))

# show the distribution of the group slope (mean)
# pm.summary(hierarchical_trace[n_burnin:], hierarchical_trace['group slope (mean)'])

# optionally, show distribution of the group & individual intercepts, mean, variance, and residuals
pm.summary(hierarchical_trace[n_burnin:])

# traceplot
pm.traceplot(hierarchical_trace[n_burnin:])

# plot regression slopes across studies
# selection = studies
# fig, axis = plt.subplots(2, 3, figsize=(12, 6), sharey=True, sharex=True)
# axis = axis.ravel()
# plt.hist(sing_vals_distr[:, comp])
# plt.hist(sing_vals[comp])
# plt.xlim(-50)
plt.ylabel('singular value of first component', fontsize=22)
plt.xlabel('# permutations', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.tick_params(axis='both', which='minor', labelsize=18)
# plt.show()

## inertia / total amount of variance explained
inertia = np.load('inertia.npy')
inertia_sampled = np.load("singular_values_sampled.npy")
# the p-value calculated with the script used to be wrong,
inertia_p = np.load('inertia_p.npy')
# corrected:
inertia_p_val = (100.0 - percentileofscore(inertia_sampled.sum(axis=1), inertia)) / 100.0
print("inertia calculated based on original and sum of permutations p: %.5f"
      % (len(inertia_sampled[inertia_sampled > inertia]) / np.shape(inertia_sampled)[0]))
print("inertia calculated based on original and sum of permutations p: %.5f" % (inertia_p_val))

plt.figure()
plt.hist(inertia_sampled.sum(axis=1))
# plt.show()

# explained variance
fig3 = plt.figure()
plt.plot(sing_vals, 'ro')
exVar = sing_vals[comp] / inertia
print("explained variance of comp %i : %.3f" % (comp, exVar))
# plt.show()
# 13. Deterministic results for current H, B and material property (Global Minimum)
FS_bis_list = np.asarray(FS_bis_list)
ch = np.asarray(ch)
ck = np.asarray(ck)
cR = np.asarray(cR)
cx1 = np.asarray(cx1)
cx2 = np.asarray(cx2)
exit_pt = cx1 * 100 / (H / tan(rad(B)))

if plot_deterministic:
    plt.figure(100)
    plot_FS_results(FS_bis_list, maxFS, ch, ck, cR, cx1, cx2, H, B, min_x1_c, max_x2_c, print_arc_centers)
    print("")
    print("min FS: ", round(FS_bis.min(), 2))
    print("mean FS: ", round(FS_bis.mean(), 2))
    print("max FS: ", round(FS_bis.max(), 2))
    print("% FS<1: ", round(st.percentileofscore(FS_bis, 1.0), 2))
    print("toe exits: ", np.mean(exit_pt))
    plt.show()

HB_FS.append(FS_bis_list.min())
HB_ch.append(ch[np.argmin(FS_bis_list)])
HB_ck.append(ck[np.argmin(FS_bis_list)])
HB_cR.append(cR[np.argmin(FS_bis_list)])
HB_cx1.append(cx1[np.argmin(FS_bis_list)])
HB_cx2.append(cx2[np.argmin(FS_bis_list)])

# 14. Probabilistic results for current H, B and all material properties (Overall slope)
HB_FS = np.asarray(HB_FS)
HB_ch = np.asarray(HB_ch)
HB_ck = np.asarray(HB_ck)
HB_cR = np.asarray(HB_cR)