Example #1
def get_percentiles(src_dict):
    from operator import itemgetter            # needed for the value-based sort below
    from scipy import stats as spst            # scipy.stats.stats is a deprecated alias; import scipy.stats directly
    K,V = zip(*sorted(src_dict.items(), key=itemgetter(1)))        # unzip src_dict ordered by value
    u_pct = [spst.percentileofscore(V,v,kind='weak') for v in V]    # upper percentile list (under OR EQUAL)
    l_pct = [spst.percentileofscore(V,v,kind='strict') for v in V]  # lower percentile list (under)
    u_pct_dict = dict(zip(K,map(float,u_pct)))
    l_pct_dict = dict(zip(K,map(float,l_pct)))
    return u_pct_dict, l_pct_dict
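A minimal usage sketch for the function above, on a hypothetical score dictionary (keys and values are illustrative, not from the original source):

scores = {'a': 3.0, 'b': 1.0, 'c': 2.0, 'd': 2.0}   # hypothetical item -> score mapping
upper, lower = get_percentiles(scores)
print(upper['c'], lower['c'])   # 75.0 (share of values <= 2.0) and 25.0 (share of values < 2.0)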
Example #2
def bayesian_random_effects(data, labels, group, n_samples=2000, n_burnin=500):
    import pymc as pm
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import percentileofscore

    # preparing the data
    donors = data[group].unique()
    donors_lookup = dict(zip(donors, range(len(donors))))
    data['donor_code'] = data[group].replace(donors_lookup).values
    n_donors = len(data[group].unique())
    donor_idx = data['donor_code'].values
    
    #setting up the model
    with pm.Model() as hierarchical_model:
        # Hyperpriors for group nodes
        group_intercept_mean = pm.Normal('group intercept (mean)', mu=0., sd=100**2)
        group_intercept_variance = pm.Uniform('group intercept (variance)', lower=0, upper=100)
        group_slope_mean = pm.Normal('group slope (mean)', mu=0., sd=100**2)
        group_slope_variance = pm.Uniform('group slope (variance)', lower=0, upper=100)
        
        individual_intercepts = pm.Normal('individual intercepts', mu=group_intercept_mean, sd=group_intercept_variance, shape=n_donors)
        individual_slopes = pm.Normal('individual slopes', mu=group_slope_mean, sd=group_slope_variance, shape=n_donors)
        
        # Model error
        residuals = pm.Uniform('residuals', lower=0, upper=100)
        
        expression_est =  individual_slopes[donor_idx] * data[labels[0]].values + individual_intercepts[donor_idx]
        
        # Data likelihood
        expression_like = pm.Normal('expression_like', mu=expression_est, sd=residuals, observed=data[labels[1]])

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hierarchical_trace = pm.sample(n_samples, step, start=start, progressbar=True)
        
    mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
    zero_percentile = percentileofscore(hierarchical_trace['group slope (mean)'][n_burnin:], 0)
    print "Mean group level slope was %g (zero was %g percentile of the posterior distribution)"%(mean_slope, zero_percentile)
    
    pm.summary(hierarchical_trace[n_burnin:], vars=['group slope (mean)'])
        
    pm.traceplot(hierarchical_trace[n_burnin:])
    
    selection = donors
    fig, axis = plt.subplots(2, 3, figsize=(12, 6), sharey=True, sharex=True)
    axis = axis.ravel()
    xvals = np.linspace(data[labels[0]].min(), data[labels[0]].max())
    for i, c in enumerate(selection):
        c_data = data.loc[data[group] == c]   # .ix was removed from pandas; use boolean .loc
        c_data = c_data.reset_index(drop=True)
        z = list(c_data['donor_code'])[0]
        for a_val, b_val in zip(hierarchical_trace['individual intercepts'][n_burnin::10][z], hierarchical_trace['individual slopes'][n_burnin::10][z]):
            axis[i].plot(xvals, a_val + b_val * xvals, 'g', alpha=.1)
        axis[i].plot(xvals, hierarchical_trace['individual intercepts'][n_burnin:][z].mean() + hierarchical_trace['individual slopes'][n_burnin:][z].mean() * xvals, 
                     'g', alpha=1, lw=2.)
        axis[i].hexbin(c_data[labels[0]], c_data[labels[1]], mincnt=1, cmap=plt.cm.YlOrRd_r)
        axis[i].set_title(c)
        axis[i].set_xlabel(labels[0])
        axis[i].set_ylabel(labels[1])
        
    plt.show()
        
    return mean_slope, zero_percentile
Example #3
def bayesian_random_effects(data, labels, group, n_samples=2000, n_burnin=500):
    import pymc as pm
    from scipy.stats import percentileofscore

    # preparing the data
    donors = data[group].unique()
    donors_lookup = dict(zip(donors, range(len(donors))))
    data['donor_code'] = data[group].replace(donors_lookup).values
    n_donors = len(data[group].unique())
    donor_idx = data['donor_code'].values
    
    #setting up the model
    with pm.Model() as hierarchical_model:
        # Hyperpriors for group nodes
        group_intercept_mean = pm.Normal('group intercept (mean)', mu=0., sd=100**2)
        group_intercept_variance = pm.Uniform('group intercept (variance)', lower=0, upper=100)
        group_slope_mean = pm.Normal('group slope (mean)', mu=0., sd=100**2)
        group_slope_variance = pm.Uniform('group slope (variance)', lower=0, upper=100)
        
        individual_intercepts = pm.Normal('individual intercepts', mu=group_intercept_mean, sd=group_intercept_variance, shape=n_donors)
        individual_slopes = pm.Normal('individual slopes', mu=group_slope_mean, sd=group_slope_variance, shape=n_donors)
        
        # Model error
        residuals = pm.Uniform('residuals', lower=0, upper=100)
        
        expression_est =  individual_slopes[donor_idx] * data[labels[0]].values + individual_intercepts[donor_idx]
        
        # Data likelihood
        expression_like = pm.Normal('expression_like', mu=expression_est, sd=residuals, observed=data[labels[1]])

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hierarchical_trace = pm.sample(n_samples, step, start=start, progressbar=True)
        
    mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
    zero_percentile = percentileofscore(hierarchical_trace['group slope (mean)'][n_burnin:], 0)
    #print "Mean group level slope was %g (zero was %g percentile of the posterior distribution)"%(mean_slope, zero_percentile)
    
    #pm.summary(hierarchical_trace[n_burnin:], vars=['group slope (mean)'])
        
    #pm.traceplot(hierarchical_trace[n_burnin:])
    
    #selection = donors
    #fig, axis = plt.subplots(2, 3, figsize=(12, 6), sharey=True, sharex=True)
    #axis = axis.ravel()
    #xvals = np.linspace(data[labels[0]].min(), data[labels[0]].max())
    #for i, c in enumerate(selection):
    #    c_data = data.ix[data[group] == c]
    #    c_data = c_data.reset_index(drop = True)
    #    z = list(c_data['donor_code'])[0]
    #    for a_val, b_val in zip(hierarchical_trace['individual intercepts'][n_burnin::10][z], hierarchical_trace['individual slopes'][n_burnin::10][z]):
    #        axis[i].plot(xvals, a_val + b_val * xvals, 'g', alpha=.1)
    #    axis[i].plot(xvals, hierarchical_trace['individual intercepts'][n_burnin:][z].mean() + hierarchical_trace['individual slopes'][n_burnin:][z].mean() * xvals, 
    #                 'g', alpha=1, lw=2.)
    #    axis[i].hexbin(c_data[labels[0]], c_data[labels[1]], mincnt=1, cmap=plt.cm.YlOrRd_r)
    #    axis[i].set_title(c)
    #    axis[i].set_xlabel(labels[0])
    #    axis[i].set_ylabel(labels[1])
    #    
    #plt.show()
        
    return mean_slope, zero_percentile
Example #4
def createFeldmanRanking(protectedCandidates, nonProtectedCandidates, k):
    """
    creates a ranking that promotes the protected candidates by adjusting the distribution of the
    qualifications of the protected and non-protected group

    IMPORTANT: THIS METHOD MODIFIES THE ORIGINAL LIST OF PROTECTED CANDIDATES!
    I.e. it modifies the qualification of the protected candidates. If the original
    list has to be preserved, deep-copy it into a new data structure before passing
    it to this method.

    steps:
        1. take a protected candidate x
        2. determine the percentile of that candidate within their group percentile(x)
        3. find a non-protected candidate y that has the same percentile(y) == percentile(x)
        4. assign the score of y to x
        5. goto 1

    Parameters:
    ----------
    :param protectedCandidates: array of protected candidates
    :param nonProtectedCandidates: array of non-protected candidates
    :param k: length of the ranking to return

    Return:
    ------
    a ranking of protected and non-protected candidates, which tries to have a better share of
    protected and non-protected candidates
    """

    # ensure candidates are sorted by descending qualifications
    protectedCandidates.sort(key=lambda candidate: candidate.qualification,
                             reverse=True)
    nonProtectedCandidates.sort(key=lambda candidate: candidate.qualification,
                                reverse=True)

    protectedQualifications = [
        protectedCandidates[i].qualification
        for i in range(len(protectedCandidates))
    ]
    nonProtectedQualifications = [
        nonProtectedCandidates[i].qualification
        for i in range(len(nonProtectedCandidates))
    ]

    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to required length
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.qualification)
        # find score of a non-protected in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score

    # create a colorblind ranking
    return createFairRanking(k, protectedCandidates, nonProtectedCandidates,
                             ESSENTIALLY_ZERO, 0.1)
Example #5
def fitspline_tilt(date, tilt, days, date_end):
    if len(tilt) < 10:
        return [], [], [], [], [], []

    date_start = date_end - timedelta(days=days)
    subdays = array('f')
    subtilt = array('f')
    d = 0

    while d <= len(date) - 1:
        if date[d] > date_end: break
        days_before = date_end - date[d]
        days_before = 0 - (days_before.days + (days_before.seconds /
                                               (60 * 60 * 24.)))

        if days_before < -days:
            d = d + 1
            continue

        elif days_before > 0:
            break

        else:
            subdays.append(days_before)
            subtilt.append(tilt[d])
            d = d + 1

    if len(subtilt) < 10:
        return [], [], [], [], [], []

    subtilt = np.asarray(subtilt)

    s0 = UnivariateSpline(subdays, subtilt, s=0)
    d2s0 = abs(s0(subdays, 2))
    weight = [
        100 - st.percentileofscore(d2s0, score, kind='rank') for score in d2s0
    ]
    sf = round(np.var(d2s0) / 100., 1)

    s = UnivariateSpline(subdays, subtilt, w=weight, s=sf)

    if np.sum([a for a in np.isnan(s(subdays))]) > 0:
        s = UnivariateSpline(subdays, subtilt, w=weight)

    if days <= 30:
        subdays_fine = np.linspace(min(subdays), max(subdays),
                                   int((max(subdays) - min(subdays)) * 50))  # np.linspace needs an integer count

    elif days <= 90:
        subdays_fine = np.linspace(min(subdays), max(subdays),
                                   int((max(subdays) - min(subdays)) * 3))

    else:
        subdays_fine = np.linspace(min(subdays), max(subdays), 100)

    subtilt_fine = s(subdays_fine)
    subdtilt_fine = s(subdays_fine, 1)

    return subtilt, subdays, subtilt_fine, subdtilt_fine, subdays_fine, s
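A minimal call sketch for fitspline_tilt with synthetic data (hypothetical values; assumes the module-level imports the function relies on — numpy as np, scipy.stats as st, UnivariateSpline, array, and timedelta — are in place):

from datetime import datetime, timedelta
import numpy as np

dates = [datetime(2020, 1, 1) + timedelta(hours=6 * i) for i in range(200)]    # ~50 days of readings
tilt = list(np.sin(np.linspace(0, 8, 200)) + np.random.normal(0, 0.05, 200))  # synthetic tilt signal
subtilt, subdays, subtilt_fine, subdtilt_fine, subdays_fine, spline = fitspline_tilt(
    dates, tilt, days=30, date_end=dates[-1])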
Example #6
def permutation_test(X_scaled,
                     Y_scaled,
                     X_saliences,
                     Y_saliences,
                     singular_values,
                     inertia,
                     n_perm,
                     verbose=False,
                     algorithm="randomized"):
    n_components = X_saliences.shape[1]
    singular_values_samples = np.zeros((n_components, n_perm))

    if verbose:
        my_perc = pyprind.ProgBar(n_perm,
                                  stream=1,
                                  title='running permutations',
                                  monitor=True)
        #import warnings
        #warnings.filterwarnings("ignore")
    for perm_i in range(n_perm):
        _permute_and_calc_singular_values(X_scaled,
                                          Y_scaled,
                                          X_saliences,
                                          Y_saliences,
                                          singular_values_samples,
                                          perm_i,
                                          n_components,
                                          algorithm=algorithm)
        if verbose:
            my_perc.update()
    if verbose:
        print(my_perc)
        print("calculating p values")

    saliences_p_vals = np.zeros((n_components, ))
    for component_i in range(n_components):
        # plain float division suffices here; old_div from the future/past package is unnecessary
        saliences_p_vals[component_i] = (
            100.0 - percentileofscore(singular_values_samples[component_i, :],
                                      singular_values[component_i])) / 100.0

    inertia_p_val = (
        100.0 -
        percentileofscore(singular_values_samples.sum(axis=0), inertia)) / 100.0

    return saliences_p_vals, inertia_p_val
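The p-value construction used above, shown in isolation on toy numbers (hypothetical values): the observed statistic's percentile within the permutation null distribution is converted to an upper-tail p-value.

import numpy as np
from scipy.stats import percentileofscore

null_distribution = np.random.normal(0.0, 1.0, size=10000)   # toy permutation null
observed = 2.0                                                # toy observed statistic
p_value = (100.0 - percentileofscore(null_distribution, observed)) / 100.0
print(p_value)   # roughly the upper-tail probability of the observed value under the null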
Example #7
 def month_wise_company_relative_score(self, data, company_val):
     company_val = int(company_val)
     interval_dict = dict(fetch_data_list(IntervalData))
     crs_dataset = {}
     for e_id,e_name in interval_dict.items():
         percentile_of_score = 0.0
         filtered_data = map(lambda x: (list(x).pop(0), float(list(x).pop(2))), filter(lambda x: x[1] == e_id, data))
         filtered_data_dict = dict(filtered_data)
         if company_val in filtered_data_dict:  # dict.has_key was removed in Python 3
             company_rating = filtered_data_dict[company_val]
             del filtered_data_dict[company_val]
             percentile_of_score = stats.percentileofscore(list(filtered_data_dict.values()), company_rating, kind='mean')
         crs_dataset[e_name] = round(percentile_of_score, 2)
     return crs_dataset
Example #8
def feldmanRanking(protectedCandidates, nonProtectedCandidates, k,
                   dataSetName):

    # ensure candidates are sorted by descending qualifications
    nonProtectedCandidates.sort(key=lambda candidate: candidate.learnedScores,
                                reverse=True)
    nonProtectedQualifications = [
        nonProtectedCandidates[i].learnedScores
        for i in range(len(nonProtectedCandidates))
    ]

    protectedCandidates.sort(key=lambda candidate: candidate.learnedScores,
                             reverse=True)
    protectedQualifications = [
        protectedCandidates[i].learnedScores
        for i in range(len(protectedCandidates))
    ]

    ranking = []

    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to required length
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.learnedScores)
        # find score of a non-protected in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score
        ranking.append(candidate)

    ranking += nonProtectedCandidates

    # create a colorblind ranking
    ranking.sort(key=lambda candidate: candidate.qualification, reverse=True)

    rankingResultsPath = "FeldmanEtAl/" + dataSetName + "ranking.csv"

    return ranking, rankingResultsPath
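Both Feldman-style examples (#4 and #8) rely on the same quantile-mapping idiom; a standalone sketch with toy score arrays (hypothetical values) is below.

import numpy as np
from scipy.stats import percentileofscore, scoreatpercentile

protected_scores = np.array([0.2, 0.4, 0.5, 0.7])        # toy qualifications of the protected group
non_protected_scores = np.array([0.3, 0.6, 0.8, 0.9])    # toy qualifications of the non-protected group

x = 0.5                                                   # one protected candidate's score
p = percentileofscore(protected_scores, x)                # its percentile within its own group
mapped = scoreatpercentile(non_protected_scores, p)       # score at the same percentile in the other group
print(p, mapped)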
Example #9
    print("\nRESULTS\n")
    for FS in FS_list: 

        if a==0:
            analysis='seepage, ru'
        else:
            print("\nEarthquake analysis not yet available.")
            break
            analysis='earthquake, kh'

        a=a+1 
        # plot a histogram of the factor-of-safety samples
        plt.figure()
        plt.hist(FS, bins=25, density=True, cumulative=False, histtype='bar')  # 'normed' was removed from matplotlib; use density
        
        stats1_0=st.percentileofscore(FS,1.0)
        stats1_2=st.percentileofscore(FS,1.2)
        plt.title('Factor of safety analysis (with '+analysis+')\n%FS<1: '+str(round(stats1_0))+'       %FS<1.2: '+str(round(stats1_2)))
        plt.xlabel("Factor of safety")
        plt.ylabel("normed frequency")
        print(analysis + " analysis")
        print('%FS<1.0: ', round(stats1_0, 1))
        print('%FS<1.2: ', round(stats1_2, 1), end='')
        if round(stats1_2, 1) > 0.05:
            print("; Failure by infinite slope mechanism is admissible.")
        else:
            print("; Failure by infinite slope mechanism is NOT admissible.")
        print('mean FS: ', round(np.mean(FS), 1))


        #parameters where FS~1:
Example #10
def permutation_test(X_scaled,
                     Y_scaled,
                     X_saliences,
                     Y_saliences,
                     singular_values,
                     inertia,
                     n_perm,
                     verbose=True,
                     algorithm="randomized"):
    n_components = X_saliences.shape[1]
    print("Starting permutations")
    #if verbose:
    #    my_perc = pyprind.ProgBar(n_perm, stream=1, title='running permutations', monitor=True)

    #import warnings
    #warnings.filterwarnings("ignore")

    ################################################################################
    #create pool to run permutations in parallel
    #procrustes=False
    iterable = np.arange(n_perm)
    P = Pool(processes=20)
    func = partial(_permute_and_calc_singular_values_pool, X_scaled, Y_scaled,
                   X_saliences, Y_saliences, n_components, True, algorithm)
    results = P.map(func, iterable)
    P.close()
    P.join()

    #cpu-count
    #multiprocessing.cpu_count()
    #if verbose:
    #     my_perc.update()

    #if verbose:
    #    print my_perc
    #    print "calculating p values"
    ################################################################################
    ################################################################################
    #use a list of processes and output queue
    #    output = Queue()
    #

    #    # Setup a list of processes that we want to run
    #    processes = [Process(target=_permute_and_calc_singular_values, args=(X_scaled, Y_scaled, a, b, n_components, algorithm, output, x)) for x in range(4)]
    #
    #    # Run processes
    #    for p in processes:
    #        p.start()
    #
    #    # Exit the completed processes
    #    for p in processes:
    #        p.join()
    #
    #    # Get process results from the output queue
    #    results = [output.get() for p in processes]
    #
    #    print(results)
    ################################################################################
    print("end permutations")
    singular_values_samples = np.array(results).reshape(
        n_perm, n_components)  #reshape results from list to np.array
    singvals_p_vals = np.zeros((n_components, ))  # initialize per-component p-values
    for component_i in range(n_components):
        #percentileofscore compares rank to list of ranks (here singular value of component to bootstrapped
        #list of singular values
        singvals_p_vals[component_i] = (
            100.0 - percentileofscore(singular_values_samples[:, component_i],
                                      singular_values[component_i])) / 100.0

    # inertia describes the total explained variance; the samples array is shaped
    # (n_perm, n_components), so sum over components within each permutation (axis=1)
    inertia_p_val = (100.0 - percentileofscore(
        singular_values_samples.sum(axis=1), inertia)) / 100.0

    return singvals_p_vals, inertia_p_val, singular_values_samples
Example #11
    # Data likelihood
    model_like = pm.Normal('model_like',
                           mu=model_est,
                           sd=residuals,
                           observed=labels['loss_avg_dec_thr'])

    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    hierarchical_trace = pm.sample(n_samples,
                                   step,
                                   start=start,
                                   tune=1000,
                                   progressbar=True)

mean_slope = hierarchical_trace['group slope (mean)'][n_burnin:].mean()
zero_percentile = percentileofscore(
    hierarchical_trace['group slope (mean)'][n_burnin:], 0)
print("Mean group level slope was %g (zero was %g percentile of the posterior distribution)" % (
    mean_slope, zero_percentile))

#show the distribution of the group slope (mean)
#pm.summary(hierarchical_trace[n_burnin:], hierarchical_trace['group slope (mean)'])
#optionally, show distribution of the group & individual intercepts, mean, variance, and residuals
pm.summary(hierarchical_trace[n_burnin:])

#traceplot
pm.traceplot(hierarchical_trace[n_burnin:])

#plot regression slopes across studies
# selection = studies
# fig, axis = plt.subplots(2, 3, figsize=(12, 6), sharey=True, sharex=True)
# axis = axis.ravel()
#plt.hist(sing_vals_distr[:,comp])
#plt.hist(sing_vals[comp])
#plt.xlim(-50)
plt.ylabel('singular value of first component', fontsize=22)
plt.xlabel('# permutations', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.tick_params(axis='both', which='minor', labelsize=18)
#plt.show()


##inertia/total amount of variance explained
inertia=np.load('inertia.npy')
inertia_sampled=np.load("singular_values_sampled.npy")
#p-value calculated with the script used to be wrong, inertia_p=np.load('inertia_p.npy')
#corrected:
inertia_p_val = (100.0-percentileofscore(inertia_sampled.sum(axis=1), inertia))/100.0
print("inertia p (fraction of permutation values exceeding the original): %.5f" % (len(inertia_sampled[inertia_sampled > inertia]) / np.shape(inertia_sampled)[0]))
print("inertia p (from percentileofscore on the summed permutations): %.5f" % inertia_p_val)



plt.figure()
plt.hist(inertia_sampled.sum(axis=1))
#plt.show()

#explained variance
fig3=plt.figure()
plt.plot(sing_vals, 'ro')
exVar = sing_vals[comp] / inertia
print("explained variance of comp %i : %.3f" % (comp, exVar))
#plt.show()
Example #12
                #13. Deterministic results for current H,B and material property (Global Minimum)
                FS_bis_list=np.asarray(FS_bis_list)
                ch=np.asarray(ch)
                ck=np.asarray(ck)
                cR=np.asarray(cR)
                cx1=np.asarray(cx1)
                cx2=np.asarray(cx2)
                exit_pt=cx1*100/(H/tan(rad(B)))
                if plot_deterministic:
                    plt.figure(100)
                    plot_FS_results(FS_bis_list,maxFS,ch,ck,cR,cx1,cx2,H,B,min_x1_c,max_x2_c,print_arc_centers)
                    print("")
                    print("min FS: ", round(FS_bis.min(), 2))
                    print("mean FS: ", round(FS_bis.mean(), 2))
                    print("max FS: ", round(FS_bis.max(), 2))
                    print("% FS<1: ", round(st.percentileofscore(FS_bis, 1.0), 2))
                    print("toe exits: ", np.mean(exit_pt))
                    plt.show()
                HB_FS.append(FS_bis_list.min())
                HB_ch.append(ch[np.argmin(FS_bis_list)])
                HB_ck.append(ck[np.argmin(FS_bis_list)])
                HB_cR.append(cR[np.argmin(FS_bis_list)])
                HB_cx1.append(cx1[np.argmin(FS_bis_list)])
                HB_cx2.append(cx2[np.argmin(FS_bis_list)])
                

            #14. Probabilistic results for current H,B and all material properties (Overall slope)
            HB_FS=np.asarray(HB_FS)
            HB_ch=np.asarray(HB_ch)
            HB_ck=np.asarray(HB_ck)
            HB_cR=np.asarray(HB_cR)