def _skewed_distributions(ax):
    """Draw a standard normal and a left-skewed Gumbel density on *ax*.

    Each density is drawn as a line with a half-transparent filled area
    underneath.  The normal is labelled 'Before' and the Gumbel (loc=2.0)
    'After'.  The abscissa is the free variable ``x``, which is defined
    outside this function.  Returns *ax*.
    """
    labelled = (
        ('Before', scstats.norm()),
        ('After', scstats.gumbel_l(loc=2.0)),
    )
    for tag, frozen in labelled:
        density = frozen.pdf(x)
        ax.plot(x, density)
        ax.fill_between(x, density, alpha=0.5, label=tag)
    ax.set_xlim((-5, 5))
    return ax
Пример #2
    def sample(self, k):
        """Draw ``k`` samples and map them onto Gumbel / Beta marginals.

        Values drawn from ``self.gaussian`` are pushed through the standard
        normal CDF and then through the inverse CDF of a left-skewed Gumbel
        (column 0) and of a Beta(a=10, b=2) (column 1).

        Returns the two transformed columns as a tuple.
        """
        draws = self.gaussian.rvs(k)

        # to uniform: standard-normal CDF of every drawn value
        uniforms = stats.norm().cdf(draws)

        # back to the target marginals through their percent-point functions
        first = stats.gumbel_l().ppf(uniforms[:, 0])
        second = stats.beta(a=10, b=2).ppf(uniforms[:, 1])
        return first, second
Пример #3
def optimize_loc(res_loc, res_scale, load_distro, conf_target, eps):
    """Auxiliary function to be used with the scipy.optimize.bisect function
    to find the location parameter of the resistance distribution that
    matches a required confidence level.

    res_loc, res_scale: location and scale parameters of the resistance
        distribution (a left-skewed Gumbel)
    load_distro: load distribution (frozen scipy.stats distribution)
    conf_target: confidence level target
    eps: bounds the integration domain by the ``eps`` and ``1 - eps``
        quantiles of the load and resistance distributions

    Returns the achieved confidence minus ``conf_target``, i.e. the value
    whose root the bisection looks for.
    """
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    # The domain has to cover the relevant range of *both* distributions.
    lower = min(load_distro.ppf(eps), res_distro.ppf(eps))
    upper = max(load_distro.ppf(1 - eps), res_distro.ppf(1 - eps))
    # retstep=True also returns the grid spacing that pfail needs; 200 points
    # is the grid size used where this file recalculates the domain.
    x, dx = np.linspace(lower, upper, 200, retstep=True)
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    return confidence - conf_target
Пример #4
 def __init__(self, mean, stdev, dtype='normal', weib_loc=0):
     """Build the frozen distribution ``self.dist`` from a mean and a
     standard deviation.

     mean, stdev: mean and standard deviation the distribution should have
     dtype: 'normal', 'gumbel_r', 'gumbel_l' or 'weibull'
     weib_loc: passed to ``weibull`` as its third argument; only used when
         ``dtype == 'weibull'``

     Any other ``dtype`` prints 'Error dtype.' and leaves ``self.dist``
     unset.
     """
     if dtype == 'normal':
         self.dist = ss.norm(loc=mean, scale=stdev)
     elif dtype == 'gumbel_r':
         # Moment matching: std = pi * beta / sqrt(6), mean = mu + gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean - euler_gamma * beta
         self.dist = ss.gumbel_r(loc=mu, scale=beta)
     elif dtype == 'gumbel_l':
         # Mirror image of gumbel_r, hence mean = mu - gamma * beta
         beta = stdev * sqrt(6) / pi
         mu = mean + euler_gamma * beta
         self.dist = ss.gumbel_l(loc=mu, scale=beta)
     elif dtype == 'weibull':
         self.dist = weibull(mean, stdev, weib_loc)
     else:
         # Only unknown dtypes are an error; without this branch the message
         # was printed for every valid 'weibull' request instead.
         print('Error dtype.')
Пример #5
Пример #6
Пример #7
                 (len(bad_sample) - 1) / len(bad_sample), len(bad_sample)))),

## Using Kolmogorov-Smirnov test
## The D statistic is the absolute max distance (supremum) between the CDFs of the two samples.
## The closer this number is to 0 the more likely it is that the two samples were drawn from the
## same distribution.
## The p-value returned by the k-s test has the same interpretation as other p-values. You reject
## the null hypothesis that the two samples were drawn from the same distribution if the p-value
## is less than your significance level. You can find tables online for the conversion of the
## D statistic into a p-value if you are interested in the procedure.
stats, pvalue = ss.kstest(rvs=good_sample,
print('The maximumdistance between CDFs is %.2f.' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' %

stats, pvalue = ss.kstest(rvs=bad_sample,
print('The maximumdistance between CDFs is %.2f.' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' %

## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
Пример #8
    load_loc = 100  # location parameter for the load distribution
    load_scale = 5  # scale parameter for the load distribution
    res_scale = 3.5  # scale parameter for the resistance distribution
    eps = 1e-8  # domain = pdf > eps, for load and resistance

    # frozen load distribution
    load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)
    # finds the location parameter for the resistance distribution that
    # gives the required conf_target
    res_loc = sp.optimize.bisect(optimize_loc,
                                 load_distro.ppf(1 - eps),
                                 args=(res_scale, load_distro, conf_target,
    # frozen resistance distribution
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    # recalculates the domain and the confidence level
    x, dx = np.linspace(min(load_distro.ppf(eps), res_distro.ppf(eps)),
                        max(load_distro.ppf(1 - eps), res_distro.ppf(1 - eps)),
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    # %% plotting
    plt.plot(x, load_distro.pdf(x), label='load pdf')
    plt.plot(x, res_distro.pdf(x), label='resistance pdf')

    print('Confidence %.3f%%' % (100 * confidence))
    pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
Пример #9
def case3(output=True):
    """Cross-validate a Gumbel naive Bayes spam filter over every attribute.

    Loads ``resources/normalized_data.csv``, shuffles its rows and runs
    ``NUMBER_OF_ROUNDS`` train/test rounds.  In each round a left-skewed
    Gumbel density is parameterised per class and per attribute from the
    training slice (sample mean as location, sample standard deviation as
    scale), and every test row is assigned to the class with the larger
    posterior under equiprobable priors.

    output: when true, print the mean, standard deviation and variance of
        each per-round metric.

    NOTE(review): the metric named "precision" divides by the number of
    actual instances of a class and "recall" by the number of predicted
    instances, which is the reverse of the usual definitions.  The names are
    kept because they appear in the printed report.
    """
    # all attribute columns but the last one
    attribute_count = 57

    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # Handing numpy the path (not an open handle) lets it close the file.
    m = np.loadtxt("resources/normalized_data.csv", delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, taking ALL attributes into consideration
        sample_means_word_spam = []
        sample_means_word_ham = []

        sample_variances_word_spam = []
        sample_variances_word_ham = []

        for attr_index in range(attribute_count):
            sample_means_word_spam.append(
                nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(
                nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))

            sample_variances_word_spam.append(
                nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(
                nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = [
            v ** (1 / 2.0) for v in sample_variances_word_spam]
        sample_std_devs_ham = [
            v ** (1 / 2.0) for v in sample_variances_word_ham]

        # One vectorised frozen distribution per class, built once per round
        # rather than once per attribute for every test row.
        model_spam = stats.gumbel_l(np.asarray(sample_means_word_spam),
                                    np.asarray(sample_std_devs_spam))
        model_ham = stats.gumbel_l(np.asarray(sample_means_word_ham),
                                   np.asarray(sample_std_devs_ham))

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # Attribute k of the row is scored with the parameters estimated
            # for attribute k; the likelihood is the product of all the
            # per-attribute conditional densities given the class.
            values = np.asarray(row[:attribute_count], dtype=float)

            # no need to divide by the normalisation term: we only want to
            # know which posterior is the larger one
            posterior_spam = prior_spam * np.prod(model_spam.pdf(values))
            posterior_ham = prior_ham * np.prod(model_ham.pdf(values))

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # The conditionals below avoid divisions by zero in case nothing
        # was found.

        # precision_spam = number of correctly evaluated instances as spam /
        #                  number of spam instances
        precision_spam = correctly_is_spam / is_spam if is_spam != 0 else 0

        # recall_spam = number of correctly evaluated instances as spam /
        #               number of instances evaluated as spam
        recall_spam = (correctly_is_spam / guessed_spam
                       if guessed_spam != 0 else 0)

        # precision_ham = number of correctly evaluated instances as ham /
        #                 number of ham instances
        precision_ham = correctly_is_ham / is_ham if is_ham != 0 else 0

        # recall_ham = number of correctly evaluated instances as ham /
        #              number of instances evaluated as ham
        recall_ham = (correctly_is_ham / guessed_ham
                      if guessed_ham != 0 else 0)

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # summary statistics of each metric over all rounds
    if output:
        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )

        print("\033[1;32m")
        print('=============================================')
        print('=============================================')
        print("\033[00m")
        for position, (title, values) in enumerate(report):
            if position:
                # blank line between two metrics, none after the last one
                print('')
            print('MEAN ' + title + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + title + ': '
                  + str(round(np.std(values), 5)))
            print('VARIANCE OF ' + title + ': '
                  + str(round(np.var(values), 8)))
Пример #10
                #           1.305*np.std(data_tp[data_tp['wd'] == wd][c])
                #           for wd in set(data_tp['wd'])])
                # MLE's
                mx = max([
                    ss.gumbel_r(*[data_tp['wd'] == wd]
                    for wd in set(data_tp['wd'])
                # # moment estimators
                # mx = min([np.mean(data_tp[data_tp['wd'] == wd][c]) +
                #           1.305*np.std(data_tp[data_tp['wd'] == wd][c])
                #           for wd in set(data_tp['wd'])])
                # MLE's
                mx = min([
                    ss.gumbel_l(*[data_tp['wd'] == wd]
                    for wd in set(data_tp['wd'])
        stdev.loc[i] = rw
        i += 1

# adjust and sort index
stdev = stdev.sort_index(by=['Hs', 'Tp'])
stdev = stdev.reset_index()
del stdev['index']

# %%
# this works, but how to apply +/- depending on max/min
stdev3 = (1.305 * data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].std() +
          data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].mean()).max(
Пример #12
# Calculate a few first moments:

mean, var, skew, kurt = gumbel_l.stats(moments='mvsk')

# Display the probability density function (``pdf``):

x = np.linspace(gumbel_l.ppf(0.01), gumbel_l.ppf(0.99), 100)
ax.plot(x, gumbel_l.pdf(x), 'r-', lw=5, alpha=0.6, label='gumbel_l pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = gumbel_l()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = gumbel_l.ppf([0.001, 0.5, 0.999])
np.allclose([0.001, 0.5, 0.999], gumbel_l.cdf(vals))
# True

# Generate random numbers:

r = gumbel_l.rvs(size=1000)

# And compare the histogram:

# ``density=True`` replaces the ``normed`` keyword, which current Matplotlib
# releases no longer accept.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
Пример #13
            print "mean = ", trace.mean()
            for bin in [10,20,50,100]:
                print "maxlike = ", bin_edges[a], bin_edges[a+1], (bin_edges[a]+bin_edges[a+1])/2.0

            if plot_idx==2:
                n, bins, patches = plt.hist(np.array(trace), 50,  normed=1, facecolor='green', alpha=0.75)

                X =
                print X
                dist = sp.gumbel_l(X[0],X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                print y
                plt.plot(x, y,'k--',linewidth=2)

                X =
                print X
                dist = sp.norm(X[0],X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                plt.plot(x, y,'r--',linewidth=2)

                X =
                print X
                dist = sp.genextreme(X[0],X[1],X[2])
Пример #14
Пример #15
for tail in [tailmax, tailmin]:
    title('TEST TITLE')
    gumbel = best_fit(tail)
    loc, scale =
    mygl = gumbel(loc=loc, scale=scale)
    title('bets fit gumbel')
    loc, scale =
    mygl = stats.gumbel_l(loc=loc, scale=scale)
    title('gumbel l')
    loc, scale =
    mygr = stats.gumbel_r(loc=loc, scale=scale)
    title('gumbel r')

#import pandas
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
Пример #16
# %%
# Cell: a standard normal ('before') against a unit-scale normal whose
# location is shifted by Dx ('after').
Dx = 3.0
x = np.linspace(-4, 4, 100)
norm1 = scstats.norm()
norm2 = scstats.norm(loc = Dx)
fig, ax = plt.subplots(1, 1, figsize = (7, 5))
ax.plot(x, norm1.pdf(x), label = 'before')
# The shifted curve is evaluated on the grid moved by the same offset.
ax.plot(x+Dx, norm2.pdf(x+Dx), label = 'after')
# The right-hand limit is widened by Dx as well.
ax.set_xlim((-5, 5+Dx))
ax.set_ylim((-0.01, 1))
# %%
# Cell: a standard normal ('before') against a left-skewed Gumbel located
# at 1.5 ('after'), both evaluated on the same grid.
x = np.linspace(-4, 4, 100)
dist1 = scstats.norm()
dist2 = scstats.gumbel_l(loc = 1.5)
fig, ax = plt.subplots(1, 1, figsize = (7, 5))
ax.plot(x, dist1.pdf(x), label = 'before')
ax.plot(x, dist2.pdf(x), label = 'after')
ax.set_xlim((-5, 5))
ax.set_ylim((-0.01, 1))
# %%
import matplotlib.pyplot as plt
import scipy.stats as scstats

def Supplementary_Figure1A():
    x = np.linspace(-4, 4, 100)
    norm1 = scstats.norm(scale = 0.5)
    norm2 = scstats.norm(scale = 1.0)
Пример #17
for tail in [tailmax, tailmin]:
    title('TEST TITLE')


    gumbel = best_fit(tail)
    loc, scale =
    mygl = gumbel(loc=loc, scale=scale)
    stats.probplot(tail, dist=mygl, plot=plt)
    title('bets fit gumbel')

    loc, scale =
    mygl = stats.gumbel_l(loc=loc, scale=scale)
    stats.probplot(tail, dist=mygl, plot=plt)
    title('gumbel l')

    loc, scale =
    mygr = stats.gumbel_r(loc=loc, scale=scale)
    stats.probplot(tail, dist=mygr, plot=plt)
    title('gumbel r')

#import pandas
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
                hist, bin_edges = np.histogram(trace, bins=bin)
                a = np.argmax(hist)
                print("maxlike = ", bin_edges[a], bin_edges[a + 1],
                      (bin_edges[a] + bin_edges[a + 1]) / 2.0)

            plt.subplot(2, len(things) / 2, plot_idx)
            if plot_idx == 2:
                n, bins, patches = plt.hist(np.array(trace),

                X =
                dist = sp.gumbel_l(X[0], X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                plt.plot(x, y, 'k--', linewidth=2)

                X =
                dist = sp.norm(X[0], X[1])
                x = np.array(bins)
                y = dist.pdf(x)
                plt.plot(x, y, 'r--', linewidth=2)

                X =
                dist = sp.genextreme(X[0], X[1], X[2])
Пример #19
Пример #20
    conf_target = 0.9  # confidence level of non-failure
    load_loc = 100          # location parameter for the load distribution
    load_scale = 5      # scale parameter for the load distribution
    res_scale = 3.5     # scale parameter for the resistance distribution
    eps = 1e-8           # domain = pdf > eps, for load and resistance

    # frozen load distribution
    load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)
    # finds the location parameter for the resistance distribution that
    # gives the required conf_target
    res_loc = sp.optimize.bisect(optimize_loc, load_loc,
                                 args=(res_scale, load_distro,
                                       conf_target, eps))
    # frozen resistance distribution
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    # recalculates the domain and the confidence level
    x, dx = np.linspace(min(load_distro.ppf(eps), res_distro.ppf(eps)),
                        max(load_distro.ppf(1-eps), res_distro.ppf(1-eps)),
                        200, retstep=True)
    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    # %% plotting
    plt.plot(x, load_distro.pdf(x), label='load pdf')
    plt.plot(x, res_distro.pdf(x), label='resistance pdf')

    print('Confidence %.3f%%' % (100*confidence))
    pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
    print('Dbl check %.3f%%' % (100*(1-pfailure)))
Пример #21
 def get_y(params, x, tail):
     """Return ``-ln(-ln p)`` of ``x`` for a Gumbel distribution.

     params: positional parameters forwarded to the scipy.stats
         distribution (location, then scale)
     x: value or array of values to transform
     tail: 'upper' uses the CDF of ``gumbel_r`` as ``p``; any other value
         uses the survival function of ``gumbel_l``
     """
     if tail == 'upper':
         return -np.log(-ss.gumbel_r(*params).logcdf(x))
     # Without this being a separate branch the lower-tail result was
     # unreachable and the function returned None.
     return -np.log(-ss.gumbel_l(*params).logsf(x))
Пример #22
Пример #23
def all_dists():
    # Distribution parameters were taken from the examples in the
    # official scipy.stats documentation
    # Total - 89
    return {
        stats.alpha(a=3.57, loc=0.0, scale=1.0),
        stats.anglit(loc=0.0, scale=1.0),
        stats.arcsine(loc=0.0, scale=1.0),
        stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        stats.bradford(c=0.299, loc=0.0, scale=1.0),
        stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        stats.cauchy(loc=0.0, scale=1.0),
        stats.chi(df=78, loc=0.0, scale=1.0),
        stats.chi2(df=55, loc=0.0, scale=1.0),
        stats.cosine(loc=0.0, scale=1.0),
        stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        stats.erlang(a=2, loc=0.0, scale=1.0),
        stats.expon(loc=0.0, scale=1.0),
        stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        stats.fisk(c=3.09, loc=0.0, scale=1.0),
        stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        stats.gamma(a=1.99, loc=0.0, scale=1.0),
        stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        stats.gilbrat(loc=0.0, scale=1.0),
        stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        stats.gumbel_r(loc=0.0, scale=1.0),
        stats.gumbel_l(loc=0.0, scale=1.0),
        stats.halfcauchy(loc=0.0, scale=1.0),
        stats.halflogistic(loc=0.0, scale=1.0),
        stats.halfnorm(loc=0.0, scale=1.0),
        stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        stats.hypsecant(loc=0.0, scale=1.0),
        stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        stats.ksone(n=1e03, loc=0.0, scale=1.0),
        stats.kstwobign(loc=0.0, scale=1.0),
        stats.laplace(loc=0.0, scale=1.0),
        stats.levy(loc=0.0, scale=1.0),
        stats.levy_l(loc=0.0, scale=1.0),
        stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        stats.logistic(loc=0.0, scale=1.0),
        stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        stats.lomax(c=1.88, loc=0.0, scale=1.0),
        stats.maxwell(loc=0.0, scale=1.0),
        stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        stats.norm(loc=0.0, scale=1.0),
        stats.pareto(b=2.62, loc=0.0, scale=1.0),
        stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        stats.rdist(c=0.9, loc=0.0, scale=1.0),
        stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        stats.rayleigh(loc=0.0, scale=1.0),
        stats.rice(b=0.775, loc=0.0, scale=1.0),
        stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        stats.semicircular(loc=0.0, scale=1.0),
        stats.t(df=2.74, loc=0.0, scale=1.0),
        stats.triang(c=0.158, loc=0.0, scale=1.0),
        stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        stats.uniform(loc=0.0, scale=1.0),
        stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        stats.wald(loc=0.0, scale=1.0),
        stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
Пример #24
         -log(-log(linspace(1/len(bad_sample), (len(bad_sample)-1)/len(bad_sample),
         len(bad_sample)))), '*')

## Using Kolmogorov-Smirnov test
## The D statistic is the absolute max distance (supremum) between the CDFs of the two samples.
## The closer this number is to 0 the more likely it is that the two samples were drawn from the
## same distribution.
## The p-value returned by the k-s test has the same interpretation as other p-values. You reject
## the null hypothesis that the two samples were drawn from the same distribution if the p-value
## is less than your significance level. You can find tables online for the conversion of the
## D statistic into a p-value if you are interested in the procedure.
stats, pvalue = ss.kstest(rvs=good_sample, cdf=ss.gumbel_l(*
print('The maximumdistance between CDFs is %.2f.' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' % pvalue)

stats, pvalue = ss.kstest(rvs=bad_sample, cdf=ss.gumbel_l(*
print('The maximumdistance between CDFs is %.2f.' % stats, end='')
print('The sample is Gumbel distributed for a significance level of %.2f' % pvalue)

## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
##     [0.456, 0.612, 0.728, 0.843, 0.998]
##     [25.0, 10.0, 5.0, 2.5, 1.0]
## I.e, for a sample to be assumed Gumbel distributed with a significant level of 25%,
Пример #25
Ahora usaremos lo que aprendimos arriba para "uniformizar" las marginales. 
Este diagrama conjunto suele ser cómo se visualizan las cópulas.

norm = stats.norm()
x_unif = norm.cdf(x)
h = sns.jointplot(x_unif[:, 0], x_unif[:, 1], kind='hex', stat_func=None)
h.set_axis_labels('Y1', 'Y2', fontsize=16)

Ahora solo transformamos los marginales nuevamente a lo que queremos 
(Gumbel y Beta):

m1 = stats.gumbel_l()
m2 = stats.beta(a=10, b=2)

x1_trans = m1.ppf(x_unif[:, 0])
x2_trans = m2.ppf(x_unif[:, 1])

h = sns.jointplot(x1_trans,
                  xlim=(-6, 2),
                  ylim=(.6, 1.0),
h.set_axis_labels('Maximum river level', 'Probablity of flooding', fontsize=16)

def case2(indexes=CASE_2_ATTRIBUTE_INDEXES, output=True):
    """Cross-validate a Gumbel naive Bayes spam filter on chosen attributes.

    Loads ``resources/normalized_data.csv``, shuffles its rows and runs
    ``NUMBER_OF_ROUNDS`` train/test rounds.  In each round a left-skewed
    Gumbel density is parameterised per class and per selected attribute
    from the training slice (sample mean as location, sample standard
    deviation as scale), and every test row is assigned to the class with
    the larger posterior under equiprobable priors.

    indexes: column indexes of the attributes to use; any length is accepted
    output: when true, print the mean, standard deviation and variance of
        each per-round metric.

    NOTE(review): the metric named "precision" divides by the number of
    actual instances of a class and "recall" by the number of predicted
    instances, which is the reverse of the usual definitions.  The names are
    kept because they appear in the printed report.
    """
    accuracy_in_each_turn = []

    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []

    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # Handing numpy the path (not an open handle) lets it close the file.
    m = np.loadtxt("resources/normalized_data.csv", delimiter=',')

    shuffled = np.random.permutation(m)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation over the selected attributes
        sample_means_word_spam = []
        sample_means_word_ham = []

        sample_variances_word_spam = []
        sample_variances_word_ham = []

        for attr_index in indexes:
            sample_means_word_spam.append(
                nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(
                nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))

            sample_variances_word_spam.append(
                nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(
                nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = [
            v ** (1 / 2.0) for v in sample_variances_word_spam]
        sample_std_devs_ham = [
            v ** (1 / 2.0) for v in sample_variances_word_ham]

        # One vectorised frozen distribution per class, built once per round
        # rather than once per attribute for every test row.
        model_spam = stats.gumbel_l(np.asarray(sample_means_word_spam),
                                    np.asarray(sample_std_devs_spam))
        model_ham = stats.gumbel_l(np.asarray(sample_means_word_ham),
                                   np.asarray(sample_std_devs_ham))

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # Values of the selected attributes, in the same order as the
            # estimated parameters.  Every entry of ``indexes`` is used, so
            # the selection is no longer tied to a length of exactly 10.
            values = np.asarray([row[column] for column in indexes],
                                dtype=float)

            # no need to divide by the normalisation term: we only want to
            # know which posterior is the larger one
            posterior_spam = prior_spam * np.prod(model_spam.pdf(values))
            posterior_ham = prior_ham * np.prod(model_ham.pdf(values))

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1

                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # The conditionals below avoid divisions by zero in case nothing
        # was found.

        # precision_spam = number of correctly evaluated instances as spam /
        #                  number of spam instances
        precision_spam = correctly_is_spam / is_spam if is_spam != 0 else 0

        # recall_spam = number of correctly evaluated instances as spam /
        #               number of instances evaluated as spam
        recall_spam = (correctly_is_spam / guessed_spam
                       if guessed_spam != 0 else 0)

        # precision_ham = number of correctly evaluated instances as ham /
        #                 number of ham instances
        precision_ham = correctly_is_ham / is_ham if is_ham != 0 else 0

        # recall_ham = number of correctly evaluated instances as ham /
        #              number of instances evaluated as ham
        recall_ham = (correctly_is_ham / guessed_ham
                      if guessed_ham != 0 else 0)

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # summary statistics of each metric over all rounds
    if output:
        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )

        print("\033[1;32m")
        print('=============================================')
        print('=============================================')
        print("\033[00m")
        for position, (title, values) in enumerate(report):
            if position:
                # blank line between two metrics, none after the last one
                print('')
            print('MEAN ' + title + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + title + ': '
                  + str(round(np.std(values), 5)))
            print('VARIANCE OF ' + title + ': '
                  + str(round(np.var(values), 8)))
Пример #27
 def get_y(params, x, tail):
     """Return ``-ln(-ln p)`` of ``x`` for a Gumbel distribution.

     params: positional parameters forwarded to the scipy.stats
         distribution (location, then scale)
     x: value or array of values to transform
     tail: 'upper' uses the CDF of ``gumbel_r`` as ``p``; any other value
         uses the survival function of ``gumbel_l``
     """
     if tail == 'upper':
         return -np.log(-ss.gumbel_r(*params).logcdf(x))
     # Without this being a separate branch the lower-tail result was
     # unreachable and the function returned None.
     return -np.log(-ss.gumbel_l(*params).logsf(x))