Example #1
def the_normal_cdf(samples_std1, samples_std3, samples_std10):
    x_std1, y_std1 = ecdf(samples_std1)
    x_std3, y_std3 = ecdf(samples_std3)
    x_std10, y_std10 = ecdf(samples_std10)

    _ = plt.plot(x_std1, y_std1, marker='.', linestyle='none')
    _ = plt.plot(x_std3, y_std3, marker='.', linestyle='none')
    _ = plt.plot(x_std10, y_std10, marker='.', linestyle='none')

    _ = plt.legend(('std = 1', 'std = 3', 'std = 10'), loc='lower right')
    plt.show()
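The plotting examples in this listing assume numpy imported as np, matplotlib.pyplot imported as plt, and an ecdf(data) helper that is never shown. A minimal sketch of such a helper (an editorial assumption, not code from any of the original projects):

import numpy as np

def ecdf(data):
    """Return x, y values for plotting an empirical CDF (assumed helper)."""
    x = np.sort(data)                      # sorted observations
    y = np.arange(1, len(x) + 1) / len(x)  # cumulative fraction at each value
    return x, y

With a helper like this, plt.plot(x, y, marker='.', linestyle='none') produces the dot-style ECDFs used throughout.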
Example #2
def visualizing_bootstrap_samples():
    for _ in range(50):
        bs_sample = np.random.choice(rainfall, size=len(rainfall))
        x, y = ecdf(bs_sample)
        _ = plt.plot(x, y, marker='.', linestyle='none',
                     color='gray', alpha=0.1)

    x, y = ecdf(rainfall)
    _ = plt.plot(x, y, marker='.')
    plt.margins(0.02)
    _ = plt.xlabel('yearly rainfall (mm)')
    _ = plt.ylabel('ECDF')
    plt.show()
Example #3
def do_the_data_follow_our_story():
    x, y = ecdf(nohitter_times)
    np.random.seed(42)
    tau = np.mean(nohitter_times)
    inter_nohitter_time = np.random.exponential(tau, 100000)
    x_theor, y_theor = ecdf(inter_nohitter_time)

    plt.plot(x_theor, y_theor)
    plt.plot(x, y, marker='.', linestyle='none')
    plt.margins(0.02)
    plt.xlabel('Games between no-hitters')
    plt.ylabel('CDF')
    plt.show()
Example #4
def are_belmont_stakes_normally_distributed(belmont_no_outliers):
    mu = np.mean(belmont_no_outliers)
    sigma = np.std(belmont_no_outliers)
    samples = np.random.normal(mu, sigma, size=10000)

    x_theor, y_theor = ecdf(samples)
    x, y = ecdf(belmont_no_outliers)

    _ = plt.plot(x_theor, y_theor)
    _ = plt.plot(x, y, marker='.', linestyle='none')
    _ = plt.xlabel('Belmont winning time (sec.)')
    _ = plt.ylabel('CDF')
    plt.show()
Example #5
	def fitmin(pts, mmefit):
		x = np.array(pts)

		ec = util.ecdf(x)
		xi = ec[:,0]
		ai = ec[:,1]
		
		if mmefit == True:
			(imu, isig) = Lognormal.mmefit(x)
		else:
			(imu, isig) = Lognormal.mlefit(x)

		sqrt2 = math.sqrt(2)
		xi2 = xi**2.0
		ki = Lognormal.__ki(ai)

		ivs = [imu, isig]
		ovs = (ki, xi2, sqrt2)

		(fvals, infodict, ier, mesg) = opt.fsolve(Lognormal.__solve_fitmin, ivs, ovs, None, 1, 0)
	
		f_mu = fvals[0]
		f_sig = fvals[1]

		if ier != 1:
			raise LognormalConvergenceError(mesg, (f_mu, f_sig))
		
		return (f_mu, f_sig)
Example #6
    def fitmin(pts, mmefit):
        x = np.array(pts)

        ec = util.ecdf(x)
        xi = ec[:, 0]
        ai = ec[:, 1]

        if mmefit == True:
            (imu, isig) = Lognormal.mmefit(x)
        else:
            (imu, isig) = Lognormal.mlefit(x)

        sqrt2 = math.sqrt(2)
        xi2 = xi**2.0
        ki = Lognormal.__ki(ai)

        ivs = [imu, isig]
        ovs = (ki, xi2, sqrt2)

        (fvals, infodict, ier, mesg) = opt.fsolve(Lognormal.__solve_fitmin,
                                                  ivs, ovs, None, 1, 0)

        f_mu = fvals[0]
        f_sig = fvals[1]

        if ier != 1:
            raise LognormalConvergenceError(mesg, (f_mu, f_sig))

        return (f_mu, f_sig)
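The starting values above come from Lognormal.mmefit and Lognormal.mlefit, which are not shown in this listing. A hedged sketch of the standard estimators they presumably implement (an assumption, not the project's actual code):

import math
import numpy as np

def lognormal_mlefit(x):
    """MLE sketch: a lognormal sample is normal on the log scale."""
    logx = np.log(np.asarray(x, dtype=float))
    return logx.mean(), logx.std()

def lognormal_mmefit(x):
    """Method-of-moments sketch: match the sample mean m and variance s2
    to the lognormal moments E[X] and Var[X]."""
    m, s2 = np.mean(x), np.var(x)
    sigma2 = math.log(1.0 + s2 / m ** 2)
    return math.log(m) - sigma2 / 2.0, math.sqrt(sigma2)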
Example #7
def sampling_out_of_binomial_distribution():
    n_defaults = np.random.binomial(100, 0.05, size=10000)
    x, y = ecdf(n_defaults)

    _ = plt.plot(x, y, marker='.', linestyle='none')
    _ = plt.xlabel('number of defaults out of 100 loans')
    _ = plt.ylabel('CDF')
    plt.show()
Example #8
def eda_of_beak_depths():
    # Compute ECDFs
    x_1975, y_1975 = ecdf(bd_1975)
    x_2012, y_2012 = ecdf(bd_2012)

    # Plot the ECDFs
    _ = plt.plot(x_1975, y_1975, marker='.', linestyle='none')
    _ = plt.plot(x_2012, y_2012, marker='.', linestyle='none')

    # Set margins
    plt.margins(0.02)

    # Add axis labels and legend
    _ = plt.xlabel('beak depth (mm)')
    _ = plt.ylabel('ECDF')
    _ = plt.legend(('1975', '2012'), loc='lower right')

    # Show the plot
    plt.show()
Example #9
def do_neonicotinoid_insecticides_have_unintended_consequences():
    # Compute x,y values for ECDFs
    x_control, y_control = ecdf(control)
    x_treated, y_treated = ecdf(treated)

    # Plot the ECDFs
    plt.plot(x_control, y_control, marker='.', linestyle='none')
    plt.plot(x_treated, y_treated, marker='.', linestyle='none')

    # Set the margins
    plt.margins(0.02)

    # Add a legend
    plt.legend(('control', 'treated'), loc='lower right')

    # Label axes and show plot
    plt.xlabel('millions of alive sperm per mL')
    plt.ylabel('ECDF')
    plt.show()
Example #10
def visualizing_permutation_sampling():
    for i in range(50):
        perm_sample_1, perm_sample_2 = permutation_sample(
            rain_june, rain_november)
        x_1, y_1 = ecdf(perm_sample_1)
        x_2, y_2 = ecdf(perm_sample_2)
        # Plot ECDFs of permutation sample
        _ = plt.plot(x_1, y_1, marker='.', linestyle='none',
                     color='red', alpha=0.02)
        _ = plt.plot(x_2, y_2, marker='.', linestyle='none',
                     color='blue', alpha=0.02)

    # Create and plot ECDFs from original data
    x_1, y_1 = ecdf(rain_june)
    x_2, y_2 = ecdf(rain_november)
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')
    plt.margins(0.02)
    _ = plt.xlabel('monthly rainfall (mm)')
    _ = plt.ylabel('ECDF')
    plt.show()
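permutation_sample is another helper this exercise relies on but does not define here. A plausible sketch (assumed, not taken from the original source):

import numpy as np

def permutation_sample(data1, data2):
    """Pool both data sets, shuffle the pool, and split it back into
    two samples of the original sizes (assumed helper)."""
    pooled = np.random.permutation(np.concatenate((data1, data2)))
    return pooled[:len(data1)], pooled[len(data1):]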
Example #11
	def fitmin(points, **kwargs):
		"""
		Minimization of the FIT metric using the inverse CDF
		Usage: ModLav.fitmin(points, [beta=], [c=], [d=])

		Input
		------
		points: Points to run the MLE fit on
		**kwargs: Initial values for the MLE fit, typically obtained from the
		__initial_values method:
		beta = initial beta value
		c = initial c value
		d = initial d value

		Output
		------
		Return value: Tuple (beta, c, d)
		"""

		pts = np.array(points)
		c = util.ecdf(pts)
		iv = ModLav.__initial_values(pts)

		i_beta = iv["beta"]
		i_c = iv["c"]
		i_d = iv["d"]
		tol = 1e-10
		
		if "beta" in kwargs:
			i_beta = kwargs["beta"]
		if "c" in kwargs:
			i_c = kwargs["c"]
		if "d" in kwargs:
			i_d = kwargs["d"]
		if "tol" in kwargs:
			tol = kwargs["tol"]

		ivs = [math.log(i_beta), math.log(i_c), math.log(i_d)]
		oval = (c,)  # single-element tuple of extra args for __solve_fitmin
		
		(fvals, infodict, ier, mesg) = opt.fsolve(ModLav.__solve_fitmin, ivs, oval, None, 1, 0, tol,2000)

		f_beta = math.exp(fvals[0])
		f_c = math.exp(fvals[1])
		f_d = math.exp(fvals[2])
		

		if ier != 1:
			prms = {"beta": f_beta, "c": f_c, "d": f_d}
			raise ModLavConvergenceError(mesg, (f_beta, f_c, f_d))
		
		return (f_beta, f_c, f_d)
Example #12
def distribution_of_no_hitters_and_cycles():
    waiting_times = successive_poisson(764, 715, 100000)

    _ = plt.hist(waiting_times, bins=100, density=True, histtype='step')
    _ = plt.xlabel('waiting time')
    _ = plt.ylabel('probability')
    plt.show()

    x, y = ecdf(waiting_times)
    _ = plt.plot(x, y, marker='.', linestyle='none')
    _ = plt.xlabel('waiting time')
    _ = plt.ylabel('CDF')
    plt.show()
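successive_poisson is not defined in the listing; it presumably draws the total waiting time for two successive Poisson processes, roughly like this sketch (an assumption):

import numpy as np

def successive_poisson(tau1, tau2, size=1):
    """Waiting time for the first event plus the second, each
    exponentially distributed with its own mean (assumed helper)."""
    t1 = np.random.exponential(tau1, size)
    t2 = np.random.exponential(tau2, size)
    return t1 + t2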
Example #13
def how_is_this_parameter_optimal():
    x, y = ecdf(nohitter_times)
    np.random.seed(42)
    tau = np.mean(nohitter_times)
    inter_nohitter_time = np.random.exponential(tau, 100000)
    x_theor, y_theor = ecdf(inter_nohitter_time)

    plt.plot(x_theor, y_theor)
    plt.plot(x, y, marker='.', linestyle='none')
    plt.margins(0.02)
    plt.xlabel('Games between no-hitters')
    plt.ylabel('CDF')

    samples_half = np.random.exponential(tau/2, 10000)
    samples_double = np.random.exponential(tau*2, 10000)
    x_half, y_half = ecdf(samples_half)
    x_double, y_double = ecdf(samples_double)

    _ = plt.plot(x_half, y_half)
    _ = plt.plot(x_double, y_double)
    _ = plt.legend(['theory', 'empirical', 'tau/2',
                   'tau*2'], loc='lower right')
    plt.show()
Example #14
def will_the_bank_fail():
    np.random.seed()
    n_defaults = np.empty(1000)
    for i in range(1000):
        n_defaults[i] = perform_bernoulli_trials(100, 0.05)

    x, y = ecdf(n_defaults)

    _ = plt.plot(x, y, marker='.', linestyle='none')
    _ = plt.xlabel('number of defaults')
    _ = plt.ylabel('ECDF')
    plt.show()

    n_lose_money = np.sum(n_defaults >= 10)
    print('Number of 100-loan simulations with 10 or more defaults',
          n_lose_money)
    print('Probability of losing money =', n_lose_money / len(n_defaults))
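perform_bernoulli_trials(n, p) is also assumed rather than shown; it presumably counts successes in n Bernoulli trials with success probability p, along these lines (a sketch, not the original helper):

import numpy as np

def perform_bernoulli_trials(n, p):
    """Count how many of n independent trials succeed with probability p
    (assumed helper)."""
    n_success = 0
    for _ in range(n):
        if np.random.random() < p:
            n_success += 1
    return n_success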
Example #15
def optfit(x, lo, hi, n, **kwargs):
	"""
	Optimal ModLav fit, found by searching for the best xmax.
	Input:
	x: Set of points
	lo: Low xmax value
	hi: High xmax value [note: lo <= max(x) <= hi]
	n: Number of searches.
	**kwargs:
	mlefit: True - use mlefit, False - use mmefit. True by default
	mt: True | False. Use mirror transform. False by default

	Output:
	Dict: {"fit": (ModLav object, xmax, FIT metric), "ks": (ModLav, xmax, ks)}
	"""

	pts = util.gen_points(lo, hi, n)
	fits_fm = dict()
	fits_ks = dict()
	rval = dict()
	x.sort()
	c = util.ecdf(x)

	mlefit = True
	if "mlefit" in kwargs:
		mlefit = kwargs["mlefit"]
	
	vmt = False
	if "mt" in kwargs:
		vmt = kwargs["mt"]

	for xmax in pts:
		try:
			if mlefit == True:
				m = ModLav.fromFit(x, xmax=xmax, fit="mlefit",mt=vmt)
			else:
				m = ModLav.fromFit(x, xmax=xmax, fit="mmefit",mt=vmt)
		except ModLavConvergenceError as mlce:
			print(mlce)
			continue
		except BaseException as err:
			print(str(err))
			continue
Example #16
def comparing_percentiles_to_ECDF(versicolor_petal_length):
    percentiles = np.array([2.5, 25, 50, 75, 97.5])
    ptiles_vers = np.percentile(versicolor_petal_length, percentiles)
    print(ptiles_vers)

    x_vers, y_vers = ecdf(versicolor_petal_length)
    _ = plt.plot(x_vers, y_vers, '.')
    _ = plt.xlabel('petal length (cm)')
    _ = plt.ylabel('ECDF')

    # Overlay percentiles as red diamonds.
    _ = plt.plot(ptiles_vers,
                 percentiles / 100,
                 marker='D',
                 color='red',
                 linestyle='none')

    # Show the plot
    plt.show()
Example #17
	def ksmetric(self, **kwargs):
		"""
		Return the Kolmogorov-Smirnov metric for the lognormal fit
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]

		Output:
			ks metric
		"""
		c = None
		if "cdf" in kwargs:
			c = kwargs["cdf"]
		else:
			p = kwargs["points"]
			p.sort()
			c = util.ecdf(p, issorted=True)
		
		y = self.cdf(c[:,0])
		return util.kstest(c[:,1],c[:,2],y)
Example #18
    def ksmetric(self, **kwargs):
        """
		Return the Kolmogorov-Smirnov metric for the lognormal fit
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]

		Output:
			ks metric
		"""
        c = None
        if "cdf" in kwargs:
            c = kwargs["cdf"]
        else:
            p = kwargs["points"]
            p.sort()
            c = util.ecdf(p, issorted=True)

        y = self.cdf(c[:, 0])
        return util.kstest(c[:, 1], c[:, 2], y)
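In these methods util.ecdf appears to return a three-column array (the sorted values x, the lower step (i-1)/n, and the upper step i/n), and util.kstest the two-sided Kolmogorov-Smirnov distance between the fitted CDF and that empirical step function. Neither helper is shown; a sketch under those assumptions:

import numpy as np

def kstest(ecdf_lo, ecdf_hi, fitted_cdf):
    """KS statistic sketch: the largest vertical gap between the fitted CDF
    and the empirical step function, checked at both edges of each step."""
    return max(np.max(fitted_cdf - ecdf_lo), np.max(ecdf_hi - fitted_cdf))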
Example #19
	def fitmetric(self, **kwargs):
		"""
		Return the FIT metric for ModLav
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]
		
		Output:
			Fit metric
		"""
		c = None
		if "cdf" in kwargs:
			c = kwargs["cdf"]
		else:
			p = kwargs["points"]
			p.sort()
			c = util.ecdf(p, issorted=True)
		
		xi = c[:,0]
		x_hat_i = self.cdf_inv(c[:,1])
		return util.fitmetric(xi, x_hat_i, c[:,1])
Example #20
    def ksmetric(self, **kwargs):
        """
		Return the KS metric for the truncated Pareto fit
		Input:
		**kwargs:
			points = [set of points]
			-or-
			cdf = [Precomputed cdf]

		Output:
			KS metric
		"""

        c = None
        if "cdf" in kwargs:
            c = kwargs["cdf"]
        else:
            p = kwargs["points"]
            p.sort()
            c = util.ecdf(p, issorted=True)

        y = self.cdf(c[:, 0])
        return util.kstest(c[:, 1], c[:, 2], y)
Example #21
	def ksmetric(self, **kwargs):
		"""
		Return the KS metric for the truncated Pareto fit
		Input:
		**kwargs:
			points = [set of points]
			-or-
			cdf = [Precomputed cdf]

		Output:
			KS metric
		"""

		c = None
		if "cdf" in kwargs:
			c = kwargs["cdf"]
		else:
			p = kwargs["points"]
			p.sort()
			c = util.ecdf(p, issorted=True)
		
		y = self.cdf(c[:,0])
		return util.kstest(c[:,1],c[:,2],y)
Example #22
	def difference(self, **kwargs):
		"""
		Return the difference metric for ModLav
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]
		
		Output:
			Difference metric. The closer the difference is to 0, the better the fit.
		"""
		c = None
		if "cdf" in kwargs:
			c = kwargs["cdf"]
		else:
			p = kwargs["points"]
			p.sort()
			c = util.ecdf(p, issorted=True)
		
		xi = c[:,0]
		x_hat_i = self.cdf_inv(c[:,1])
		return 1 - util.chlebus_divgi_sim_fitmetric(xi, x_hat_i, c[:,1])
Example #23
    def difference(self, **kwargs):
        """
		Return the difference metric for lognormal
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]
		
		Output:
			Difference metric
		"""

        c = None
        if "cdf" in kwargs:
            c = kwargs["cdf"]
        else:
            p = kwargs["points"]
            p.sort()
            c = util.ecdf(p, issorted=True)

        xi = c[:, 0]
        x_hat_i = self.cdf_inv(c[:, 1])
        return 1 - util.chlebus_divgi_sim_fitmetric(xi, x_hat_i, c[:, 1])
Example #24
    def fitmetric(self, **kwargs):
        """
		Return the FIT metric for truncated pareto
		Input:
		**kwargs:
			points = [set of points]
			-or-
			cdf = [Precomputed cdf]

		Output:
			FIT metric
		"""

        c = None
        if "cdf" in kwargs:
            c = kwargs["cdf"]
        else:
            p = kwargs["points"]
            p.sort()
            c = util.ecdf(p, issorted=True)

        xi = c[:, 0]
        x_hat_i = self.cdf_inv(c[:, 1])
        return util.fitmetric(xi, x_hat_i, c[:, 1])
Example #25
    def fitmetric(self, **kwargs):
        """
		Return the fit metric for lognormal
		Input:
		**kwargs:
			points = [set of points to compute the cdf]
			-or-
			cdf = [Already computed cdf]
		
		Output:
			FIT metric
		"""

        c = None
        if "cdf" in kwargs:
            c = kwargs["cdf"]
        else:
            p = kwargs["points"]
            p.sort()
            c = util.ecdf(p, issorted=True)

        xi = c[:, 0]
        x_hat_i = self.cdf_inv(c[:, 1])
        return util.fitmetric(xi, x_hat_i, c[:, 1])
Example #26
                data=postos_por_ano,
                hue='estado',
                ax=ax)
ax.legend(sorted(postos_por_ano.estado.unique().tolist()),
          loc='center left',
          bbox_to_anchor=(1, 0.5),
          prop={'size': 18})
plt.title('Número de postos pesquisados anualmente por Estado', fontsize=22)
plt.show()

fig.savefig('imagem.png')  # eps, pdf, pgf, png, ps, raw, rgba, svg, svgz

# Average price
df_novo.preco_med_rev.describe()

util.ecdf(df_novo, 'preco_med_rev')

# Two boxplots with different scales
fig, ax = plt.subplots(nrows=1,
                       ncols=2,
                       figsize=(18, 6.5),
                       gridspec_kw={
                           "width_ratios": [5, 1],
                           "wspace": 0
                       })

# Axis for products with similar average prices
sns.boxplot(x="produto",
            y="preco_med_rev",
            data=df_novo[df_novo.produto != "GLP"],
            order=["ETANOL", "GASOLINA", "GNV", "DIESEL", "DIESEL S10"],
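Here util.ecdf is called with a DataFrame and a column name, so in this notebook it is presumably a small plotting wrapper rather than the (x, y) helper used earlier. A hedged sketch of such a wrapper, with the signature inferred from the call above:

import numpy as np
import matplotlib.pyplot as plt

def ecdf(df, column):
    """Plot the empirical CDF of a single DataFrame column (assumed wrapper)."""
    vals = np.sort(df[column].dropna().to_numpy())
    y = np.arange(1, len(vals) + 1) / len(vals)
    plt.plot(vals, y, marker='.', linestyle='none')
    plt.xlabel(column)
    plt.ylabel('ECDF')
    plt.show()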
Example #27
def draw_graph(graph, prefix):
    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph)
    edges, weights = zip(*nx.get_edge_attributes(graph, 'weight').items())
    nx.draw(graph,
            pos,
            node_color='k',
            node_size=5,
            edgelist=edges,
            edge_color=weights,
            width=1.0,
            edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_graph.png")

    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph)
    edges, weights = zip(*nx.get_edge_attributes(graph, 'weight').items())
    nx.draw_spring(graph,
                   node_color='k',
                   node_size=5,
                   edgelist=edges,
                   edge_color=weights,
                   width=1.0,
                   edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_spring_graph.png")

    plt.figure(figsize=(12, 8))
    plt.hist(weights, bins=200)
    plt.xlabel('Weights')
    plt.yscale('log')
    plt.savefig(prefix + "_weights_hist.png")

    plt.figure(figsize=(12, 8))
    (x, y) = util.ecdf(weights)
    plt.scatter(x=x, y=y)
    plt.ylabel('percentage')
    plt.xlabel('edge weights')
    plt.savefig(prefix + '_weights_cdf.png')
    plt.show()

    graph_filtered = graph
    edge_weights = nx.get_edge_attributes(graph_filtered, 'weight')
    # Only keep edges with weight at least 2
    graph_filtered.remove_edges_from(
        (e for e, w in edge_weights.items() if w < 2))
    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph_filtered)
    edges, weights = zip(
        *nx.get_edge_attributes(graph_filtered, 'weight').items())
    nx.draw(graph_filtered,
            pos,
            node_color='k',
            node_size=5,
            edgelist=edges,
            edge_color=weights,
            width=1.0,
            edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_filtered_w2_graph.png")

    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph_filtered)
    edges, weights = zip(
        *nx.get_edge_attributes(graph_filtered, 'weight').items())
    nx.draw_circular(graph_filtered,
                     node_color='k',
                     node_size=5,
                     edgelist=edges,
                     edge_color=weights,
                     width=1.0,
                     edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_filtered_w2_graph_circular.png")
    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph_filtered)
    edges, weights = zip(
        *nx.get_edge_attributes(graph_filtered, 'weight').items())
    nx.draw_spectral(graph_filtered,
                     node_color='k',
                     node_size=5,
                     edgelist=edges,
                     edge_color=weights,
                     width=1.0,
                     edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_filtered_w2_graph_spectral.png")
    plt.figure(figsize=(12, 8))
    pos = nx.random_layout(graph_filtered)
    edges, weights = zip(
        *nx.get_edge_attributes(graph_filtered, 'weight').items())
    nx.draw_spring(graph_filtered,
                   node_color='k',
                   node_size=5,
                   edgelist=edges,
                   edge_color=weights,
                   width=1.0,
                   edge_cmap=plt.cm.Blues)
    plt.savefig(prefix + "_filtered_w2_graph_spring.png")
Example #28
 def test_return_length_of_x(self):
     a = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
     x, y = util.ecdf(a)
     self.assertEqual(len(y), len(a))
Example #29
 def test_return_10_y_values(self):
     a = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
     x, y = util.ecdf(a)
     assert_array_equal(
         y, np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))
Example #30
 def test_return_3_y_values(self):
     a = [1, 1, 2]
     x, y = util.ecdf(a)
     expected_y = np.array([0.333, 0.666, 0.999])
     for i in range(len(a)):
         self.assertAlmostEqual(y[i], expected_y[i], places=2)
Example #31
 def test_return_input_as_x(self):
     a = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
     x, y = util.ecdf(a)
     assert_array_equal(x, np.array(a))
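The test methods in Examples #28-#31 presuppose a unittest scaffold and the numpy testing helpers, roughly as follows (module and class names are assumptions):

import unittest
import numpy as np
from numpy.testing import assert_array_equal

import util  # module under test, expected to provide util.ecdf

class TestEcdf(unittest.TestCase):
    def test_return_length_of_x(self):
        a = [1, 1, 2, 2, 3, 3, 7, 8, 9, 10]
        x, y = util.ecdf(a)
        self.assertEqual(len(y), len(a))

    # ... the remaining test_return_* methods from Examples #29-#31 ...

if __name__ == '__main__':
    unittest.main()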