def threshold_n_binom(params, p_value, thresh_range=None):
    """
    Determine a p-value threshold for a composite negative binomial
    and lognormal distribution based only on the value of the negative
    binomial.

    :param tuple params: Tuple of the five parameters of the combined
        negative binomial / lognormal distribution
        ``(bin_n, bin_p, nm_delta, nm_scale, size)``
        (see :func:`~n_binom_plus_log_normal`). Only ``bin_n`` and
        ``bin_p`` are used here.
    :param float p_value: P-value cut-off.
    :param list thresh_range: Possible values to consider as a cut
        off (default is 0-499, i.e. ``list(range(500))``).
    :returns: The entry of ``thresh_range`` selected by bisecting the
        reversed, normalised negative binomial distribution at
        ``p_value``. If ``p_value`` is larger than every entry of that
        distribution, the lowest candidate (``thresh_range[0]``) is
        returned instead of raising :class:`IndexError`.
    """
    if thresh_range is None:
        thresh_range = list(range(500))

    # Unpack all five values so a malformed ``params`` still fails loudly;
    # the lognormal parameters and mixing ratio are not needed here.
    bin_n, bin_p, _nm_delta, _nm_scale, _size = params

    cumulative_dist = nbinom.cdf(thresh_range, bin_n, bin_p)
    prob_dist = sum_to_1(un_cumulative(cumulative_dist))

    # Search from the high end of the range downwards.
    # NOTE(review): bisect assumes the reversed distribution is ascending;
    # that depends on un_cumulative/sum_to_1, which are defined elsewhere.
    descending_thresholds = thresh_range[::-1]
    index = bisect_left(prob_dist[::-1], p_value)

    # bisect_left returns len(...) when p_value exceeds every entry; clamp
    # so that case falls back to the lowest threshold.
    index = min(index, len(descending_thresholds) - 1)
    return descending_thresholds[index]
def plot_binom(breaks, counts, params):
    """
    Given the parameters of a composite negative binomial and lognormal
    distribution, plot only the contribution of the negative binomial.

    :param breaks: Array containing histogram bin edges.
    :type breaks: :class:`~numpy.ndarray`
    :param counts: Array containing the number of windows
        falling into each histogram bin.
    :type counts: :class:`~numpy.ndarray`
    :param tuple params: Parameters of the composite
        distribution (see :func:`~n_binom_plus_log_normal`). Only
        ``bin_n``, ``bin_p`` and ``size`` are used.
    :returns: Whatever ``plt.plot`` returns for the green curve.
    """
    # Unpack all five values so a malformed ``params`` still fails loudly;
    # the lognormal parameters are irrelevant for this curve.
    bin_n, bin_p, _nm_delta, _nm_scale, size = params

    # Scale the negative binomial to the histogram: total number of
    # observations times the share of the mixture that is not lognormal.
    binom_y = neg_binomial(breaks, bin_n, bin_p) * sum(counts) * (1. - abs(size))
    binom_y = mask_x_by_z(binom_y, counts)

    fit_x = get_fit_x(breaks, counts)
    return plt.plot(fit_x, binom_y, color='green')
def n_binom_plus_log_normal(params, x):
    """
    Composite probability density function for the sum of a lognormal
    distribution and a negative binomial distribution.

    ``params`` is a tuple of all parameters for both underlying
    distributions::

        params = bin_n, bin_p, nm_delta, nm_scale, size

    Negative binomial parameters (see :data:`~scipy.stats.nbinom`):

    * ``bin_n`` -- number of trials
    * ``bin_p`` -- probability of success

    Lognormal parameters:

    * ``nm_delta`` -- offset of the lognormal location from the log10 of
      the negative binomial mean; only its absolute value is used
    * ``nm_scale`` -- standard deviation of the lognormal distribution

    The lognormal is parameterised relative to the negative binomial
    because solutions where the lognormal sits below the negative
    binomial mean are unwanted. Expressing its position as a distance
    (``nm_delta``) that is always treated as positive rules them out.

    The final parameter, ``size``, is the mixing ratio: the lognormal is
    weighted by ``abs(size)`` and the negative binomial by
    ``1 - abs(size)``.

    :param tuple params: Parameters of the composite function.
    :param x: Edges of bins within which to calculate probability density.
    :type x: :class:`~numpy.ndarray`
    :returns: The weighted sum of both components, normalised to sum to 1.
        NOTE(review): the original documentation states that the result
        has one value fewer than ``x`` (one value per bin); that depends
        on ``neg_binomial`` and ``normal``, which are defined elsewhere.
    """
    bin_n, bin_p, nm_delta, nm_scale, size = params

    # Only the mean of the negative binomial is needed to place the
    # lognormal component.
    nb_mean, _nb_var = nbinom.stats(bin_n, bin_p)
    lognormal_loc = np.abs(nm_delta) + np.log10(nb_mean)

    lognormal_weight = abs(size)
    nb_part = neg_binomial(x, bin_n, bin_p) * (1. - lognormal_weight)
    lognormal_part = normal(x, lognormal_loc, nm_scale) * lognormal_weight

    mixture = nb_part + lognormal_part
    return mixture / sum(mixture)
def _random_noise(df, noise_factor): r""" Generates random noise on an observable by a Negative Binomial :math:`NB`. References to the negative binomial can be found `here <https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Negative_Binomial_Regression.pdf>`_ . .. math:: O &\sim NB(\mu=datapoint,\alpha) We keep the alpha parameter low to obtain a small variance which should than always be approximately the size of the mean. Parameters ---------- df : new_cases , pandas.DataFrame Observable on which we want to add the noise noise_factor: :math:`\alpha` Alpha factor for the random number generation Returns ------- array : 1-dim observable with added noise """ def convert(mu, alpha): r = 1 / alpha p = mu / (mu + r) return r, 1 - p # Apply noise on every column for column in df: # Get values array = df[column].to_numpy() for i in range(len(array)): if (array[i] == 0) or (np.isnan(array[i])): continue log.debug(f"Data {array[i]}") r, p = convert(array[i], noise_factor) log.info(f"n {r}, p {p}") mean, var = nbinom.stats(r, p, moments="mv") log.debug(f"mean {mean} var {var}") array[i] = nbinom.rvs(r, p) log.debug(f"Drawn {array[i]}") df[column] = array return df
predR.append(RRest) testRRM=1.+infperiod*ln( gamma.ppf(0.99, a=alpha, scale=1./beta) )# these are the boundaries of the 99% confidence interval for new cases if (testRRM <0.): testRRM=0. pstRRM.append(testRRM) testRRm=1.+infperiod*ln( gamma.ppf(0.01, a=alpha, scale=1./beta) ) if (testRRm <0.): testRRm=0. pstRRm.append(testRRm) #print('estimated RR=',RRest,testRRm,testRRM) # to see the numbers for the evolution of Rt if (new_cases>0. and old_new_cases>0.): NewCases.append(new_cases) # Using a Negative Binomial as the Posterior Predictor of New Cases, given old one # This takes parameters r,p which are functions of new alpha, beta from Gamma r, p = alpha, beta/(old_new_cases+beta) mean, var, skew, kurt = nbinom.stats(r, p, moments='mvsk') pred.append(mean) # the expected value of new cases testciM=nbinom.ppf(0.99, r, p) # these are the boundaries of the 99% confidence interval for new cases pstdM.append(testciM) testcim=nbinom.ppf(0.01, r, p) pstdm.append(testcim) newp=p newr=r flag=0 while (new_cases>testciM or new_cases<testcim): if (flag==0): anomalyday.append(dates[i+1]) # the first new cases are at i=2 anomalypred.append(new_cases)
# Finish the binomial figure: thick translucent stems for the pmf computed
# through the distribution object, thin black stems for the frozen one.
ax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)
rv = binom(n, p)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)

# ============================================= #
# ============= NEGATIVE BINOMIAL ============= #
# ============================================= #
fig, ax = plt.subplots(1, 1)
n, p = 50, 0.4
# First four moments of the distribution (mean, variance, skewness, kurtosis).
mean, var, skew, kurt = nbinom.stats(n, p, moments='mvsk')
# Support to display: from the 1% quantile up to (excluding) the 99% quantile.
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label='nbinom pmf')
ax.vlines(x, 0, nbinom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)
# Same pmf evaluated through a frozen distribution, drawn on top in black.
rv = nbinom(n, p)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)

# ============================================= #
# ================= GEOMETRIC ================= #
def _nb_moment_fit(values):
    """Fit a negative binomial to *values* by the method of moments.

    Returns ``(r, p, sample_variance)``. Raises ``ZeroDivisionError`` for
    fewer than two values or when the sample variance equals the mean.
    """
    mu = sum(values) / len(values)
    squared_deviations = [(mu - float(v))**2 for v in values]
    sigma = math.sqrt(sum(squared_deviations) / (len(squared_deviations) - 1))
    r = (mu**2) / (sigma**2 - mu)
    p = mu / (sigma**2)
    return r, p, sigma**2


def _run_frequencies(values):
    """Return the keys and relative sizes of the consecutive equal runs in *values*."""
    keys = []
    sizes = []
    for key, group in groupby(values):
        keys.append(key)
        sizes.append(len(list(group)))
    total = sum(sizes)
    return keys, [size / total for size in sizes]


def _plot_nb_overlay(bar_x, bar_y, r, p, bar_color, title, xaxis_title):
    """Show observed frequencies as bars with the fitted negative binomial pmf as a line."""
    nb_x = np.arange(nbinom.ppf(0.01, r, p), nbinom.ppf(0.9999, r, p))
    nb_y = nbinom.pmf(nb_x, r, p)
    fig = go.Figure(data=[
        go.Bar(x=bar_x,
               y=bar_y,
               marker_color=bar_color,
               marker_line_color="black",
               name="Passes Completed")
    ])
    fig.add_trace(
        go.Scatter(x=nb_x,
                   y=nb_y,
                   marker_color="black",
                   mode='lines',
                   name='Negative Binomial Approximation'))
    fig.update_layout(title=title,
                      xaxis_title=xaxis_title,
                      yaxis_title="Frequency",
                      boxmode='group',
                      plot_bgcolor='rgb(220,220,220)')
    iplot(fig)


def collect_and_plot_passes_nb(teams_list=None,
                               teams_dict=None,
                               plot_output=['single', 'all'],
                               teams_col_dict=None):
    """Collect per-team sequence counts, fit a negative binomial, optionally plot.

    For every team, the events of each distinct ``'Date/Time'`` value
    (excluding rows whose ``'Event Type'`` is ``'Cessation'``) are turned
    into sequences with ``get_sequences``; the resulting counts are fitted
    with a negative binomial by the method of moments.

    :param teams_list: Iterable of team identifiers. ``None`` is treated as
        "no teams" and yields empty results.
    :param teams_dict: Mapping from team identifier to that team's event
        DataFrame (columns ``'Date/Time'``, ``'Event Type'`` and
        ``'Opponent'`` are read).
    :param plot_output: ``'single'`` draws one figure per team, ``'all'``
        draws one league-wide figure. NOTE(review): the default list equals
        neither string, so by default nothing is plotted; the default is
        left unchanged for backward compatibility.
    :param teams_col_dict: Mapping from team identifier to bar colour; only
        read when ``plot_output == 'single'``.
    :returns: Tuple ``(dict_of_passing_stats, team_sequences,
        all_sequences)``: the fit statistics per team, the sequences per
        team keyed by ``"<date> | <opponent>"``, and the sorted list of all
        counts across teams.
    """
    if teams_list is None:
        teams_list = []

    team_sequences = {}
    dict_of_passing_stats = {}
    all_sequences = []

    for tm in teams_list:
        df = teams_dict[tm]

        date_sequences = {}
        for d in set(df['Date/Time']):
            df_filter = df[df['Date/Time'] == d]
            df_filter = df_filter[df_filter['Event Type'] != 'Cessation']
            opponent = df_filter['Opponent'].iloc[0]
            kee = str(d) + ' | ' + opponent
            date_sequences[kee] = get_sequences(df_filter)
        team_sequences[tm] = date_sequences

        counts = convert_date_sequences_to_list_and_count(date_sequences)
        all_sequences.extend(counts)

        # NOTE(review): groupby only merges *adjacent* equal values, so this
        # is a proper histogram only if `counts` arrives sorted - confirm
        # against convert_date_sequences_to_list_and_count.
        x_values_for_barplot, y_values_for_barplot = _run_frequencies(counts)

        ## (GP) NB Estimation
        r, p, sample_variance = _nb_moment_fit(counts)
        mean, _var, skew, kurt = nbinom.stats(r, p, moments='mvsk')

        dict_of_passing_stats[tm] = {
            'nb_probability': p,
            'nb_r': r,
            'avg_passes': mean,
            'var_passes': sample_variance,
            'nb_skew': skew,
            'nb_kurtosis': kurt,
        }

        if plot_output == 'single':
            _plot_nb_overlay(
                x_values_for_barplot,
                y_values_for_barplot,
                r,
                p,
                bar_color=teams_col_dict[tm],
                title="{}: Catch Counts, with Negative Binomial Estimation".
                format(tm),
                xaxis_title="n Number of Catches")

    all_sequences.sort()

    if plot_output == 'all':
        r_a, p_a, _ = _nb_moment_fit(all_sequences)
        x_values_for_barplot_a, y_values_for_barplot_a = _run_frequencies(
            all_sequences)
        _plot_nb_overlay(
            x_values_for_barplot_a,
            y_values_for_barplot_a,
            r_a,
            p_a,
            bar_color="oldlace",
            title=
            "League Wide Catch Counts Per Possession, with Negative Binomial Estimation",
            xaxis_title="n Number of Catches in a Possession")

    return (dict_of_passing_stats, team_sequences, all_sequences)
def run_luis_model(df: pd.DataFrame, filepath: Path) -> None:
    """Estimate a time series of R_t per state and write it to CSV.

    For every state the cumulative ``'positive'`` counts are differenced,
    clipped at zero and smoothed with a moving average. A Gamma
    distribution (shape ``alpha``, rate ``beta``) is then updated day by
    day with the smoothed new cases; R_t is derived from its mean as
    ``1 + infperiod * ln(mean)`` (floored at 0), and the band from its 1%
    and 99% quantiles. When the observed new cases fall outside the
    1%-99% range of the negative binomial posterior predictive, that
    distribution is widened ("annealed") at constant mean until the
    observation is enclosed, and the Gamma parameters are reset to match.

    States are skipped when their last cumulative count is below 10 or
    when they have fewer than four daily differences (the end-of-series
    correction needs four).

    :param df: Input data with columns ``'state'``, ``'date'`` and
        ``'positive'`` (cumulative confirmed cases).
    :param filepath: Directory in which ``luis_code_estimates.csv`` is
        written.
    :returns: ``None``. The CSV has columns ``state``, ``date``,
        ``RR_pred_luis``, ``RR_CI_lower_luis`` and ``RR_CI_upper_luis``;
        the first three dates of every state have no estimate.
    """
    # Imported once at function scope. The original re-imported these on
    # every state because locals named `np` shadowed the NumPy module.
    import numpy as np
    from scipy.stats import gamma

    infperiod = 4.5  # length of infectious period, adjust as needed
    sdays = 15  # width (in days) of the moving-average window
    min_differences = 4  # the tail correction below reads dconfirmed[-4]

    def smooth(y, box_pts):
        box = np.ones(box_pts) / box_pts
        return np.convolve(y, box, mode='same')

    def rt_from_rate(rate):
        # R_t = 1 + infectious period * ln(rate), floored at zero.
        value = 1. + infperiod * ln(rate)
        if (value < 0.):
            value = 0.
        return value

    frames = []
    for state in df['state'].unique():
        statedf = df[df['state'] == state].sort_values('date')
        confirmed = list(statedf['positive'])
        dates = list(statedf['date'])

        if (confirmed[-1] < 10.):
            continue  # this skips the Rt analysis for states for which there are <10 total cases

        ##### estimation and prediction
        dconfirmed = np.diff(confirmed)
        dconfirmed[dconfirmed < 0.] = 0.  # downward corrections count as no new cases

        if len(dconfirmed) < min_differences:
            continue  # too short for the end-of-series correction

        # Moving average over sdays to even out chunky reporting. The window
        # is capped at the series length because np.convolve(mode='same')
        # returns max(len(y), len(box)) samples, which would otherwise leave
        # the estimates misaligned with the dates for short series.
        yy = smooth(dconfirmed, min(sdays, len(dconfirmed)))
        # The last two points tend to be under-reported and the smoother
        # struggles at the edge, so use plain 3-day averages there.
        yy[-2] = (dconfirmed[-4] + dconfirmed[-3] + dconfirmed[-2]) / 3.
        yy[-1] = (dconfirmed[-3] + dconfirmed[-2] + dconfirmed[-1]) / 3.

        # Confirmed cases after smoothing.
        TotalCases = np.cumsum(yy)

        alpha = 3.  # shape parameter of gamma distribution
        beta = 2.  # rate parameter of gamma distribution see https://en.wikipedia.org/wiki/Gamma_distribution

        predR = []
        pstRRM = []
        pstRRm = []

        for i in range(2, len(TotalCases)):
            new_cases = float(TotalCases[i] - TotalCases[i - 1])
            old_new_cases = float(TotalCases[i - 1] - TotalCases[i - 2])

            # This uses a conjugate prior as a Gamma distribution for b_t, with parameters alpha and beta
            alpha = alpha + new_cases
            beta = beta + old_new_cases

            mean = gamma.stats(a=alpha, scale=1 / beta, moments='m')
            predR.append(rt_from_rate(mean))
            # 1% and 99% quantiles of the Gamma give the band around R_t.
            pstRRM.append(rt_from_rate(gamma.ppf(0.99, a=alpha, scale=1. / beta)))
            pstRRm.append(rt_from_rate(gamma.ppf(0.01, a=alpha, scale=1. / beta)))

            if not (new_cases > 0. and old_new_cases > 0.):
                continue  # no anomaly test without cases on both days

            # Using a Negative Binomial as the Posterior Predictor of New Cases, given old one
            # This takes parameters r,p which are functions of new alpha, beta from Gamma
            r, p = alpha, beta / (old_new_cases + beta)
            testciM = nbinom.ppf(0.99, r, p)
            testcim = nbinom.ppf(0.01, r, p)

            new_p = p
            new_r = r
            annealed = False
            while (new_cases > testciM or new_cases < testcim):
                # annealing: increase variance so as to encompass anomalous observation: allow Bayesian code to recover
                # mean of negbinomial = r*(1-p)/p, variance = r*(1-p)/p**2 = mean/p
                # Shrinking p by 5% at constant mean therefore inflates the
                # variance by a factor 1/0.95 per step.
                next_p = 0.95 * new_p
                # this assignment preserves the mean of expected cases
                new_r = new_r * (next_p / new_p) * ((1. - new_p) / (1. - next_p))
                new_p = next_p
                testciM = nbinom.ppf(0.99, new_r, new_p)
                testcim = nbinom.ppf(0.01, new_r, new_p)
                annealed = True

            if annealed:
                # this updates the R distribution with the new parameters that enclose the anomaly
                alpha = new_r
                beta = new_p / (1. - new_p) * old_new_cases
                # annealing leaves the RR mean unchanged, but the band
                # recorded for this day must be replaced by the widened one
                pstRRM[-1] = rt_from_rate(gamma.ppf(0.99, a=alpha, scale=1. / beta))
                pstRRm[-1] = rt_from_rate(gamma.ppf(0.01, a=alpha, scale=1. / beta))

        # The loop starts at i=2 on the differenced series, so the first
        # estimate belongs to the fourth date.
        days = dates[3:]
        frames.append(
            pd.DataFrame({
                'state': state,
                'date': days,
                'RR_pred_luis': predR,
                'RR_CI_lower_luis': pstRRm,
                'RR_CI_upper_luis': pstRRM
            }))

    # Concatenate once instead of growing a DataFrame inside the loop.
    returndf = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    returndf.to_csv(filepath / "luis_code_estimates.csv", index=False)
# NEGATIVE BINOMIAL DISTRIBUTION
from scipy.stats import nbinom

# Point probability, with and without an explicit (default) location shift.
nbinom.pmf(k=5, n=2, p=0.1)
nbinom.pmf(k=5, n=2, p=0.1, loc=0)
# Cumulative probability P(X <= 4) and its complement P(X > 4).
nbinom.cdf(k=4, n=2, p=0.1)
1 - nbinom.cdf(k=4, n=2, p=0.1)
# 100 random draws.
nbinom.rvs(n=2, p=0.1, size=100)
# Mean and variance of the distribution.
params = nbinom.stats(n=2, p=0.1, moments='mv')
'E(X) = {} y Var(X) = {}'.format(params[0], params[1])

n, p = 10, 0.25
# Support to display: from the 1% quantile up to (excluding) the 99% quantile.
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
fig = plt.figure(figsize=(5, 2.7))

# Left panel: probability mass function.
ax = fig.add_subplot(1, 2, 1)
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label="nbinom pmf")
ax.vlines(x, 0, nbinom.pmf(x, n, p), color="b", lw=5, alpha=0.5)
# Shrink the major tick labels on both axes. `Tick.label` no longer exists
# in current Matplotlib, so the size is set through tick_params instead.
ax.tick_params(axis='both', which='major', labelsize=5)

# Right panel: cumulative distribution function.
ax = fig.add_subplot(1, 2, 2)
ax.plot(x, nbinom.cdf(x, n, p), 'bo', ms=8, label='nbinom cdf')