def q2():
    """Return the share of ``dataframe['normal']`` within one std of its mean.

    The share is read off the empirical CDF as F(mean + std) - F(mean - std),
    rounded to three decimals and returned as a plain ``float``.
    """
    # Return the result of question 2 here.
    sample = dataframe['normal']
    center = sample.mean()
    spread = sample.std()
    empirical_cdf = ECDF(sample)
    share = empirical_cdf(center + spread) - empirical_cdf(center - spread)
    return float(share.round(3))
def q4():
    """Evaluate the ECDF of the standardized non-pulsar profile at normal quantiles.

    Rows of ``stars`` with ``target == 0`` are selected, their
    ``mean_profile`` is centred and scaled with ``np.mean`` / ``np.std``, and
    the empirical CDF of the result is evaluated at the standard-normal
    0.80, 0.90 and 0.95 quantiles.

    Returns:
        A 3-tuple of the ECDF values, each rounded to three decimals.
    """
    # Return the result of question 4 here.
    non_pulsar = stars[stars['target'] == 0].mean_profile
    standardized = (non_pulsar - np.mean(non_pulsar)) / np.std(non_pulsar)
    empirical_cdf = ECDF(standardized)
    normal_quantiles = sct.norm.ppf([.8, .9, .95])
    return tuple(round(value, 3) for value in empirical_cdf(normal_quantiles))
def q2():
    """Return the empirical probability of ``dataframe.normal`` within mean ± std.

    The value is F(mean + std) - F(mean - std) on the empirical CDF of the
    column, rounded to three decimals.

    Returns:
        The rounded probability as a built-in ``float``.
    """
    # Return the result of question 2 here.
    inferior = dataframe.normal.mean() - dataframe.normal.std()
    superior = dataframe.normal.mean() + dataframe.normal.std()
    ecdf = ECDF(dataframe.normal)
    # The ``np.float`` alias no longer exists in current NumPy releases; the
    # built-in ``float`` is what it always referred to.
    return float(round(ecdf(superior) - ecdf(inferior), 3))
def sample_background(intensities, n):
    """Draw ``n`` values from the central part of ``intensities``.

    Only values strictly between the 10th and 90th percentiles are kept. The
    draws are made by locating uniform random numbers in the empirical CDF of
    that trimmed sample and returning the matching ECDF support points.
    """
    lower_cut = np.percentile(intensities, 10)
    upper_cut = np.percentile(intensities, 90)
    in_band = np.logical_and(lower_cut < intensities, intensities < upper_cut)
    empirical_cdf = ECDF(intensities[in_band])
    support = empirical_cdf.x
    # NOTE(review): a uniform draw of exactly 0.0 maps to position 0 of the
    # support -- confirm what that leading ECDF entry is.
    positions = np.searchsorted(empirical_cdf.y, np.random.uniform(size=n))
    return support[positions]
def estimate_empirical_cdf(X: np.ndarray, X_new: Optional[np.ndarray] = None):
    """Evaluate the empirical CDF fitted on ``X``.

    Args:
        X: Sample used to build the ECDF.
        X_new: Points at which to evaluate it; when ``None`` the ECDF is
            evaluated at ``X`` itself.

    Returns:
        The ECDF values at the evaluation points.
    """
    ecdf_f = ECDF(X)
    evaluation_points = X if X_new is None else X_new
    return ecdf_f(evaluation_points)
def q2():
    """Return the empirical probability of ``dataframe['normal']`` within mean ± std.

    Computed as F(mean + std) - F(mean - std) on the column's empirical CDF,
    rounded to three decimals and returned as a ``float``.
    """
    column = dataframe['normal']
    mean_value = column.mean()
    std_value = column.std()
    empirical_cdf = ECDF(column)
    upper_mass = empirical_cdf(mean_value + std_value)
    lower_mass = empirical_cdf(mean_value - std_value)
    return float(np.round(upper_mass - lower_mass, 3))
def build_edf_fr_vals(data):
    """Build the empirical distribution function of ``data``.

    ``data`` is flattened with ``ravel()`` and passed to statsmodels' ECDF.

    Returns:
        ``(x, y)``: the ECDF's ``x`` and ``y`` arrays with their first entry
        dropped (presumably the step function's leading sentinel point --
        confirm against the statsmodels ECDF documentation).
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    step_function = ECDF(data.ravel())
    return step_function.x[1:], step_function.y[1:]
def q4():
    """Evaluate the ECDF of the standardized profile at three normal quantiles.

    Uses the module-level ``false_pulsar_mean_profile_standardized`` and the
    standard-normal (loc 0, scale 1) quantiles for 0.80, 0.90 and 0.95.

    Returns:
        A 3-tuple of ECDF values, each rounded to three decimals.
    """
    # Return the result of question 4 here.
    fn_ecdf = ECDF(false_pulsar_mean_profile_standardized)
    return tuple(
        fn_ecdf(sct.norm.ppf(level, loc=0, scale=1)).round(3)
        for level in (0.8, 0.9, 0.95)
    )
def percentiles_computation(chunk):
    """Compute within-group ECDF percentiles of watch percentage for one chunk.

    The chunk ``chunks_df[chunk]`` is split by ``quantile_rank``. Within each
    group the empirical CDF of ``average_watch_percentage`` is evaluated at
    the group's own values and rounded to two decimals.

    Args:
        chunk: Key/index into the module-level ``chunks_df`` collection.

    Returns:
        A DataFrame with columns ``video_id``, ``quantiles`` (the group's
        ``quantile_rank``) and ``percentiles``, with a fresh integer index.
        An empty DataFrame is returned when the chunk has no groups.
    """
    chunk_df = chunks_df[chunk]

    # Collect one frame per group and concatenate once at the end:
    # ``DataFrame.append`` is gone from current pandas and appending inside a
    # loop copies the accumulated frame on every iteration.
    group_frames = []
    for quantile_rank in pd.unique(chunk_df.loc[:, 'quantile_rank']):
        group = chunk_df.loc[
            chunk_df['quantile_rank'] == quantile_rank
        ].reset_index(drop=True)

        watch_values = group.loc[
            :, ['average_watch_percentage']
        ].to_numpy().flatten()

        # Percentile of each value within its own group, via the ECDF.
        group_ecdf = ECDF(watch_values)
        percentiles = pd.Series(group_ecdf(watch_values)).round(decimals=2)

        # One copy of the group's rank per row, aligned on a 0..n-1 index.
        quantiles = pd.Series(quantile_rank).repeat(
            repeats=group.shape[0]).reset_index(drop=True)

        combined = pd.concat(
            [group.loc[:, ['video_id']], quantiles, percentiles],
            axis=1, sort=False)
        # The two unnamed Series arrive as columns 0 and 1.
        combined = combined.rename(
            columns={0: 'quantiles', 1: 'percentiles'}, inplace=False)
        group_frames.append(combined)

    if not group_frames:
        return pd.DataFrame()
    return pd.concat(group_frames, ignore_index=True, sort=False)
def drawCumulativeHist(h0, h1, h2, h3, fname):
    """Plot empirical CDFs of three delay samples and save them as a PDF.

    ``h0`` (labelled "FIFO") is divided by 10 before plotting; ``h2``
    ("DCUM") and ``h3`` ("DUM") are plotted as given. The "LRFL" curve for
    ``h1`` is disabled, so ``h1`` only contributes to the number of
    evaluation points. All curves are evaluated on a common grid spanning
    the scaled ``h0`` range.

    The scaling is applied to a copy, so the caller's ``h0`` is left
    untouched and repeated calls do not compound the division.

    Args:
        h0, h1, h2, h3: Sequences of delay values.
        fname: Output path without extension; ``'.pdf'`` is appended.
    """
    # pyplot.rc('font', family='serif', serif='Times')
    pyplot.rc('text', usetex=True)
    pyplot.rc('xtick', labelsize=8)
    pyplot.rc('ytick', labelsize=8)
    pyplot.rc('axes', labelsize=8)

    # width as measured in inkscape
    width = 3.487
    height = width / 1.618

    # Scale into a new list instead of overwriting the caller's data.
    fifo_delays = [value / 10 for value in h0]

    fig, ax = pyplot.subplots()
    fig.subplots_adjust(left=.15, bottom=.16, right=.99, top=.97)

    length = int(min(len(h0), len(h1), len(h2), len(h3)))
    ecdf = ECDF(fifo_delays)
    ecdf2 = ECDF(h2)
    ecdf3 = ECDF(h3)

    x = np.linspace(min(fifo_delays), max(fifo_delays), length)
    y = ecdf(x)
    y2 = ecdf2(x)
    y3 = ecdf3(x)

    pyplot.step(x, y2, label="DCUM", color="blue")
    # The "LRFL" (h1) curve is intentionally not drawn.
    pyplot.step(x, y3, label="DUM", color="black", linestyle=":")
    pyplot.step(x, y, label="FIFO", color="red", linestyle="-.")
    pyplot.xlabel('delays/ms')
    pyplot.ylabel('CDF')
    pyplot.xlim(0, 3000)
    pyplot.title('CDF of delays')
    pyplot.legend(loc='lower right')

    fig.set_size_inches(width, height)
    fig.savefig(fname + '.pdf')
    pyplot.show()
def CDF_estimator(data):
    """Bootstrap the ECDF of ``data['mag']`` and report CIs for F(4.9) - F(4.3).

    Steps:
      1. Draw 1000 bootstrap resamples (via ``bootstrap``), recording
         ``F*(4.9) - F*(4.3)`` for each and pooling all resampled values.
      2. Plot the ECDF of the pooled values on [3.5, 6.5] together with a
         95% confidence band of half-width
         ``sqrt(ln(2 / 0.05) / (2 * len(data)))`` clipped to [0, 1].
      3. Print normal, percentile and pivotal 95% intervals for
         ``F(4.9) - F(4.3)``.

    Args:
        data: Mapping/DataFrame with a ``'mag'`` column exposing ``.values``.

    Returns:
        None. Results are plotted and printed.
    """
    # Bootstrapping 1000 times, keeping theta* of every resample.
    bootstrapped_samples = []
    theta_star = []
    for _ in range(1000):
        resample = bootstrap(data['mag'].values)
        resample_ecdf = ECDF(resample)
        theta_star.append(resample_ecdf(4.9) - resample_ecdf(4.3))
        bootstrapped_samples.extend(resample)

    # Estimated empirical CDF, evaluated on the whole grid in one call.
    ecdf = ECDF(bootstrapped_samples)
    line = np.linspace(3.5, 6.5, 1000)
    ecdf_points = ecdf(line)
    plt.plot(line, ecdf_points)

    # Confidence band: the DKW bound uses the natural logarithm, not log10.
    epsilon = math.sqrt(math.log(2 / 0.05) / (2 * len(data)))
    lower_band_points = [max(point - epsilon, 0) for point in ecdf_points]
    upper_band_points = [min(point + epsilon, 1) for point in ecdf_points]
    plt.title('Red: Lower CB | Green: Upper CB')
    plt.plot(line, lower_band_points, color='red')
    plt.plot(line, upper_band_points, color='green')
    plt.show()

    # Three types of CI for F(4.9) - F(4.3).
    # np.percentile takes q on a 0-100 scale, so the 2.5% point is q=2.5.
    lower_q, upper_q = np.percentile(theta_star, [2.5, 97.5])
    theta_hat = ecdf(4.9) - ecdf(4.3)

    # Normal
    se = standard_error(theta_star)
    normal_CI = (theta_hat - 1.96 * se, theta_hat + 1.96 * se)
    print('Normal Interval:', normal_CI)

    # Percentile
    percentile_CI = (lower_q, upper_q)
    print('Percentile Interval:', percentile_CI)

    # Pivotal
    pivotal_CI = (2 * theta_hat - upper_q, 2 * theta_hat - lower_q)
    print('Pivotal Interval:', pivotal_CI)
def quantile_mapping(mod, obs, downscale, *args, **kwargs):
    """Quantile-map ``downscale`` using empirical distributions of ``mod`` and ``obs``.

    Each value of ``downscale`` is ranked (in percent) on the ECDF of
    ``mod``. The difference between the ``obs`` and ``mod`` percentiles at
    that rank, both computed with NaNs removed, is added as a correction.

    Extra positional and keyword arguments are accepted and ignored.
    """
    mod_ecdf = ECDF(mod)
    ranks = mod_ecdf(downscale) * 100
    obs_valid = obs[~np.isnan(obs)]
    mod_valid = mod[~np.isnan(mod)]
    correction = np.percentile(obs_valid, ranks) - np.percentile(mod_valid, ranks)
    return downscale + correction
def q2():
    """Return the empirical probability of ``dataframe.normal`` within mean ± std.

    Upper cumulative mass minus lower cumulative mass on the column's
    empirical CDF, rounded to three decimals, as a ``float``.
    """
    # Empirical CDF of the variable.
    empirical_cdf = ECDF(dataframe.normal)
    # Mean and standard deviation.
    center = dataframe.normal.mean()
    spread = dataframe.normal.std()
    # Cumulative mass up to the upper bound minus mass up to the lower bound.
    mass_within = empirical_cdf(center + spread) - empirical_cdf(center - spread)
    return float(round(mass_within, 3))
def q4():
    """Evaluate the ECDF of the standardized non-pulsar profile at normal quantiles.

    ``stars.mean_profile`` restricted to ``target == 0`` is standardized with
    its own ``mean()`` / ``std()``; the empirical CDF of the result is
    evaluated at the standard-normal 0.80, 0.90 and 0.95 quantiles.

    Returns:
        A 3-tuple of ECDF values rounded to three decimals.
    """
    non_pulsar = stars.mean_profile[stars.target == 0]
    standardized = (non_pulsar - non_pulsar.mean()) / non_pulsar.std()
    normal_quantiles = sct.norm.ppf([0.80, 0.90, 0.95])
    probabilities = ECDF(standardized)(normal_quantiles)
    return tuple(np.round(probabilities, 3))
def q2():
    """Return the empirical probability of ``dataframe.normal`` within mean ± std.

    Computed as F(mean + std) - F(mean - std) on the column's empirical CDF
    and rounded to three decimals with the built-in ``round``.
    """
    # Return the result of question 2 here.
    empirical_cdf = ECDF(dataframe.normal)
    center = dataframe.normal.mean()
    spread = dataframe.normal.std()
    mass_within = empirical_cdf(center + spread) - empirical_cdf(center - spread)
    return round(mass_within, 3)
def q2():
    """Return the empirical probability of ``dataframe['normal']`` within mean ± std.

    The bounds are mean - std and mean + std of the column; the result is the
    ECDF difference between them, rounded to three decimals, as a ``float``.
    """
    column = dataframe['normal']
    lower_bound = column.mean() - column.std()
    upper_bound = column.mean() + column.std()
    empirical_cdf = ECDF(column)
    mass_within = empirical_cdf(upper_bound) - empirical_cdf(lower_bound)
    return float(np.round(mass_within, 3))
def perc(x):
    """Return the percentile rank (ECDF value times 100) of every entry of ``x``.

    >>> perc(np.arange(10))
    array([ 10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.])
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    empirical_cdf = ECDF(x)
    return empirical_cdf(x) * 100
def q5():
    """Return the Q1/Q2/Q3 gaps between the standardized profile and N(0, 1).

    ``df_stars['mean_profile']`` restricted to rows where ``target`` is
    ``False`` is standardized with its own ``mean()`` / ``std()``. Its 25th,
    50th and 75th percentiles are then compared with the corresponding
    standard-normal quantiles.

    The sample quantiles are taken from the data itself. Evaluating the ECDF
    at 0.25/0.50/0.75 would instead yield probabilities, which are not
    comparable with quantiles.

    Returns:
        A 3-tuple ``(Q1, Q2, Q3)`` of sample-minus-normal quantile
        differences, each rounded to three decimals.
    """
    # Imported locally so the function does not rely on a module-level
    # numpy import.
    import numpy as np

    profile = df_stars['mean_profile']
    profile = profile[df_stars['target'] == False]
    false_pulsar_mean_profile_standardized = (
        (profile - profile.mean()) / profile.std()
    )

    normal_quantiles = sct.norm.ppf([0.25, 0.50, 0.75])
    # np.percentile expects q on a 0-100 scale.
    false_pulsar_quantiles = np.percentile(
        false_pulsar_mean_profile_standardized, [25, 50, 75])

    return tuple(
        round(i, 3) for i in false_pulsar_quantiles - normal_quantiles)
def q2():
    """Return the empirical probability of ``dataframe['normal']`` within mean ± std.

    The ECDF is evaluated at both interval ends in a single call and the
    difference is rounded to three decimals.
    """
    # Return the result of question 2 here.
    column = dataframe['normal']
    center = column.mean()
    spread = column.std()
    empirical_cdf = ECDF(column)
    lower_mass, upper_mass = empirical_cdf([center - spread, center + spread])
    return (upper_mass - lower_mass).round(3)
def epsilon_time_from_distance(dfg_time_inner, aggregate_type, beta, distance, precision, sens_time):
    """Derive an epsilon for a list of times and the per-value deltas it implies.

    The "accurate" aggregate of ``dfg_time_inner`` (AVG, SUM, MIN or MAX,
    chosen by ``aggregate_type``) is scaled by ``distance`` and used to
    compute ``epsilon = sens_time / (aggregate * distance) * log(1 / beta)``;
    when the aggregate is 0, epsilon is set to 1. For every value ``t_k`` a
    delta is then computed from the ECDF mass within ``t_k ± max * precision``.

    NOTE(review): the loop nesting below was reconstructed from a
    whitespace-collapsed source; in particular, whether the final
    ``if current_delta != 0`` belongs inside the loop should be confirmed.

    Returns:
        ``(delta_edge, delta_per_event, delta_time_inner, epsilon_time_inner)``
        where the first two lists hold every delta, the third only the
        non-zero ones, and the last is the computed epsilon.
    """
    delta_time_inner = []
    delta_edge = []
    delta_per_event = []
    R_ij = max(dfg_time_inner)
    r_ij = R_ij * precision
    accurate_result = 0
    # calculating the accurate result
    if aggregate_type == AggregateType.AVG:
        accurate_result = sum(dfg_time_inner) * 1.0 / len(dfg_time_inner)
    elif aggregate_type == AggregateType.SUM:
        accurate_result = sum(dfg_time_inner) * 1.0
    elif aggregate_type == AggregateType.MIN:
        accurate_result = min(dfg_time_inner) * 1.0
    elif aggregate_type == AggregateType.MAX:
        accurate_result = max(dfg_time_inner) * 1.0
    # in case of the time is instant, we set epsilon to avoid the error of division by zero
    if accurate_result == 0:
        epsilon_time_ij = 1
    else:
        distance_ij = accurate_result * distance  # hence distance is between 0 and 1
        # calculate epsilon
        epsilon_time_ij = sens_time / distance_ij * log(1 / beta)
    epsilon_time_inner = epsilon_time_ij
    # fix the case of time is fixed
    # NOTE(review): ``flag`` and ``current`` are assigned but never read in
    # this function.
    flag = 1
    prev = dfg_time_inner[0]
    current = dfg_time_inner
    for t_k in dfg_time_inner:
        # fix the case of time is fixed
        if t_k != prev:
            flag = 0
        prev = t_k
        # NOTE(review): the ECDF does not depend on t_k, yet it is rebuilt
        # on every iteration.
        cdf = ECDF(dfg_time_inner)
        # p_k is calculated for every instance.
        # presumably calculate_cdf evaluates ``cdf`` at the given point --
        # verify against its definition.
        cdf1 = calculate_cdf(cdf, t_k + r_ij)
        cdf2 = calculate_cdf(cdf, t_k - r_ij)
        p_k = cdf1 - cdf2
        # current_delta = p_k*( 1/( (1-p_k) * exp(-R_ij * epsilon_time) +p_k) -1)
        current_delta = (p_k / ( (1 - p_k) * exp(-R_ij * epsilon_time_ij) + p_k)) - p_k
        # eps = - log(p_k / (1.0 - p_k) * (1.0 / (current_delta + p_k) - 1.0)) / log(exp(1.0)) * (1.0 / R_ij)
        # we append the deltas and take the maximum delta out of them
        # if current_delta != float.nan:
        delta_edge.append(current_delta)
        # delta_per_event.append([x, current_delta])
        delta_per_event.append( current_delta)  # *****************!!!!!!!!!!!! changed
        if current_delta != 0:
            delta_time_inner.append(current_delta)
    return delta_edge, delta_per_event, delta_time_inner, epsilon_time_inner
def qq_plot(self):
    """Show a quantile-quantile scatter of ``self.values`` against ``self.quantile``.

    Each sorted observation is paired with ``self.quantile`` evaluated at the
    observation's empirical CDF value. The pairs are drawn as black dots over
    a red identity line spanning the observed range.
    """
    empirical_cdf = ECDF(self.values)
    observed = sorted(self.values)

    theoretical = []
    for observation in observed:
        theoretical.append(self.quantile(q=empirical_cdf(observation)))

    diagonal = np.linspace(min(self.values), max(self.values), 10)
    plt.plot(diagonal, diagonal, '-', color='red')
    plt.plot(observed, theoretical, '.', color='black')
    plt.show()
def q4():
    """Evaluate the ECDF of the standardized non-pulsar profile at normal quantiles.

    ``stars['mean_profile']`` for rows with ``target == 0`` is taken as a
    raw array (``.values``) and standardized with the array's own
    ``mean()`` / ``std()``. The ECDF is evaluated at the standard-normal
    0.80, 0.90 and 0.95 quantiles.

    Returns:
        A 3-tuple of ECDF values rounded to three decimals.
    """
    # Return the result of question 4 here.
    profile = stars['mean_profile'][(stars['target'] == 0)].values
    standardized = (profile - profile.mean()) / profile.std()
    normal_quantiles = sct.norm.ppf([0.8, 0.9, 0.95])
    empirical_cdf = ECDF(standardized)
    probabilities = empirical_cdf(normal_quantiles).round(3)
    return tuple(probabilities)
def fit_transform(self, X):
    """Fit one ECDF per column of ``X`` and map each column into [-1, 1].

    Every column is replaced by its own ECDF values, rescaled as
    ``2 * F(x) - 1``. The fitted ECDFs are stored in ``self.ecdfs`` in
    column order.

    ``self.ecdfs`` is reset on every call, so fitting again replaces the
    stored ECDFs rather than appending a second set after stale ones.

    Args:
        X: 2-D array whose columns are iterated via ``X.T``.

    Returns:
        Array of the same orientation as ``X`` holding the rescaled values.
    """
    self.ecdfs = []
    transformed_columns = []
    for col in X.T:
        ecdf = ECDF(col)
        self.ecdfs.append(ecdf)
        transformed_columns.append(ecdf(col))
    transformed_X = np.array(transformed_columns) * 2 - 1
    return transformed_X.T
def plot_domain_alignment():
    """Plot alignment deviations as a CDF and against performance deviations.

    Reads ``datadir/deviations.json`` (resolved through
    ``DataGetter.fmt_path``), whose entries unpack into (domain, alignment
    deviation, performance deviation), and writes two figures:

    * ``plotsdir/domain_alignment/alignment.png`` -- empirical CDF of the
      alignment deviations;
    * ``plotsdir/domain_alignment/align_vs_perf.png`` -- 50-bin 2-D
      histogram of alignment vs. performance deviation, skipping entries
      whose performance deviation is ``None``.

    The code that built the deviations file from the raw clusters is
    disabled; this function only consumes the precomputed file.
    """
    getter = DataGetter()
    # An earlier location was 'datadir/domain_alignment/deviations.json'.
    with open(getter.fmt_path('datadir/deviations.json'), 'r') as handle:
        records = json.load(handle)
    _domains, aln_devs, perf_devs = zip(*records)

    # Figure 1: CDF of the alignment deviations.
    cdf_fig, cdf_ax = plt.subplots(figsize=(6, 3.5))
    alignment_cdf = ECDF(aln_devs)
    cdf_ax.plot(list(alignment_cdf.x), list(alignment_cdf.y))
    cdf_ax.set_xlabel('distance from mean alignment')
    cdf_ax.set_ylabel('CDF')
    cdf_fig.savefig(getter.fmt_path('plotsdir/domain_alignment/alignment.png'))
    plt.close(cdf_fig)

    # Figure 2: alignment vs. performance, for entries that have both.
    heat_fig, heat_ax = plt.subplots(figsize=(4.5, 4.5))
    complete_pairs = [
        pair for pair in zip(aln_devs, perf_devs) if pair[1] is not None
    ]
    aln_kept, perf_kept = zip(*complete_pairs)
    counts, x_edges, y_edges = np.histogram2d(aln_kept, perf_kept, bins=50)
    bounds = [x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]]
    image = heat_ax.imshow(counts.T, extent=bounds, origin='lower',
                           cmap='Greys', aspect='auto')
    heat_fig.colorbar(image)
    heat_ax.set_xlabel('distance from mean alignment')
    heat_ax.set_ylabel('distance from mean performance')
    heat_fig.savefig(
        getter.fmt_path('plotsdir/domain_alignment/align_vs_perf.png'))
    plt.close(heat_fig)
def q2():
    """Return the empirical probability of ``dataframe.normal`` within mean ± std.

    Computed as F(mean + std) - F(mean - std) on the column's empirical CDF
    and rounded to three decimals with the built-in ``round``.
    """
    column = dataframe.normal
    # Empirical CDF.
    empirical_cdf = ECDF(column)
    # Mean and standard deviation give the interval ends.
    upper_bound = column.mean() + column.std()
    lower_bound = column.mean() - column.std()
    return round(empirical_cdf(upper_bound) - empirical_cdf(lower_bound), 3)
def func_ps_level(ha_open, ha_close, ha_bar_percent_level):
    """Find bar sizes at given ECDF levels, separately for up and down bars.

    Bar size is ``ha_close - ha_open``. Positive sizes and the magnitudes of
    negative sizes each get their own ECDF. For every requested level, the
    result is the last ECDF support point whose cumulative value does not
    exceed that level.

    Args:
        ha_open, ha_close: Arrays of open/close values.
        ha_bar_percent_level: Indexable sequence of ECDF levels.

    Returns:
        ``(positive_levels, negative_levels)``: two float arrays, one entry
        per requested level; the negative ones carry a minus sign.
    """
    bar_size = ha_close - ha_open
    up_sizes = bar_size[np.where(bar_size > 0)[0]]
    down_sizes = bar_size[np.where(bar_size < 0)[0]]

    up_ecdf = ECDF(up_sizes)
    down_ecdf = ECDF(-down_sizes)

    level_count = len(ha_bar_percent_level)
    ha_ps_positive_level = np.zeros(level_count)
    ha_ps_negative_level = np.zeros(level_count)
    for i in range(level_count):
        level = ha_bar_percent_level[i]
        # Index of the last step whose cumulative value is <= level.
        up_hit = np.where(up_ecdf.y <= level)[0][-1]
        ha_ps_positive_level[i] = up_ecdf.x[up_hit]
        down_hit = np.where(down_ecdf.y <= level)[0][-1]
        ha_ps_negative_level[i] = -down_ecdf.x[down_hit]
    return ha_ps_positive_level, ha_ps_negative_level
def q4():
    """Evaluate the ECDF of the standardized non-pulsar profile at normal quantiles.

    ``mean_profile`` of the ``stars`` rows with ``target == 0`` is
    standardized with its own ``mean()`` / ``std()``. The ECDF is evaluated
    at the standard-normal 0.80, 0.90 and 0.95 quantiles.

    Returns:
        A 3-tuple of ECDF values rounded to three decimals.
    """
    # Return the result of question 4 here.
    non_pulsar = stars[stars["target"] == 0]["mean_profile"]
    standardized = (non_pulsar - non_pulsar.mean()) / non_pulsar.std()
    empirical_cdf = ECDF(standardized)
    normal_quantiles = sct.norm.ppf([0.8, 0.9, 0.95])
    return tuple(empirical_cdf(normal_quantiles).round(3))
def q2():
    """Return the empirical probability of ``dataframe['normal']`` within mean ± std.

    The ECDF of the column is evaluated at mean + std and mean - std; the
    difference is rounded to three decimals and returned as a ``float``.
    """
    column = dataframe['normal']
    center, spread = column.mean(), column.std()
    empirical_cdf = ECDF(column)
    mass_within = empirical_cdf(center + spread) - empirical_cdf(center - spread)
    return float(round(mass_within, 3))
def build_edf_fr_vals(data):
    """Build the empirical distribution function of ``data``.

    ``data`` is flattened with ``ravel()`` and passed to ``ECDF``.

    Returns:
        ``(x, y)``: the ECDF's ``x`` and ``y`` arrays with their first entry
        dropped; ``y`` is additionally rounded to 8 decimals.
    """
    step_function = ECDF(data.ravel())
    support = step_function.x[1:]
    cumulative = np.round(step_function.y[1:], 8)
    return support, cumulative
def q4():
    """Evaluate the ECDF of the standardized non-pulsar profile at normal quantiles.

    ``stars['mean_profile']`` for rows where ``target`` is ``False`` is
    standardized with its mean and its ``ddof=0`` standard deviation. The
    ECDF is evaluated, one point at a time, at the standard-normal 0.80,
    0.90 and 0.95 quantiles.

    Returns:
        A 3-tuple of ECDF values rounded to three decimals.
    """
    non_pulsar = stars['mean_profile'][stars['target'] == False]
    standardized = (non_pulsar - non_pulsar.mean()) / non_pulsar.std(ddof=0)
    normal_quantiles = sct.norm.ppf([0.80, 0.90, 0.95])
    empirical_cdf = ECDF(standardized)
    return tuple(
        empirical_cdf(quantile).round(decimals=3)
        for quantile in normal_quantiles
    )