def fit_poissons(X, alpha=0.05, min_dist=0.2, min_zscore=1):
    """Fit a one- or two-component Poisson model to the samples in ``X``.

    Args:
        X: Sample values. Must support ``X - scalar`` (e.g. a NumPy array).
        alpha: Significance level used for the t-confidence interval of the
            mean and forwarded to ``one_or_two_mixtures``.
        min_dist: Forwarded unchanged to ``one_or_two_mixtures``.
        min_zscore: Forwarded to ``one_or_two_mixtures`` and used to place
            the second mean in the fallback fit.

    Returns:
        A dict with keys ``"n"`` and ``"coeffs"``.

        * If the sample mean is below 5: ``"n"`` is 1 and ``"coeffs"`` is a
          one-element list holding the upper confidence bound of the mean.
        * Otherwise ``"n"`` is 2 and ``"coeffs"`` is the result object
          returned by one of the ``fit_*`` helpers (it exposes ``.x`` and
          ``.cost``).
    """
    if np.mean(X) < 5:
        # Can't really form a good statistic
        meanbounds = sms.DescrStatsW(X).tconfint_mean(alpha=alpha)
        return {"n": 1, "coeffs": [meanbounds[1]]}

    shift = np.min(X) - 1  # Needed later to shift back
    log_x = np.log(X - shift)
    # Pass the caller's significance level through instead of a fixed 0.05.
    res = one_or_two_mixtures(log_x.tolist(),
                              alpha=alpha,
                              min_dist=min_dist,
                              min_zscore=min_zscore)

    if len(res["low_means"]) == 2:
        # Midpoint of each component's [low, high] mean interval, mapped
        # back from the shifted log domain.
        mean1 = np.exp(0.5 * (res["low_means"][0] + res["high_means"][0])) + shift
        mean2 = np.exp(0.5 * (res["low_means"][1] + res["high_means"][1])) + shift

        # Ratio of the component sizes; kept in its own name so it does not
        # overwrite the ``alpha`` significance level.
        size_ratio = res["n"][0] / res["n"][1]

        # Now optimize with estimates
        coeffs_fm = fit_poissons_fixed_means(X, mean1, mean2)
        print("Optimality fm = {}".format(coeffs_fm.cost))
        coeffs_2 = fit_data_two_poissons(X, [size_ratio, mean1, mean2])
        print("Optimality 2 = {}".format(coeffs_2.cost))
        coeffs_1 = fit_data_one_poisson(X, [np.mean(X)])
        print("Optimality 1 = {}".format(coeffs_1.cost))

        # Keep the cheaper of the two two-component fits.
        coeffs = coeffs_2 if coeffs_2.cost < coeffs_fm.cost else coeffs_fm

        # Accept it only if its cost is less than half the one-component cost.
        if coeffs.x[0] > 0.0 and 2 * coeffs.cost < coeffs_1.cost:
            return {"n": 2, "coeffs": coeffs}

    print("Only have one!")
    x_arr = np.array(X)
    mean1 = np.mean(x_arr)
    mean2 = mean1 + min_zscore * np.sqrt(mean1)
    # Re-estimate the first mean from the samples below the second mean.
    mean1 = np.mean(x_arr[x_arr < mean2 - np.sqrt(mean2) / 2.0])
    coeffs = fit_poissons_fixed_means(X, mean1, mean2)
    print("Alpha = {}".format(coeffs.x[0]))
    return {"n": 2, "coeffs": coeffs}
def main():
    """Write one CSV table per model parameter with per-monkey mean and CI.

    For every parameter label of the fitted model, a table is built with one
    row per monkey and, for each of the GAIN and LOSS conditions, the mean
    and the t-confidence interval of the fitted values. For parameters that
    have a neutral value, an extra column states whether that value lies
    inside the interval. Tables are saved to ``TABLE_FOLDER``.
    """
    a = analysis.run(force_fit=False, use_backup_file=True)

    neutral_values = {"distortion": 1, "risk_aversion": 0, "side_bias": 0}
    short_names = {"Havane": "Hav", "Gladys": "Gla"}

    # Havane and Gladys are listed first; the others keep their order.
    leading = ["Havane", "Gladys"]
    remaining = a.monkeys.copy()
    for name in leading:
        remaining.remove(name)
    ordered_monkeys = leading + remaining

    for p in a.class_model.param_labels:
        rows = []
        for m in ordered_monkeys:
            row = {"ID": short_names.get(m, m)}
            for cond in (GAIN, LOSS):
                x = a.cpt_fit[cond][m][p]
                mean = np.mean(x)
                ic = sms.DescrStatsW(x).tconfint_mean()
                label = cond.capitalize()
                row[f"{label} - Mean [CI]"] = \
                    f"{mean:.2f} [{ic[0]:.2f}, {ic[1]:.2f}]"
                if p in neutral_values:
                    inside = ic[0] <= neutral_values[p] <= ic[1]
                    row[f"{label} - Neutral"] = "Yes" if inside else "No"
            rows.append(row)

        table_path = os.path.join(TABLE_FOLDER, f"table_{p}.csv")
        pd.DataFrame(rows).to_csv(table_path, index=False)
def calculate_confidence_interval_for_weight_mean(dataset,
                                                  confidence_level: float):
    """Print and return the t-confidence interval of the mean of ``dataset``.

    A confidence interval is a range of values we are fairly sure the true
    value lies in; the degree of "fair surety" is the confidence level.
    The significance level (alpha, also the p-value threshold) and the
    confidence level sum to 1.

    Arguments:
        dataset -- sample values; must not contain NaN.
        confidence_level -- confidence level of the interval; must be
            greater than 0.8.

    Returns:
        The pair ``(lower_bound, upper_bound)``.
    """
    assert not np.isnan(dataset).any()
    assert confidence_level > 0.8

    significance = 1 - confidence_level
    bounds = sms.DescrStatsW(dataset).tconfint_mean(alpha=significance)
    ci_lower_bound, ci_upper_bound = bounds

    print('Assuming that the population is normally distributed, ', end='')
    report = 'C.I. with {}% confidence: [{}, {}]'.format(
        confidence_level * 100,
        round(ci_lower_bound, 10),
        round(ci_upper_bound, 10))
    print(report)
    return ci_lower_bound, ci_upper_bound
def confint_mean(var, alpha=0.05, alternative='two-sided'):
    '''
    Print and return a confidence interval for the mean of ``var``.

    :param var: sample values (e.g. a dataframe column)
    :param alpha: significance level
    :param alternative : h1 != val (two-sided)
                         h1 > val (larger)
                         h1 < val (smaller)
    :return: the interval as returned by statsmodels, ``(lower, upper)``
    '''
    s = smstats.DescrStatsW(var)

    # NOTE(review): ``s.std`` is truthy for every non-zero standard
    # deviation, so the z-interval is used for all non-constant data and the
    # t-interval only when the standard deviation is exactly 0. Confirm that
    # this is the intended selection rule.
    if s.std:
        ci = s.zconfint_mean(alpha, alternative)
    else:
        ci = s.tconfint_mean(alpha, alternative)

    print("{0}Confidence Interval - Compare Means{0}".format("=" * 5))
    print("=" * 50)
    # Report the mean value itself (the original put the bound method
    # ``var.mean`` into the table).
    print(pd.DataFrame({'Mean': [s.mean],
                        'Lower CI': [ci[0]],
                        'Upper CI': [ci[1]]}))
    print("=" * 50)
    return ci
def make_pdf_and_cdf_plot(z, outfile='histogram.png'):
    """Plot the histogram (pdf) and cumulative curve (cdf) of ``z``.

    The histogram uses 21 evenly spaced bin edges over ``RIRANGE``. On a
    twin axis the normalised cumulative sum is drawn, together with markers
    for mean +/- one standard deviation, the t-confidence interval of the
    mean, the mean and the median. The summary statistics are printed and
    the figure is saved to ``outfile``.

    Args:
        z: Sample values; must provide ``.mean()`` and ``.std()``
            (e.g. a NumPy array).
        outfile: Path of the image file to write.
    """
    _, ax1 = plt.subplots()
    nbins = 21
    bins = np.linspace(RIRANGE[0], RIRANGE[1], nbins)
    # ``density`` replaces the ``normed`` keyword, which current matplotlib
    # releases no longer accept.
    pdf, _, _ = ax1.hist(z, bins=bins, density=True, color=cmap(0.5))
    plt.suptitle('Hydrotrend: recurrence interval distribution', fontsize=20)
    ax1.set_ylim(0.0, 0.4)
    ax1.set_xlabel('RI [yr]', fontsize=18)
    ax1.set_ylabel('pdf', fontsize=18)

    # Cumulative sum of the bin heights, scaled so that it ends at 1.
    cdf = np.cumsum(pdf)
    cdf /= cdf.max()

    ax2 = ax1.twinx()
    ax2.plot(bins[:-1], cdf, color='b', lw=1.5)
    ax2.set_ylabel('cdf', fontsize=18)

    ri_median = np.median(z)
    ri_mean = z.mean()
    ri_stdv = z.std()
    ri_ci = sms.DescrStatsW(z).tconfint_mean()

    # All summary markers sit on one horizontal line near the top.
    top = ax2.get_ylim()[-1]
    ymrk = 0.95 * top
    ax2.plot([ri_mean - ri_stdv, ri_mean + ri_stdv], [ymrk, ymrk],
             color=cmap(0.5), lw=0.75)
    ax2.plot(ri_ci, [ymrk, ymrk], '|', color=cmap(0.5), ms=10, mew=1)
    ax2.plot(ri_mean, ymrk, 's', color=cmap(0.5), ms=5)
    ax2.plot(ri_median, ymrk, 'D', color=cmap(0.5), ms=5)

    print('mean = {}'.format(ri_mean))
    print('median = {}'.format(ri_median))
    print('std = {}'.format(ri_stdv))
    print('ci = {}'.format(ri_ci))

    plt.savefig(outfile, dpi=150)
    plt.close()
def conn_highvariance(allcovdata):
    """Collect, over all subjects, the windows with unusually high connectivity.

    For each subject:
      - compute the average connectivity (mean over all edges) of every window,
      - compute the t-confidence interval of the mean of these averages,
      - keep the windows whose average lies above the upper bound.

    Parameters
    ----------
    allcovdata : array-like
        Connectivity matrices of all subjects,
        shape = (Subjects x NWindows x Mfeatures x Mfeatures).

    Returns
    ----------
    mtd_allsubj_highvar : ndarray
        The selected windows of all subjects stacked along the first axis,
        shape = (Windows x Mfeatures x Mfeatures).
    """
    mtd_allsubj_highvar = []
    for curcov in allcovdata:
        curcov = np.asarray(curcov)
        # Average connectivity of each window of this subject.
        window_means = np.asarray([window.mean() for window in curcov])
        _, high = sms.DescrStatsW(window_means).tconfint_mean()
        # A boolean mask keeps the result 3-D; indexing with the
        # (k, 1)-shaped output of np.argwhere would insert an extra axis.
        mtd_allsubj_highvar.append(curcov[window_means > high])
    return np.vstack(mtd_allsubj_highvar)
def evaluate(dataset, predictions):
    """Score ``predictions`` against ``dataset`` with exact match and F1.

    Args:
        dataset: Iterable of articles; each article has ``'paragraphs'``,
            each paragraph has ``'qas'``, and each qa has ``'id'`` and
            ``'answers'`` (a list of dicts with a ``'text'`` entry).
        predictions: Mapping from question id to predicted answer.

    Returns:
        A dict with ``'exact_match'`` and ``'f1'`` (percentages over all
        questions) and ``'exact_match_ci'`` / ``'f1_ci'`` (lower and upper
        bounds, also in percent).
    """
    exact_match = total = 0
    f1_scores = []
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                        ' will receive score 0.'
                    print(message, file=sys.stderr)
                    # Record the announced score of 0 so the F1 interval is
                    # computed over the same questions as the F1 mean.
                    f1_scores.append(0.0)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1_scores.append(
                    metric_max_over_ground_truths(f1_score, prediction,
                                                  ground_truths))

    # Exact-match is binary, so use binomial CI's
    exact_match_mean = 100.0 * exact_match / total
    lower, upper = sms.proportion_confint(exact_match, total,
                                          alpha=0.05, method="beta")
    exact_match_ci = (100.0 * lower, 100.0 * upper)

    # F1 scores are continuous in [0, 1]. We could use fancy bounded random
    # variable CI, but for now, we settle for the normal approximation.
    f1_mean = 100.0 * sum(f1_scores) / total
    lower, upper = sms.DescrStatsW(f1_scores).tconfint_mean()
    f1_ci = (100.0 * lower, 100.0 * upper)

    return {
        'exact_match': exact_match_mean,
        'exact_match_ci': exact_match_ci,
        'f1': f1_mean,
        'f1_ci': f1_ci
    }
def getSpikeStats(data, groups):
    """Compute per-group summary statistics of ``data``.

    Args:
        data: Array of values, indexable with a boolean mask.
        groups: Array of group labels, one per entry of ``data``.

    Returns:
        A dict of arrays with one entry per unique group, ordered like
        ``np.unique(groups)``: ``'mean'``, ``'sem'`` (standard error of the
        mean), ``'conf_Int'`` (lower and upper t-confidence bound),
        ``'N'`` (number of observations) and ``'MWUz'`` (first value
        returned by ``getMWUz`` for the group versus all other samples).
    """
    groupIDs = np.unique(groups).astype(int)
    nGroups = len(groupIDs)

    stats = {
        'mean': np.zeros(nGroups),
        'sem': np.zeros(nGroups),
        'conf_Int': np.zeros((nGroups, 2)),
        'N': np.zeros(nGroups),
        'MWUz': np.zeros(nGroups),
    }

    # Index the result arrays by position, not by group ID, so that labels
    # that are not exactly 0..nGroups-1 do not go out of bounds or land in
    # the wrong slot. For labels 0..nGroups-1 the layout is unchanged.
    for row, group_id in enumerate(groupIDs):
        g1 = groups == group_id
        g2 = groups != group_id
        x_stats = sms.DescrStatsW(data[g1])
        stats['mean'][row] = x_stats.mean
        stats['sem'][row] = x_stats.std_mean
        stats['N'][row] = x_stats.nobs
        stats['conf_Int'][row] = x_stats.tconfint_mean()
        stats['MWUz'][row], _, _ = getMWUz(data, g1, g2)
    return stats
def main():
    """Run per-column one-sample t-tests on ``FILE`` and save a summary CSV.

    For every column of the input table the mean, sample standard deviation,
    one-sided lower confidence bound, Cohen's d, t statistic, raw p-values
    and multiple-comparison-corrected p-values are computed. Columns whose
    p-value is NaN or whose name contains an entry of ``EXCLUDE`` are
    dropped. The result is sorted by the one-tailed p-value and written to
    ``OUTCSV``.
    """
    df = pd.read_csv(FILE, index_col=0)

    # SD, CI, effect size
    desc = sms.DescrStatsW(df)
    dsim_values = desc.mean
    sd = desc.std_ddof(1)
    lower_ci, _ = desc.tconfint_mean(alternative='larger')
    cohens_d = dsim_values / sd

    # t test
    raw_tstats = ttest_1samp(df, popmean=0, axis=0)

    not_excluded = np.array(
        [not any(ex in c for ex in EXCLUDE) for c in df.columns], dtype=bool)
    column_bools = ~np.isnan(raw_tstats.pvalue) & not_excluded

    # Halving the two-tailed p gives the one-tailed p.
    one_tailed_p = raw_tstats.pvalue[column_bools] / 2
    all_tstats = [
        np.array(dsim_values[column_bools]),
        sd[column_bools],
        lower_ci[column_bools],
        cohens_d[column_bools],
        raw_tstats.statistic[column_bools],
        raw_tstats.pvalue[column_bools],
        one_tailed_p,
    ]

    # multiple comparison correction
    multi_corrections = ['fdr_bh']
    for method in multi_corrections:
        # Correct p-values, not the effect sizes that sat at index 3 of the
        # list. NOTE(review): the one-tailed p-values are assumed to be the
        # intended input (they match the one-sided confidence bound and the
        # sort key) - confirm.
        corrected = multipletests(one_tailed_p, alpha=ALPHA, method=method)
        all_tstats.append(corrected[1])

    row_labels = [
        'delta_sim', 'sd', 'lower_ci', 'cohens_d', 'tstats', 'raw_p_2tailed',
        'raw_p_1tailed'
    ] + ['p_' + m for m in multi_corrections]
    p_df = pd.DataFrame(all_tstats,
                        columns=df.columns[column_bools],
                        index=row_labels)
    p_df = p_df.T.sort_values('raw_p_1tailed')
    p_df.to_csv(OUTCSV)
    print('Output to: ' + OUTCSV)
def add_data(frame, group, key, algs, opt_gat, domain_path):
    """Append mean and CI error columns for one algorithm to ``frame``.

    For every action duration, the goal achievement time of each domain seed
    is divided by the matching entry of ``opt_gat``. The mean of these
    ratios and its distances to the lower and upper t-confidence bound are
    stored in the columns ``key``, ``key + "_lerror"`` and
    ``key + "_rerror"``.

    Args:
        frame: DataFrame the new columns are concatenated to.
        group: Data convertible to a DataFrame with the columns
            ``actionDuration``, ``domainSeed`` and ``goalAchievementTime``.
        key: Name of the new mean column; also appended to ``algs``.
        algs: List of keys, extended in place.
        opt_gat: Mapping from ``(actionDuration, domain_path, domainSeed)``
            to the value each goal achievement time is divided by.
        domain_path: Middle component of the ``opt_gat`` lookup key.

    Returns:
        ``frame`` with the three new columns concatenated column-wise.
    """
    new_frame = DataFrame()
    means = []
    bounds = []
    # Group by a plain column name so the group key is a scalar on every
    # pandas version (a one-element list yields tuple keys on pandas 2.x,
    # which would break the opt_gat lookup).
    for duration, action_group in DataFrame(group).groupby('actionDuration'):
        ratios = []
        for seed, seed_group in action_group.groupby('domainSeed'):
            cur_opt_gat = opt_gat[(duration, domain_path, seed)]
            # Take the value of this seed; reading from action_group would
            # reuse the first row of the whole duration group for each seed.
            actual_gat = seed_group['goalAchievementTime'].iloc[0]
            ratios.append(actual_gat / cur_opt_gat)
        bound = sms.DescrStatsW(ratios).tconfint_mean()
        mean = statistics.mean(ratios)
        bounds.append((abs(mean - bound[0]), abs(mean - bound[1])))
        means.append(mean)

    new_frame[key] = means
    new_frame[key + "_" + "lerror"] = [b[0] for b in bounds]
    new_frame[key + "_" + "rerror"] = [b[1] for b in bounds]
    algs.append(key)
    return pd.concat([frame, new_frame], axis=1)
# Recorded output of the pairwise comparison of mean item prices:
# 489756 & 326584: the mean prices differ statistically significantly!
# 489756 & 675201: the mean prices differ statistically significantly!
# 489756 & 201436: the mean prices differ statistically significantly!
# 361254 & 874521: the mean prices do not differ statistically significantly!
# 361254 & 326584: the mean prices differ statistically significantly!
# 361254 & 675201: the mean prices do not differ statistically significantly!
# 361254 & 201436: the mean prices do not differ statistically significantly!
# 874521 & 326584: the mean prices differ statistically significantly!
# 874521 & 675201: the mean prices do not differ statistically significantly!
# 874521 & 201436: the mean prices do not differ statistically significantly!
# 326584 & 675201: the mean prices differ statistically significantly!
# 326584 & 201436: the mean prices differ statistically significantly!
# 675201 & 201436: the mean prices do not differ statistically significantly!

# Confidence interval
sms.DescrStatsW(df["price"]).tconfint_mean()
# (38.334045331672925, 39.216612629527816)

# Among the items with category_id in
# {201436, 326584, 361254, 489756, 675201, 874521},
# the prices of items 489756, 326584 and 675201
# should lie within the confidence interval.

# Simulation
# Minimum income
minfreq = len(df[df['price'] >= 35.693170])
income_min = minfreq * 35.6931
income_min

# Mean income: uses the purchase count at the mean price (the original
# multiplied the minimum-price count, leaving meanfreq unused).
meanfreq = len(df[df['price'] >= 37.443592])
income_mean = meanfreq * 37.4435
income_mean
#missing for media opti frames['OPTI_f4d']['cctrust_media'] = np.nan vizframes = {} # make a set with all items to visualise -->data for cntry in cntrs: vizframes[cntry] = frames[cntry][vizitems].astype('float64') vizframes[cntry]['country'] = cntry print(vizframes[cntry]) data = pd.concat(vizframes.values(), ignore_index=True) # make a set with totals and 95 cis totals_m = data.groupby('country').agg([ 'mean', 'count', 'sem', lambda lb: sms.DescrStatsW(lb.dropna()).tconfint_mean(alpha=0.05)[0], lambda ub: sms.DescrStatsW(ub.dropna()).tconfint_mean(alpha=0.05)[1] ]) # add a better column label totals_m.columns = totals_m.columns.set_levels( ['mean', 'count', 'sem', 'lowerbound', 'upperbound'], level=1) # sortorder items/overall totals (by mean) totals_institutions = data[vizitems].agg(['mean', 'count', 'sem' ]).T.sort_values(by='mean', ascending=False) totals_countries = data.groupby('country').mean().mean(axis=1).sort_values( ascending=True) # labels for countries newlabels = dict( zip(list(totals_m.index),
def t_distribution_ci(df,
                      metric='post_sales_temp',
                      control='Control',
                      test='Test_1',
                      test_flag='test_flag',
                      alpha=0.05):
    """Compare the mean of ``metric`` between a control and a test group.

    Args:
        df: DataFrame with the columns ``metric`` and ``test_flag``.
        metric: Name of the numeric column to compare.
        control: Value of ``test_flag`` marking the control group.
        test: Value of ``test_flag`` marking the test group.
        test_flag: Name of the column holding the group labels.
        alpha: Significance threshold applied to the Welch t-test p-value.

    Returns:
        A one-row DataFrame (indexed by ``metric``) with the group means,
        ``pct_lft`` (percentage lift of test over control), ``conf_int_lb``
        and ``conf_int_ub`` (interval of the mean difference divided by the
        control mean), ``p-value`` and a ``sigificance`` label.
    """
    signi = []

    # Work on copies so that the dtype conversion below neither writes into
    # a view of ``df`` nor triggers a SettingWithCopyWarning.
    test_data_A = df[df[test_flag] == control].copy()
    test_data_B = df[df[test_flag] == test].copy()
    test_data_A[metric] = test_data_A[metric].astype('float')
    test_data_B[metric] = test_data_B[metric].astype('float')

    # Outlier trimming is currently disabled; the quantiles are only printed.
    print(test_data_A[metric].quantile(.995))
    test_data_A_clean = test_data_A
    print(test_data_B[metric].quantile(.995))
    test_data_B_clean = test_data_B

    # Combine the cleaned data sets as one (pd.concat replaces
    # DataFrame.append, which no longer exists in pandas 2.x).
    test_data_clean = pd.concat([test_data_A_clean, test_data_B_clean])

    # Summarize the metric: group means, transposed to one row per metric
    test_summary2 = test_data_clean.groupby(test_flag).agg({metric: 'mean'}).T

    # Add the (still empty) test statistic columns
    test_stats = pd.DataFrame(
        columns=['pct_lft', 'conf_int_lb', 'conf_int_ub', 'p-value'])
    test_summary2 = pd.concat([test_summary2, test_stats],
                              axis=1,
                              ignore_index=False,
                              sort=False)

    # Calculate pct_lift of the test group over the control group
    test_summary2['pct_lft'] = (test_summary2[test] - test_summary2[control]
                                ) / test_summary2[control] * 100

    values_A = test_data_A_clean[metric][test_data_A_clean[metric].notnull()]
    values_B = test_data_B_clean[metric][test_data_B_clean[metric].notnull()]

    cm = sms.CompareMeans(sms.DescrStatsW(values_A), sms.DescrStatsW(values_B))
    # NOTE(review): the interval uses a fixed alpha of 0.10, independent of
    # the ``alpha`` argument - confirm this is intended.
    lb, rb = cm.tconfint_diff(usevar='unequal',
                              alternative='two-sided',
                              alpha=0.10)
    # The interval is for (control - test); negate and swap the bounds to
    # express it as (test - control) relative to the control mean.
    control_mean = test_data_A_clean[metric].mean()
    test_summary2['conf_int_lb'] = (rb * -1) / control_mean
    test_summary2['conf_int_ub'] = (lb * -1) / control_mean

    t_stat, test_summary2['p-value'] = sc.ttest_ind(values_A,
                                                    values_B,
                                                    equal_var=False)

    p_val = test_summary2['p-value'].iloc[0]
    lift = test_summary2['pct_lft'].iloc[0]
    if p_val < alpha and lift > 0:
        signi.append('Significant with lift')
    elif p_val < alpha and lift < 0:
        signi.append('Significanct ,control performance better than test')
    elif p_val > alpha and lift < 0:
        signi.append('Not significanct with negative lift')
    elif p_val > alpha and lift > 0:
        signi.append('Not significant with positive lift')
    else:
        # Reached when the p-value equals alpha, the lift is exactly 0,
        # or either value is NaN.
        signi.append('Nothing')
    print(signi)

    test_summary2['sigificance'] = signi
    return test_summary2
def run_benchmark(benchmark, max_runs, timeout_hours, output_file, min_runs=3):
    """Run ``benchmark`` repeatedly and append its confidence intervals to a file.

    The benchmark is prepared once and run ``min_runs`` times. It is then
    re-run while some metric's 95% confidence interval, rounded to two
    significant digits, still has distinct bounds, until that metric has
    ``max_runs`` samples or ``timeout_hours`` have elapsed.

    Args:
        benchmark: Object providing ``prepare()``, ``run()`` (returning a
            mapping from metric name to value) and ``describe()``.
        max_runs: Sample count at which a metric no longer triggers re-runs.
        timeout_hours: Time budget, in hours, for the re-run phase.
        output_file: Path of the file one JSON line is appended to.
        min_runs: Number of unconditional initial runs.
    """
    samples_by_metric = defaultdict(list)

    def run_benchmark_once():
        print('Running benchmark... ', end='', flush=True)
        outcome = benchmark.run()
        print(outcome)
        for metric, measurement in outcome.items():
            samples_by_metric[metric].append(measurement)

    def intervals_for(samples):
        # Raw 95% interval and the same interval rounded to 2 digits.
        raw = stats.DescrStatsW(samples).tconfint_mean(0.05)
        rounded = (round_to_significant_digits(raw[0], 2),
                   round_to_significant_digits(raw[1], 2))
        return raw, rounded

    print('Preparing for benchmark... ', end='', flush=True)
    benchmark.prepare()
    print('Done.')

    start_time = timer()

    # Run at least min_runs times
    for _ in range(min_runs):
        run_benchmark_once()

    # Then consider running a few more times to get the desired precision.
    tolerance = numpy.finfo(float).eps * 10
    while True:
        if timer() - start_time > timeout_hours * 3600:
            print(
                "Warning: timed out, couldn't determine a result with the desired precision."
            )
            break

        needs_another_run = False
        for metric, samples in samples_by_metric.items():
            first = samples[0]
            if all(sample == first for sample in samples):
                # If all results are exactly the same the code below
                # misbehaves. We don't need to run again in this case.
                continue
            raw, rounded = intervals_for(samples)
            if abs(rounded[0] - rounded[1]) <= tolerance:
                continue
            if len(samples) < max_runs:
                print(
                    "Running again to get more precision on the metric %s. Current confidence interval: [%.3g, %.3g]"
                    % (metric, raw[0], raw[1]))
                needs_another_run = True
                break
            print(
                "Warning: couldn't determine a precise result for the metric %s. Confidence interval: [%.3g, %.3g]"
                % (metric, raw[0], raw[1]))

        if not needs_another_run:
            # We've reached sufficient precision in all metrics, or we've
            # reached the max number of runs.
            break
        run_benchmark_once()

    # We've reached the desired precision in all dimensions or reached the
    # maximum number of runs. Record the results.
    rounded_confidence_intervals_by_dimension = {}
    confidence_intervals_by_dimension = {}
    for metric, samples in samples_by_metric.items():
        raw, rounded = intervals_for(samples)
        rounded_confidence_intervals_by_dimension[metric] = rounded
        confidence_intervals_by_dimension[metric] = (raw, rounded)

    with open(output_file, 'a') as f:
        record = {
            "benchmark": benchmark.describe(),
            "results": confidence_intervals_by_dimension
        }
        json.dump(record, f)
        print(file=f)

    print('Benchmark finished. Result: ',
          rounded_confidence_intervals_by_dimension)
    print()
def plot_hvi(parameters_file, output_hvi_file_name, list_of_dirs):
    """
    Plot the hypervolume indicator (HVI) results of the design space exploration.
    In this plot specifically we plot the HVI of HyperMapper's DSE against the HVI of a competing approach.
    On the x axis we plot time in seconds and on the y axis the HVI.
    HVI to be computed needs a real Pareto or at least a Pareto that is the best found by the results concatenation of
    HyperMapper and the competing approach.

    Inputs used by this function:
      1) a reference hypervolume, computed here from the concatenation of all the samples found in all the input
         directories (i.e. the best Pareto found by all the compared approaches combined);
      2) for each directory, the csv files containing all the samples of each exploration (not only the Pareto).
         From these files the Pareto at time t, and then the HVI at time t, is computed.

    :param parameters_file: path of the .json configuration file.
    :param output_hvi_file_name: name of the file the line plot is written to.
    :param list_of_dirs: directories to compare; each holds the csv files of the repetitions of one approach.
    :raises ImportError: if statsmodels is not installed.
    """
    try:
        import statsmodels.stats.api as sms
    except ImportError as exc:
        # TODO: Long-term: move this import to the top.
        # Raise the error here; otherwise the function would only fail much
        # later with a NameError at the first use of ``sms``.
        raise ImportError(
            "Failed to import statsmodels. Statsmodels is required for plot_hvi."
        ) from exc

    xlabel = "Time (sec)"
    ylabel = "HyperVolume Indicator (HVI)"
    number_of_bins = 20

    file_extension = os.path.splitext(parameters_file)[1]
    if file_extension != ".json":
        print(
            "Error: invalid file name. \nThe input file has to be a .json file not a %s"
            % file_extension
        )
        exit(1)
    with open(parameters_file, "r") as f:
        config = json.load(f)

    schema = json.load(resource_stream("hypermapper", "schema.json"))
    DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)
    DefaultValidatingDraft4Validator(schema).validate(config)

    if "application_name" in config:
        application_name = config["application_name"]
    else:
        application_name = ""

    print("########## plot_hvi.py #########################")
    print("### Parameters file is %s" % parameters_file)
    print("### Application name is %s" % application_name)
    print("### The input directories data are %s" % str(list_of_dirs))
    print("################################################")

    param_space = space.Space(config)
    optimization_metrics = param_space.get_optimization_parameters()

    ###################################################################################################################
    ########### Compute the hypervolume of all the input files concatenated as a reference for the HVI metric.
    ###################################################################################################################
    input_files = {}

    # y_data_mean is dict on the directories that for each entry in the dict contains the mean of each point x over
    # multiple file repetitions in one directory; lower and upper are for the confidence interval.
    y_data_mean = defaultdict(list)
    y_data_median = defaultdict(list)
    y_data_min = defaultdict(list)
    y_data_max = defaultdict(list)
    y_data_lower = defaultdict(list)
    y_data_upper = defaultdict(list)
    bin_array_X = {}
    number_or_runs_in_bins = {}

    for directory in list_of_dirs:
        input_files[directory] = [
            f for f in listdir(directory) if isfile(join(directory, f))
        ]

    # Keep only the csv files of each directory.
    for directory in list_of_dirs:
        csv_files = []
        for file_name in input_files[directory]:
            if os.path.splitext(file_name)[1] != ".csv":
                print(
                    "Warning: file %s is not a csv file, it will not be considered in the HVI plot. "
                    % file_name
                )
            else:
                csv_files.append(file_name)
        input_files[directory] = csv_files

    for directory in list_of_dirs:
        if len(input_files[directory]) == 0:
            print(
                "Warning: directory %s is empty, it will not be considered in the HVI plot."
                % directory
            )
            del input_files[directory]

    if len(input_files) == 0:
        print("Error: there no input files to compute the HVI.")

    print("The files used as a input are: ")
    for i, directory in enumerate(input_files.keys()):
        print(
            "Directory "
            + str(i)
            + ": "
            + directory
            + ", # of files: "
            + str(len(input_files[directory]))
            + ", list of files: "
            + str(input_files[directory])
        )

    all_data_files = []
    for directory in input_files.keys():
        for file_name in input_files[directory]:
            all_data_files += [directory + "/" + file_name]

    selection_keys = (
        param_space.get_output_parameters() + param_space.get_timestamp_parameter()
    )
    feasible_flag = True if (param_space.get_feasible_parameter() != [None]) else False
    concatenated_all_data_array = param_space.load_data_files(
        all_data_files, selection_keys_list=selection_keys, only_valid=feasible_flag
    )

    if len(next(iter(concatenated_all_data_array.values()))) == 0:
        return return_empty_images(
            application_name,
            input_files,
            number_of_bins,
            output_hvi_file_name,
            xlabel,
            ylabel,
        )

    bounds = {}
    max_point = []
    standard_deviation_optimization_metrics = []
    max_min_difference = []

    # Get bounds of objective space
    for metric in optimization_metrics:
        X = np.array(concatenated_all_data_array[metric])
        standard_deviation = np.std(X, axis=0)
        standard_deviation_optimization_metrics.append(standard_deviation)
        # Not in place: an in-place true division raises for integer arrays.
        X = X / standard_deviation
        concatenated_all_data_array[metric] = X
        bounds[metric] = (
            min(concatenated_all_data_array[metric]),
            max(concatenated_all_data_array[metric]),
        )
        max_point.append(bounds[metric][1])
        max_min_difference.append(bounds[metric][1] - bounds[metric][0])
        print(
            "(min, max) = (%f, %f) for the metric %s. This is to compute the hypervolume."
            % (bounds[metric][0], bounds[metric][1], metric)
        )

    total_volume = prod(max_min_difference)
    list_of_objectives = [
        concatenated_all_data_array[objective]
        for objective in param_space.get_optimization_parameters()
    ]
    reformatted_all_data = list(zip(*list_of_objectives))

    # Get dominated hypervolume for Pareto of all data observed
    hv_all_data = H(reformatted_all_data, max_point)
    print("The hypervolume of all the files concatenated: %d" % hv_all_data)

    ###################################################################################################################
    ########### Compute the HVI for each directory.
    ###################################################################################################################
    hvi = {}
    for directory in input_files:
        print("Compute HVI for %s" % directory)
        convert_in_seconds = 1000.0
        (
            hvi[directory],
            bin_array_X[directory],
            number_or_runs_in_bins[directory],
        ) = compute_hvi(
            standard_deviation_optimization_metrics,
            input_files[directory],
            directory,
            total_volume,
            max_point,
            hv_all_data,
            param_space,
            convert_in_seconds,
            number_of_bins,
        )

        # Round the floating point numbers to 1 decimal for clarity of visualization.
        bin_array_X[directory] = [round(float(i), 1) for i in bin_array_X[directory]]
        for file_name in hvi[directory]:
            for bin_id in hvi[directory][file_name]:
                hvi[directory][file_name][bin_id] = round(
                    float(hvi[directory][file_name][bin_id]), 1
                )

    ###################################################################################################################
    ########### Plot all the HVIs (using box plots bin_array_X and hvi)
    ###################################################################################################################
    for directory in input_files:
        hvi_list_of_lists = []
        each_bin = defaultdict(list)
        for file_name in hvi[directory]:
            for bin_id in hvi[directory][file_name]:
                each_bin[bin_id].append(hvi[directory][file_name][bin_id])

        # The bin order is taken from the last file iterated above
        # (``file_name`` still refers to it after the loop).
        for bin_id in hvi[directory][file_name]:
            # This is a list of bins and for each bin there is a list of hvi values for each file in that directory.
            hvi_list_of_lists.append(each_bin[bin_id])

        # Print boxplot (one figure per directory).
        boxplot(
            bin_array_X[directory],
            hvi_list_of_lists,
            application_name,
            number_of_bins,
            xlabel,
            ylabel,
            str(directory + "/" + os.path.basename(directory) + "_boxplot" + ".pdf"),
        )

        # Print lineplot (only one figure comparing all the directories).
        for hvi_list in hvi_list_of_lists:
            hvi_list_array = np.array(hvi_list)
            y_data_mean[directory].append(hvi_list_array.mean())
            y_data_median[directory].append(np.median(hvi_list_array))
            y_data_min[directory].append(np.min(hvi_list_array))
            y_data_max[directory].append(np.max(hvi_list_array))

            low, up = sms.DescrStatsW(hvi_list_array).tconfint_mean()
            y_data_lower[directory].append(low)
            y_data_upper[directory].append(up)

        # Clamp negative confidence bounds to 0; NaN bounds are left as is.
        for bin_number, bin_value in enumerate(y_data_lower[directory]):
            if not math.isnan(bin_value) and bin_value < 0:
                y_data_lower[directory][bin_number] = 0
        for bin_number, bin_value in enumerate(y_data_upper[directory]):
            if not math.isnan(bin_value) and bin_value < 0:
                y_data_upper[directory][bin_number] = 0

        print_stats_on_a_txt(
            directory,
            str(directory + "/" + os.path.basename(directory) + "_stats" + ".txt"),
            bin_array_X,
            number_or_runs_in_bins,
            y_data_mean,
            y_data_median,
            y_data_min,
            y_data_max,
            y_data_lower,
            y_data_upper,
        )

    # Call the function to create plot
    lineplotCI(
        input_files,
        application_name,
        x_data=bin_array_X,
        y_data=y_data_mean,
        low_CI=y_data_lower,
        upper_CI=y_data_upper,
        xlabel=xlabel,
        ylabel=ylabel,
        title="Line plot with 95% confidence intervals",
        output_filename=output_hvi_file_name,
    )
df_control_group = data_control.copy()

# First rows of each group
df_testing_group.head()
df_control_group.head()

# Shape of each group (the original inspected the control group twice)
df_testing_group.shape
df_control_group.shape

# Check for missing values
df_testing_group.isnull().sum()
df_control_group.isnull().sum()

# Confidence Interval
# Testing group
sms.DescrStatsW(df_testing_group["Purchase"]).tconfint_mean()
# Control group
sms.DescrStatsW(df_control_group["Purchase"]).tconfint_mean()

############################
# Testing of Assumptions
############################

# Assumption of normality
# H0: The normality assumption holds.
# H1: The normality assumption does not hold.
test_stat, pvalue = shapiro(df_testing_group["Purchase"])
print('Test statistics = %.4f, p-value = %.4f' % (test_stat, pvalue))
def believeCase():
    """Return the mean of the "Hava Sıcaklığı ( °C )" column of the dataset.

    The t-confidence interval of the column mean is also computed, but its
    value is not used or returned.
    """
    temperature = readDatasetNinformantions()["Hava Sıcaklığı ( °C )"]
    sms.DescrStatsW(temperature).tconfint_mean()
    return temperature.mean()
def sim_stats(series):
    """Return the mean of ``series`` and the half-width of its 95% CI.

    Returns:
        A dict with ``'mean'`` (the series mean) and ``'ci'`` (the mean
        minus the lower bound of the t-confidence interval at alpha 0.05).
    """
    center = series.mean()
    lower_bound = sms.DescrStatsW(series).tconfint_mean(0.05)[0]
    return {'mean': center, 'ci': center - lower_bound}
def two_population(a, b, alpha=.05, consistency='equal', option='right',
                   show_table=False, stages=(1, 2, 3), show=True,
                   precision=4, matched_pairs=False):
    """Compare two samples with an F test, a t test and a confidence interval.

    + [First stage]: F Statistics - consistency: equal, left (1 is more
      consistent than 2), right (2 is more consistent than 1)
    + [Second stage]: t Test
    + [Third stage]: Confidence Interval

    Will return a result_dict regardless of stages.

    Parameters
    ----------
    a, b : sequence
        The two samples. With ``matched_pairs=True`` they are subtracted
        element-wise (``a - b``), so they must support array arithmetic.
    alpha : float
        Significance level used for critical values, decisions and the
        confidence interval.
    consistency : str
        Only the first letter (lower-cased) is used: ``'e'`` runs a
        two-sided F test, ``'r'`` an upper-tail test, anything else a
        lower-tail test.
    option : str
        Only the first letter (lower-cased) is used: ``'r'`` right tail,
        ``'l'`` left tail, ``'t'`` two tails.
    show_table : bool
        Append the statsmodels summary table to the report (independent
        samples with stage 3 only).
    stages : iterable of int
        Stages to run. Stage 3 is only evaluated as part of stage 2.
    show : bool
        Print the textual report.
    precision : int
        Number of decimals used in the report.
    matched_pairs : bool
        Run a paired t test on ``a - b`` instead of an independent one.

    Returns
    -------
    dict
        The numeric results collected by the stages that were run.

    Raises
    ------
    ValueError
        If stage 2 is requested for independent samples without stage 1;
        the choice between the pooled and the unequal-variance t test is
        taken from the F test decision.
    """
    opt = option.lower()[0]
    results = ""
    const = consistency.lower()[0]
    result_dict = dict()
    df_1 = len(a) - 1
    df_2 = len(b) - 1

    if 1 in stages:
        varall = [stats.describe(a).variance, stats.describe(b).variance]
        f_value = varall[0] / varall[1]
        result_dict['varall'] = varall
        result_dict['f_value'] = f_value

        ptmp = stats.f.cdf(f_value, df_1, df_2)

        if const == 'e':
            # Two-sided test: double the smaller tail probability.
            if ptmp > 0.5:
                ptmp = 1 - ptmp
            p_value = ptmp * 2
            rej_upper = stats.f.ppf(1 - alpha / 2, df_1, df_2)
            rej_lower = stats.f.ppf(alpha / 2, df_1, df_2)
            result_dict['f_rej_upper'] = rej_upper
            result_dict['f_rej_lower'] = rej_lower
            flag = bool(f_value < rej_lower or f_value > rej_upper)
            text = 'unequal variances'
        else:
            rej_upper = stats.f.ppf(1 - alpha, df_1, df_2)
            rej_lower = stats.f.ppf(alpha, df_1, df_2)
            if const == 'r':
                result_dict['f_rej_upper'] = rej_upper
                p_value = 1 - ptmp
                flag = bool(f_value > rej_upper)
                text = 'σ_1/σ_2 > 1'
            else:
                result_dict['f_rej_lower'] = rej_lower
                p_value = ptmp
                flag = bool(f_value < rej_lower)
                text = 'σ_1/σ_2 < 1'

        result_dict['p_value'] = p_value

        results = f"""           F Statistics
===================================
F statistic = {f_value:.{precision}f}
p-value = {p_value:.{precision}f} ({inter_p_value(p_value)})
Reject H_0 ({text}) → {flag}
"""

    if 2 in stages:
        if matched_pairs:
            samp_diff = a - b
            nobs = samp_diff.shape[0]
            df = nobs - 1

            tmpdesc = stats.describe(samp_diff)
            t_value = tmpdesc.mean / (tmpdesc.variance**0.5) * (nobs**0.5)

            # p-values
            ptmp = stats.t.cdf(t_value, df)
            if opt == 'r':
                text = 'one-tail'
                tcv = stats.t.ppf(1 - alpha, df=df)
                p_value = 1 - ptmp
            elif opt == 'l':
                text = 'one-tail'
                p_value = ptmp
                tcv = stats.t.ppf(alpha, df=df)
            else:
                text = 'two-tail'
                tcv = stats.t.ppf(1 - alpha / 2, df=df)
                if ptmp > 0.5:
                    ptmp = 1 - ptmp
                p_value = ptmp * 2

            flag = p_value < alpha
            results += f"""              t Test
===================================
t (Observed value) = {t_value:.{precision}f}
p-value ({text}) = {p_value:.{precision}f} ({inter_p_value(p_value)})
t (Critical, ({text})) = {tcv:.{precision}f}
DF = {(df):.{precision}f}
Reject H_0 → {flag}
"""
            result_dict['t_p_value'] = p_value
            result_dict['t_critical_value'] = tcv
            result_dict['t_observed_value'] = t_value

            # The paired interval is always two-sided and is reported even
            # when stage 3 was not requested.
            t_alpha = stats.t.ppf(1 - alpha / 2, df)
            std_xbar = (tmpdesc.variance / nobs)**0.5
            LCL = tmpdesc.mean - t_alpha * std_xbar
            UCL = tmpdesc.mean + t_alpha * std_xbar
            con_coef = 1 - alpha
            conf_interval = [LCL, UCL]
            result_dict['conf_interval'] = conf_interval
            results += f"""        Confidence Interval
===================================
{con_coef * 100:.1f}% Confidence Interval: [{LCL:.{precision}f}, {UCL:.{precision}f}]
"""
        else:
            if 1 not in stages:
                # Previously this path failed with an unbound ``flag``.
                raise ValueError(
                    "stage 1 (F test) must be included in `stages` to run "
                    "the independent-samples t test, because it selects "
                    "between the pooled and unequal-variance variants")

            unequal_var = bool(flag)  # True == unequal variance
            usevar = 'unequal' if unequal_var else 'pooled'
            pooled_df = df_1 + df_2

            ttest_result = stats.ttest_ind(a, b, equal_var=not unequal_var)
            t_summary = list(ttest_result)

            # NOTE(review): df_1 + df_2 is used as the degrees of freedom
            # for the critical values in the unequal-variance case too.
            t_critical_two = stats.t.ppf(1 - alpha / 2, df=pooled_df)
            if opt == 'r':
                t_critical_one = stats.t.ppf(1 - alpha, df=pooled_df)
                result_dict['t_critical_one'] = t_critical_one
            elif opt == 'l':
                t_critical_one = stats.t.ppf(alpha, df=pooled_df)
                result_dict['t_critical_one'] = t_critical_one

            if opt == 't':
                flag = t_summary[1] < alpha
                result_dict['t_critical_two'] = t_critical_two
                result_dict['t_observed_value'] = t_summary[0]
                result_dict['t_p_value'] = t_summary[1]
                result_dict['df'] = pooled_df
                results += f"""              t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (two-tail) = {t_summary[1]:.{precision}f} ({inter_p_value(t_summary[1])})
t (Critical, two-tail) = {t_critical_two:.{precision}f}
DF = {(pooled_df):.{precision}f}
Reject H_0 → {flag}
"""
            else:
                # The one-tailed p-value is taken as half of the two-tailed
                # one, independent of the sign of the observed statistic.
                one_tail_p = t_summary[1] / 2
                flag = one_tail_p < alpha
                result_dict['t_observed_value'] = t_summary[0]
                # Fixed: the pooled branch used to store the two-tailed
                # p-value here while reporting the one-tailed one.
                result_dict['t_p_value'] = one_tail_p
                result_dict['df'] = pooled_df
                results += f"""              t Test
===================================
t (Observed value) = {t_summary[0]:.{precision}f}
p-value (one-tail) = {one_tail_p:.{precision}f} ({inter_p_value(one_tail_p)})
t (Critical, one-tail) = {t_critical_one:.{precision}f}
DF = {(pooled_df):.{precision}f}
Reject H_0 → {flag}
"""

            if 3 in stages:
                cm_result = sms.CompareMeans(
                    sms.DescrStatsW(a), sms.DescrStatsW(b))
                conf_table = cm_result.summary(usevar=usevar, alpha=alpha)
                # Ask statsmodels for the interval directly instead of
                # parsing the rounded text of the summary table.
                lower, upper = cm_result.tconfint_diff(
                    alpha=alpha, usevar=usevar)
                conf_interval = [float(lower), float(upper)]
                con_coef = 1 - alpha

                # record result
                result_dict['conf_interval'] = conf_interval
                results += f"""        Confidence Interval
===================================
{con_coef * 100:.1f}% Confidence Interval: [{conf_interval[0]:.{precision}f}, {conf_interval[1]:.{precision}f}]
"""
                if show_table:
                    results += f"""{conf_table.as_text()}"""

    if show:
        print(results)
    return result_dict
def believeCase():
    """Return the mean of the "Üretim" (production) column.

    The dataset is obtained from ``checkEmptyValues()``. A t-based
    confidence interval for the column mean is also evaluated, but its
    result is not used.

    Returns
    -------
    The value of ``df["Üretim"].mean()``. Previously the mean was computed
    and then dropped, so the function always returned ``None``.
    """
    df = checkEmptyValues()
    sms.DescrStatsW(df["Üretim"]).tconfint_mean()
    return df["Üretim"].mean()
# Confidence bounds for the 'temperature' column, as returned by CI_printout.
lower_bound, upper_bound = CI_printout(df['temperature'], 0.95, 'z')
print('Frequentist approach:')
print('95% confidence interval range: [{:.3f}, {:.3f}]'.format(
    lower_bound, upper_bound))

#%% [markdown]
# ==> So, we consider the normal temperature to be in the range from 98.123 to 98.375 with a confidence level of 95%. Any temperature value beyond this range can be considered abnormal.

#%% [markdown]
# Please note that we can also use the provided library, as follows.

#%%
import statsmodels.stats.api as sms
sms.DescrStatsW(df['temperature']).tconfint_mean()

#%% [markdown]
# ### 6. Is there a significant difference between males and females in normal temperature?

#%% [markdown]
# First, we find the mean normal temperature for both males and females.

#%%
means = df.groupby("gender")["temperature"].mean()
means

#%% [markdown]
# It seems that the mean female temperature is slightly higher than that for males.
# We visualise the distribution of temperatures for both males and females.

#%%
motivations = np.load('data/cleaned/motivation.npy') #Plot x = [1, 2, 3] count = 0 for segment in segments: count += 1 y = [] ci1 = [] ci2 = [] yerr = [] for measurement_index in range(0, 3): segment_score = motivations[motivation_index, segment, measurement_index] mean_segment_score = np.mean(segment_score) y.append(mean_segment_score) ci = sms.DescrStatsW(segment_score).tconfint_mean() ci1.append(mean_segment_score - ci[0]) ci2.append(ci[1] - mean_segment_score) yerr = [ci1, ci2] plt.errorbar(x, y, yerr = yerr) plt.title('Segment ' + str(count) + ': mean motivation with 95% confidence intervals') plt.xlabel('Time') plt.xlim([0, 4]) plt.xticks(x, ['pre', 'half way', 'post'], size=8) plt.ylabel('Motivation score') plt.ylim([1, 7]) plt.grid()
# Summary statistics of active minutes for each group.
treatment_df["active_mins"].describe()
control_df["active_mins"].describe()
# Note that the mean active_mins is higher in the dataframe that has the
# experimental group than in the control group.

# Conduct a t-test without assuming equal variances (equal_var=False).
stats.ttest_ind(treatment_df["active_mins"], control_df["active_mins"], equal_var=False)
# Output: t-statistic=30.686846737487123 and pvalue<.05

# Now we're going to find the 95% confidence interval.
x1 = treatment_df["active_mins"]
x2 = control_df["active_mins"]
# Going to use statsmodels: interval for the difference in means,
# computed with usevar='unequal'.
cm = sms.CompareMeans(sms.DescrStatsW(x1), sms.DescrStatsW(x2))
print(cm.tconfint_diff(usevar='unequal'))

####################################################################################
# PAGE 4
# Read in the dataframes wrangled in R.
# NOTE(review): absolute, machine-specific paths - confirm before running elsewhere.
ctrl_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/ctrl_df_pg4.csv")
exp_df_pg4 = pd.read_csv("/Users/ankushbharadwaj/Desktop/exp_df_pg4.csv")

# STEP 1: REMOVE OUTLIERS
# Going to remove outliers more than 3 standard deviations from the mean.
# Get the standard deviation of active minutes per user per day for each group.
std_exp = np.std(exp_df_pg4["active_mins"])
std_ctrl = np.std(ctrl_df_pg4["active_mins"])
def getPrecisionRecallFalseRate(self, resultSimulation, kMAX, plot=False, output='dict'):
    """Compute mean precision, recall and false rate for every K in 1..kMAX.

    The function receives the detections/false alarms of all the nodes and
    aggregates them per K. For each metric it also computes an error term:
    the distance from the mean to the upper bound of the t confidence
    interval of the per-node values.

    Parameters
    ----------
    resultSimulation : dict
        Per-node results; each value is read through its ``'detections'``
        and ``'falsePositives'`` entries, both indexed by K.
    kMAX : int
        Largest K to evaluate (K runs from 1 to kMAX inclusive).
    plot : bool
        When true, draw the bar chart through ``Visualization``.
    output : str
        ``'dict'`` returns a dict of lists, ``'tuple'`` returns the six
        arrays, any other value returns ``None``.

    Returns
    -------
    dict, tuple or None, depending on ``output``. In the dict form the
    means are passed through ``np.nan_to_num``; the error terms are not.
    """
    ks = range(1, kMAX + 1)
    precisionConfInterval = {k: [] for k in ks}
    recallConfInterval = {k: [] for k in ks}
    falseConfInterval = {k: [] for k in ks}

    # .values() works on both Python 2 and 3 (iteritems() is Python-2-only)
    # and the node key was never used.
    for value in resultSimulation.values():
        detections = value['detections']
        falsePositives = value['falsePositives']
        for k in ks:
            if detections[k]['events'] != 0:
                recallConfInterval[k].append(
                    detections[k]['detection'] / float(detections[k]['events']))
            # NOTE(review): `falsePositives != 0` compares the whole
            # container, not falsePositives[k] - confirm the intent.
            if (detections[k]['detection']) != 0 and falsePositives != 0:
                precisionConfInterval[k].append(
                    (detections[k]['detection'])
                    / (float(detections[k]['detection']) + falsePositives[k]))
            falseConfInterval[k].append(
                falsePositives[k] / float(len(self.truth.clears)))

    def summarize(samples):
        # Mean of the samples and distance from it to the upper t bound.
        mean = np.mean(samples)
        interval = sms.DescrStatsW(samples).tconfint_mean()
        return mean, interval[1] - mean

    # Every slot is assigned in the loop below, so np.empty is sufficient.
    errorRecall = np.empty(kMAX)
    errorPrecision = np.empty(kMAX)
    errorFalse = np.empty(kMAX)
    meanRecall = np.empty(kMAX)
    meanPrecision = np.empty(kMAX)
    meanFalseRate = np.empty(kMAX)

    for k in ks:
        meanRecall[k-1], errorRecall[k-1] = summarize(recallConfInterval[k])
        meanPrecision[k-1], errorPrecision[k-1] = summarize(precisionConfInterval[k])
        meanFalseRate[k-1], errorFalse[k-1] = summarize(falseConfInterval[k])

    if plot:
        visual = Visualization()
        visual.barRecallPrecisionvsK2(meanRecall, meanFalseRate, meanPrecision,
                                      errorRecall, errorPrecision, errorFalse)

    if output == 'dict':
        # NOTE(review): only the first entry (K = 1) is capped at 1.
        if meanFalseRate[0] > 1:
            meanFalseRate[0] = 1
        result = {
            'Precision': np.nan_to_num(meanPrecision).tolist(),
            'errPrecision': errorPrecision.tolist(),
            'Recall': np.nan_to_num(meanRecall).tolist(),
            'errRecall': errorRecall.tolist(),
            'FalseRate': np.nan_to_num(meanFalseRate).tolist(),
            'errFalseRate': errorFalse.tolist()
        }
        return result
    elif output == 'tuple':
        return meanPrecision, errorPrecision, meanRecall, errorRecall, meanFalseRate, errorFalse
    else:
        return
from scipy.stats import chi2_contingency
# Chi-square test on the contingency table `cross_tab`.
chi_test = chi2_contingency(cross_tab)
print(chi_test)

# Boxplot
import seaborn as sns
import matplotlib.pyplot as plt
plt.boxplot(newprice, labels=['New price'], patch_artist=True)
plt.boxplot(nuclee, labels=['Nuclee '], patch_artist=True)
plt.boxplot(rating, labels=['Rating'], patch_artist=True)

# Confidence interval estimation
import statsmodels.stats.api as sms
print('Confidence Interval', sms.DescrStatsW(newprice).tconfint_mean())
print('Confidence Interval', sms.DescrStatsW(nuclee).tconfint_mean())
print('Confidence Interval', sms.DescrStatsW(rating).tconfint_mean())

# Testing of means
# Simple Student test: one-sample t-test of each variable against a
# reference value (3000, 2 and 5 respectively).
from scipy import stats
print(stats.ttest_1samp(newprice, 3000))
print(stats.ttest_1samp(nuclee, 2))
print(stats.ttest_1samp(rating, 5))

# Test of 2 means: compare `newprice` between rows whose 'procesor'
# is 'i7' and rows whose 'procesor' is 'i5'.
newprice_i7 = baza.loc[baza['procesor'] == 'i7']
newprice_i5 = baza.loc[baza['procesor'] == 'i5']
print(stats.ttest_ind(newprice_i7.newprice, newprice_i5.newprice))
def getDelay(self, resultSimulation, kMAX, plot=False, samplingRate = 5):
    """Compute the average detection delay per K and per hop distance.

    The function receives the results previously obtained and computes the
    average detection delay for all the K and with respect to the distance
    from the root node (how far the event is). Delays are grouped by their
    ``'position'`` (0, 1 or 2); other positions are ignored.

    Parameters
    ----------
    resultSimulation : dict
        Per-node results; each value is read through
        ``value['detections'][k]['delays']``, a list of dicts with
        ``'position'`` and ``'delay'`` entries.
    kMAX : int
        Largest K to evaluate (K runs from 1 to kMAX inclusive).
    plot : bool
        When true, draw the bar chart through ``Visualization``.
    samplingRate : int
        Not used by this method; kept for interface compatibility.

    Returns
    -------
    (delaymeansConfInterval, delayConfInterval)
        ``delaymeansConfInterval`` is a ``(kMAX, 3)`` array of mean delays
        (row K-1, column hop). ``delayConfInterval`` maps ``'hop0'``,
        ``'hop1'`` and ``'hop2'`` to arrays of length kMAX holding the
        distance from the mean to the upper t confidence bound.
    """
    depth = 3
    hops = range(depth)
    ks = range(1, kMAX + 1)

    # delaysByHop[hop][k] collects the delays seen at that hop for that K.
    delaysByHop = [{k: [] for k in ks} for _ in hops]

    # .values() works on both Python 2 and 3 (iteritems() is Python-2-only)
    # and the node key was never used.
    for value in resultSimulation.values():
        detections = value['detections']
        for k in ks:
            for delay in detections[k]['delays']:
                for hop in hops:
                    if delay['position'] == hop:
                        delaysByHop[hop][k].append(delay['delay'])

    # Every slot is assigned in the loop below, so np.empty is sufficient.
    delayConfInterval = {'hop0': np.empty(kMAX),
                         'hop1': np.empty(kMAX),
                         'hop2': np.empty(kMAX)}
    delaymeansConfInterval = np.empty((kMAX, depth))

    for k in ks:
        for hop in hops:
            samples = delaysByHop[hop][k]
            mean = np.mean(samples)
            delaymeansConfInterval[k-1][hop] = mean
            interval = sms.DescrStatsW(samples).tconfint_mean()
            delayConfInterval['hop%d' % hop][k-1] = interval[1] - mean

    if plot:
        visual = Visualization()
        visual.plotBarDelay(delaymeansConfInterval, delayConfInterval, trunc='yes')

    return delaymeansConfInterval, delayConfInterval
# Exploratory look at the seaborn "tips" dataset.
df = sns.load_dataset("tips")
df.describe().T
df.head()

df["sex"].value_counts()

# Correlation between tip and total bill.
df[["tip", "total_bill"]].corr()

############################
# Confidence Interval
############################
import statsmodels.stats.api as sms

# Confidence intervals for the means of two "tips" columns.
df = sns.load_dataset("tips")
df.describe().T

sms.DescrStatsW(df["total_bill"]).tconfint_mean()
sms.DescrStatsW(df["tip"]).tconfint_mean()

# Titanic data: missing values are dropped before computing the interval.
df = pd.read_csv("datasets/titanic.csv")
df.describe().T
sms.DescrStatsW(df["Age"].dropna()).tconfint_mean()
sms.DescrStatsW(df["Fare"].dropna()).tconfint_mean()

# Online retail data: work on a copy so the loaded frame stays untouched.
df_ = pd.read_excel("datasets/online_retail_II.xlsx", sheet_name="Year 2010-2011")
df = df_.copy()
sms.DescrStatsW(df["Quantity"].dropna()).tconfint_mean()
sms.DescrStatsW(df["Price"].dropna()).tconfint_mean()
# Order the (feature, p-value, q-value) tuples by their third element,
# i.e. by ascending q-value.
sorted_info = sorted(info, key=lambda tup: tup[2])
visualize_data = None
with open(os.path.join(output_dir, "feature_qvalues_and_qt_means.txt"), 'w+') as file_handle:
    column_names = [
        "feature", "pvalue", "qvalue",
        "signif_mean", "nonsignif_mean",
        "signif_lo_interval", "signif_up_interval",
        "nonsignif_lo_interval", "nonsignif_up_interval"
    ]
    # Tab-separated header row.
    file_handle.write("{0}\n".format("\t".join(column_names)))
    for index, (feat, pval, qval) in enumerate(sorted_info):
        # NOTE(review): `index` is the position in the q-value-sorted list;
        # verify that the score matrices' columns follow the same order.
        sig_scores = qt_sig_scores_matrix[:, index]
        nonsig_scores = qt_nonsig_scores_matrix[:, index]
        sig_group_descr = sms.DescrStatsW(sig_scores)
        nonsig_group_descr = sms.DescrStatsW(nonsig_scores)
        # t confidence bounds and mean for each group.
        sig_lower, sig_upper = sig_group_descr.tconfint_mean()
        nonsig_lower, nonsig_upper = nonsig_group_descr.tconfint_mean()
        sig_mean = sig_group_descr.mean
        nonsig_mean = nonsig_group_descr.mean
        values = [
            feat, pval, qval,
            sig_mean, nonsig_mean,
            sig_lower, sig_upper,
            nonsig_lower, nonsig_upper
        ]
        if visualize_data is None:
            # visualize the feature with the smallest q-value
            # (first iteration only; the p-value entry is left out)
            visualize_data = values[:1] + values[2:]
def CI_ttest(X1, X2):
    """Return the confidence interval of mean(X1) - mean(X2) as text.

    The interval comes from statsmodels' ``CompareMeans.tconfint_diff``
    with ``usevar='unequal'`` and is formatted as ``'[lower, upper]'``
    with two decimals.
    """
    first = sms.DescrStatsW(X1)
    second = sms.DescrStatsW(X2)
    bounds = sms.CompareMeans(first, second).tconfint_diff(usevar='unequal')
    lower = bounds[0]
    upper = bounds[1]
    return '[%.2f, %.2f]' % (lower, upper)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms

# Significance level used to interpret the scale test below.
significance_level = 0.05

# Whitespace-separated price files. The separator is a regular expression,
# so it is written as a raw string ("\s" is an invalid escape otherwise).
GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt', sep=r"\s+", header=None,
                 names=['date', 'open', 'high', 'low', 'close', 'vol'])
SP = pd.read_csv(
    'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt',
    sep=r"\s+")

# Log returns: first difference of the log closing prices.
logreturn_GE = np.diff(np.log(np.array(GE["close"])))
logreturn_sp500 = np.diff(np.log(np.array(SP["close"])))

da2 = pd.concat([pd.DataFrame(logreturn_GE), pd.DataFrame(logreturn_sp500)],
                axis=1)
#da2.columns=['date','open','high','low','close','vol','logreturn_sp500']
#da2.index=da.index[1:]
da2.columns = ["logreturn_GE", "logreturn_sp500"]
da2.boxplot(column=['logreturn_GE', 'logreturn_sp500'])
plt.show()

# Mood's test for equal scale. The conclusion is now derived from the
# p-value instead of being printed unconditionally.
mood_result = stats.mood(logreturn_sp500, logreturn_GE)
print(mood_result)
variances_differ = mood_result[1] < significance_level
if variances_differ:
    print('H0 can be rejected, the variances are significantly different')
else:
    print('H0 cannot be rejected, the variances are not significantly different')

# Use the t-test and interval that match the scale-test outcome; previously
# the pooled versions were used right after claiming unequal variances.
print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=not variances_differ))
print('')
cm = sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),
                      sms.DescrStatsW(logreturn_GE))
print(cm.tconfint_diff(usevar='unequal' if variances_differ else 'pooled'))