def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            if x.is_categorical() and y.is_categorical():
                # Get the count for each category
                x_cat = [k for k, v in x.metadata[categories].items()]
                y_cat = [k for k, v in y.metadata[categories].items()]

                for xc in x_cat:
                    for yc in y_cat:
                        data = dataset.select(y.metadata[name],
                                              where=[f"{x.metadata[name]} == '{xc}'",
                                                     f"{y.metadata[name]} == '{yc}'"])

                        # Check that the count is at least five for each of the (x, y) group pairs
                        if len(data) < 5:
                            return False
                return True
            else:
                return False
        else:
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")
    else:
        x0 = xs[0]
        x1 = xs[1]

        if x0.is_categorical() and x1.is_categorical():
            # Get the count for each category
            x0_cat = [k for k, v in x0.metadata[categories].items()]
            x1_cat = [k for k, v in x1.metadata[categories].items()]

            for x0c in x0_cat:
                for x1c in x1_cat:
                    data = dataset.select(x1.metadata[name],
                                          where=[f"{x0.metadata[name]} == '{x0c}'",
                                                 f"{x1.metadata[name]} == '{x1c}'"])

                    # Check that the count is at least five for each of the (x0, x1) group pairs
                    if len(data) < 5:
                        return False
            return True
        else:
            return False

def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(ys) == 1
    y = ys[0]

    # Collect the explained variable's data for each category of each explanatory variable.
    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    test_statistic, p_val = stats.friedmanchisquare(*data)
    dof = len(data[0])  # TODO: This might not be correct

    test_result = TestResult(name="Friedman Test",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result

def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(ys) == 1
    y = ys[0]

    # Collect the explained variable's data for each category of each explanatory variable.
    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(f"Explanatory variable {x.metadata[name]} has no categories metadata.")
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    t_stat, p_val = stats.kruskal(*data)
    dof = len(data[0])  # TODO: This might not be correct

    test_result = TestResult(name=kruskall_wallis_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=xs[0],  # TODO: Not sure if it's possible to have multiple x's?
                             y=y)

    return test_result

def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    t_stat, p_val = stats.wilcoxon(data[0], data[1])
    dof = len(data[0])  # TODO: This might not be correct

    test_result = TestResult(name=wilcoxon_signed_rank_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result

def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(xs) == 1
    assert len(ys) == 1
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    t_stat, p_val = stats.pointbiserialr(data[0], data[1])
    dof = None

    test_result = TestResult(name=pointbiserial_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result

def chi_square(dataset: Dataset, predictions, combined_data: CombinedData):
    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) == 1:
            x = xs[0]
            y = ys[0]

            # Get the count for each category
            x_cat = [k for k, v in x.metadata[categories].items()]
            y_cat = [k for k, v in y.metadata[categories].items()]

            contingency_table = []
            contingency_table_key = []  # labels for the order in which counts are stored in the contingency table

            for xc in x_cat:
                table_row = []
                table_row_key = []
                for yc in y_cat:
                    data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{xc}'",
                                                 f"{y.metadata[name]} == '{yc}'"])
                    table_row.append(len(data))

                    x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(y.metadata[name]) + ':' + str(yc)
                    table_row_key.append(x_y_key)

                assert len(table_row_key) == len(table_row)
                assert len(table_row) == len(y_cat)

                contingency_table.append(table_row)
                contingency_table_key.append(table_row_key)
        else:
            raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")
    else:
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    test_statistic, p_val, dof, expected = stats.chi2_contingency(contingency_table, correction=False)

    test_result = TestResult(name=chi_square_name,
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result

def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    assert len(combined_data.vars) == 2

    # Compute the contingency table
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(xs) == 1
    assert len(ys) == 1

    x = xs[0]
    y = ys[0]

    # Get the count for each category
    x_cat = [k for k, v in x.metadata[categories].items()]
    y_cat = [k for k, v in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # labels for the order in which counts are stored in the contingency table

    for xc in x_cat:
        table_row = []
        table_row_key = []
        for yc in y_cat:
            data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{xc}'",
                                         f"{y.metadata[name]} == '{yc}'"])
            table_row.append(len(data))

            x_y_key = str(x.metadata[name]) + ':' + str(xc) + ' by ' + str(y.metadata[name]) + ':' + str(yc)
            table_row_key.append(x_y_key)

        assert len(table_row_key) == len(table_row)
        assert len(table_row) == len(y_cat)

        contingency_table.append(table_row)
        contingency_table_key.append(table_row_key)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    odds_ratio, p_val = stats.fisher_exact(contingency_table, alternative='two-sided')
    dof = None

    test_result = TestResult(name=fisher_exact_name,
                             test_statistic=odds_ratio,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             x=x,
                             y=y)

    return test_result

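# For reference: the contingency-table flow that chi_square and fishers_exact wrap can be
# exercised directly against scipy, independent of Tea's Dataset/CombinedData abstractions.
# The sketch below is illustrative only -- the function name, DataFrame, column names, and
# counts are made-up assumptions, not part of Tea's API.
def _sketch_contingency_table_example():
    import pandas as pd
    from scipy import stats

    # Hypothetical 2x2 table: rows = condition, columns = outcome.
    df = pd.DataFrame({
        "condition": ["treat"] * 30 + ["control"] * 30,
        "outcome": ["yes"] * 18 + ["no"] * 12 + ["yes"] * 9 + ["no"] * 21,
    })
    contingency_table = pd.crosstab(df["condition"], df["outcome"])

    # Chi-square test of independence (no Yates correction, as in chi_square above).
    chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency_table, correction=False)

    # Fisher's exact test for the same 2x2 table.
    odds_ratio, p_fisher = stats.fisher_exact(contingency_table, alternative='two-sided')

    return (chi2, p_chi2, dof), (odds_ratio, p_fisher)
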
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(xs) == 1
    assert len(ys) == 1
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    if len(data[0]) == len(data[1]):
        # Scipy requires that groups have equal sizes even though this is not
        # technically a requirement of the pointbiserial correlation
        corr, p_val = stats.pointbiserialr(data[0], data[1])
    else:
        # Compute pointbiserial correlation on our own
        data_all = data[0].append(data[1])

        group_0_mean = np.mean(data[0])
        group_0_size = len(data[0])
        group_1_mean = np.mean(data[1])
        group_1_size = len(data[1])
        sample_size = group_0_size + group_1_size
        assert sample_size == len(data_all)
        sample_std = stats.tstd(data_all)

        corr = (group_0_mean - group_1_mean) / sample_std * math.sqrt(
            (group_0_size * group_1_size) / (sample_size * (sample_size - 1)))
        t_stat, p_val = stats.ttest_ind(data[0], data[1], equal_var=True)

    dof = None

    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result

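# The fallback branch above implements the textbook point-biserial formula
#     r_pb = (M_1 - M_0) / s * sqrt(n_1 * n_0 / (n * (n - 1)))
# with the sample standard deviation s of all observations. Below is a minimal standalone
# check of that formula against scipy.stats.pointbiserialr (which takes a dichotomous
# variable coded 0/1); the function name and toy arrays are illustrative assumptions only.
def _sketch_pointbiserial_formula_check():
    import math
    import numpy as np
    from scipy import stats

    # Hypothetical toy data: binary group labels and a continuous outcome.
    labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    values = np.array([2.1, 3.4, 2.8, 3.0, 2.5, 4.2, 3.9, 4.8, 4.1, 4.5])

    group0 = values[labels == 0]
    group1 = values[labels == 1]
    n0, n1 = len(group0), len(group1)
    n = n0 + n1

    # Same formula as the fallback branch (stats.tstd uses the sample std, ddof=1).
    r_manual = ((group1.mean() - group0.mean()) / stats.tstd(values)
                * math.sqrt((n0 * n1) / (n * (n - 1))))

    # scipy computes the equivalent Pearson correlation between the 0/1 labels and the values.
    r_scipy, p_val = stats.pointbiserialr(labels, values)

    assert math.isclose(r_manual, r_scipy, rel_tol=1e-9)
    return r_manual, p_val
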
def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.wilcoxon(data[0], data[1])

def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(ys) == 1
    y = ys[0]

    data = []
    for x in xs:
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.friedmanchisquare(*data)

def has_equal_variance(dataset: Dataset, var_data: list, alpha):
    xs = []
    ys = []
    cat_xs = []
    cont_ys = []
    grouped_data = []

    if isinstance(var_data, CombinedData):
        xs = var_data.get_explanatory_variables()
        ys = var_data.get_explained_variables()
    else:
        for var in var_data:
            if var.role == iv_identifier or var.role == contributor_identifier:
                xs.append(var)
            if var.role == dv_identifier or var.role == outcome_identifier:
                ys.append(var)

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)

                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                else:
                    eq_var = compute_eq_variance(grouped_data)

    if eq_var[0] is None and eq_var[1] is None:
        # Variance was never computed; treat the data as not having equal variance.
        return False

    return eq_var[1] > alpha

def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(ys) == 1
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(f"Explanatory variable {x.metadata[name]} has no categories metadata.")
        cat = [k for k, v in x.metadata[categories].items()]
        for c in cat:
            cat_data = dataset.select(y.metadata[name],
                                      where=[f"{x.metadata[name]} == '{c}'"])
            data.append(cat_data)

    return stats.kruskal(*data)

def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert len(xs) == 1
    assert len(ys) == 1
    x = xs[0]
    y = ys[0]

    cat = [k for k, v in x.metadata[categories].items()]
    data = []
    for c in cat:
        cat_data = dataset.select(y.metadata[name],
                                  where=[f"{x.metadata[name]} == '{c}'"])
        data.append(cat_data)

    return stats.pointbiserialr(data[0], data[1])

def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()
    cat_xs = []
    cont_ys = []
    grouped_data = []

    for x in xs:
        if x.is_categorical():
            cat_xs.append(x)

    for y in ys:
        if y.is_continuous():
            cont_ys.append(y)

    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                cat = [k for k, v in x.metadata[categories].items()]
                for c in cat:
                    data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{c}'"])
                    grouped_data.append(data)

                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                # elif isinstance(var_data, MultivariateData):
                #     var_data.properties[eq_variance + '::' + x.metadata[name] + ':' + y.metadata[name]] = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(f"var_data object is neither BivariateData nor MultivariateData: {type(var_data)}")

    if eq_var[0] is None and eq_var[1] is None:
        # Variance was never computed; treat the data as not having equal variance.
        return False

    return eq_var[1] > alpha

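# compute_eq_variance is defined elsewhere in Tea; it is assumed here to return a
# (statistic, p-value) pair, which is why has_equal_variance compares eq_var[1] to alpha.
# For intuition only, an equal-variance check over grouped samples can be done directly
# with scipy's Levene test; the sketch below is a generic illustration, not necessarily
# what compute_eq_variance does, and the samples and function name are made up.
def _sketch_equal_variance_check(alpha=0.05):
    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    group_a = rng.normal(loc=10.0, scale=2.0, size=100)  # hypothetical group samples
    group_b = rng.normal(loc=10.0, scale=4.0, size=100)

    # Levene's test: the null hypothesis is that the groups have equal variances.
    w_stat, p_val = stats.levene(group_a, group_b)

    # Mirror has_equal_variance's decision rule: p > alpha means we do not reject equal variance.
    return p_val > alpha
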
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert len(ys) == 1

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(), stat_func=bs_stats.median)
                calculations[c] = stat
                # store all the medians & confidence intervals
                # return all the medians & CIs

    return calculations

def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert len(ys) == 1

        # Main effects
        for x in xs:
            cat = [k for k, v in x.metadata[categories].items()]
            for c in cat:
                cat_data = dataset.select(y.metadata[name],
                                          where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(), stat_func=bs_stats.median)
                calculations[c] = stat

    if predictions:
        if isinstance(predictions[0], list):
            prediction = predictions[0][0]
        else:
            prediction = predictions[0]
    else:
        prediction = None

    x = xs[0]  # We should do this for the prediction only....?
    cat = [k for k, v in x.metadata[categories].items()]

    # Collect the bootstrapped confidence interval for each category.
    test_statistic = {}
    p_val = None
    for c in cat:
        lb = calculations[c].lower_bound
        ub = calculations[c].upper_bound
        test_statistic[c] = (lb, ub)

    alpha = combined_data.alpha
    lb = None
    ub = None
    for group, bounds in test_statistic.items():
        if not lb:
            assert not ub
            lb = bounds[0]
            ub = bounds[1]
        else:
            # Compare each subsequent interval against the first: overlap is read as
            # "no difference" at the given alpha, no overlap as a difference.
            if bounds[0] >= lb and bounds[0] <= ub:
                p_val = f'Greater than or equal to {alpha}'
            elif bounds[1] >= lb and bounds[1] <= ub:
                p_val = f'Greater than or equal to {alpha}'
            else:
                p_val = f'Less than {alpha}'

    dof = None

    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=calculations)

    return test_result

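# The TestResult version of bootstrap above reads non-overlapping bootstrap confidence
# intervals for the group medians as evidence of a difference. Below is a minimal standalone
# sketch of that idea using the same bootstrapped-package calls (bs.bootstrap with
# bs_stats.median); the function name and the two normal samples are illustrative
# assumptions only.
def _sketch_bootstrap_ci_overlap():
    import numpy as np
    import bootstrapped.bootstrap as bs
    import bootstrapped.stats_functions as bs_stats

    rng = np.random.default_rng(0)
    group_a = rng.normal(loc=10.0, scale=2.0, size=200)  # hypothetical samples
    group_b = rng.normal(loc=12.0, scale=2.0, size=200)

    # Bootstrap confidence interval for each group's median.
    ci_a = bs.bootstrap(group_a, stat_func=bs_stats.median)
    ci_b = bs.bootstrap(group_b, stat_func=bs_stats.median)

    # Non-overlapping intervals are read as a difference at the chosen alpha.
    overlap = not (ci_a.upper_bound < ci_b.lower_bound or ci_b.upper_bound < ci_a.lower_bound)
    return (ci_a.lower_bound, ci_a.upper_bound), (ci_b.lower_bound, ci_b.upper_bound), overlap
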
def get_data(dataset: Dataset, var: VarData):
    return dataset.select(var.metadata[name], where=f"{var.metadata[query]}")