示例#1
0
def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Check that every cell of a two-variable cross-tabulation has >= 5 rows.

    With exactly one explanatory variable, that variable is crossed with the
    single explained variable. Otherwise the first two explanatory variables
    are crossed with each other.

    Args:
        dataset: Data source; ``dataset.select(column, where=[...])`` is called
            once per cell and the length of its result is used as the count.
        var_data: Supplies the explanatory and explained variables.
        alpha: Not used by this check.

    Returns:
        True if both crossed variables are categorical and every pair of
        categories selects at least five rows; False otherwise.

    Raises:
        ValueError: If there is one explanatory variable but the number of
            explained variables is not one.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) != 1:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
        first, second = xs[0], ys[0]
    else:
        # Two (or more) explanatory variables: cross the first two.
        # An empty ``xs`` raises IndexError here.
        first, second = xs[0], xs[1]

    if not (first.is_categorical() and second.is_categorical()):
        return False

    first_name = first.metadata[name]
    second_name = second.metadata[name]
    first_cats = list(first.metadata[categories].keys())
    second_cats = list(second.metadata[categories].keys())

    for first_cat in first_cats:
        for second_cat in second_cats:
            cell = dataset.select(
                second_name,
                where=[
                    f"{first_name} == '{first_cat}'",
                    f"{second_name} == '{second_cat}'"
                ])

            # Every (first, second) category pair needs at least five rows.
            if len(cell) < 5:
                return False

    return True
示例#2
0
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.friedmanchisquare`` over the outcome grouped by category.

    One group is built per category of every explanatory variable by selecting
    the explained variable's column where the explanatory variable equals that
    category.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` carrying the statistic, p-value, prediction, degrees
        of freedom and alpha.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        for c in list(x.metadata[categories].keys()):
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"]))

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    test_statistic, p_val = stats.friedmanchisquare(*data)
    # The chi-square approximation behind the p-value has k - 1 degrees of
    # freedom, where k is the number of compared groups.
    dof = len(data) - 1
    test_result = TestResult(name="Friedman Test",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.kruskal`` over the outcome grouped by category.

    One group is built per category of every explanatory variable by selecting
    the explained variable's column where the explanatory variable equals that
    category.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named ``kruskall_wallis_name``; ``x`` is the first
        explanatory variable and ``y`` the explained variable.

    Raises:
        ValueError: If an explanatory variable's categories entry is None.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Explanatory variable {x.metadata[name]} has no categories; "
                "cannot split the data into groups for the Kruskal-Wallis test"
            )
        for c in list(x.metadata[categories].keys()):
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{c}'"]))

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    t_stat, p_val = stats.kruskal(*data)
    # The chi-square approximation behind the p-value has k - 1 degrees of
    # freedom, where k is the number of compared groups.
    dof = len(data) - 1
    test_result = TestResult(
        name=kruskall_wallis_name,
        test_statistic=t_stat,
        p_value=p_val,
        prediction=prediction,
        dof=dof,
        alpha=combined_data.alpha,
        x=xs[0],  # TODO: Not sure if it's possible to have multiple x's?
        y=y)

    return test_result
示例#4
0
def wilcoxon_signed_rank(dataset: Dataset, predictions,
                         combined_data: CombinedData):
    """Run ``stats.wilcoxon`` on the first two category groups of the outcome.

    The explained variable's column is selected once per category of the first
    explanatory variable; the first two resulting groups are compared.

    Returns a ``TestResult`` named ``wilcoxon_signed_rank_name``.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    x, y = explanatory[0], explained[0]

    levels = [key for key, _ in x.metadata[categories].items()]
    groups = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{level}'"])
        for level in levels
    ]

    # Use the first prediction, unwrapping one level of list nesting.
    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    t_stat, p_val = stats.wilcoxon(groups[0], groups[1])
    group_size = len(groups[0])  # TODO This might not be correct as a dof

    return TestResult(name=wilcoxon_signed_rank_name,
                      test_statistic=t_stat,
                      p_value=p_val,
                      prediction=prediction,
                      dof=group_size,
                      alpha=combined_data.alpha,
                      x=x,
                      y=y)
示例#5
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Compute the point-biserial correlation between a two-level x and y.

    The explained variable's column is selected once per category of the
    explanatory variable. The first two groups are pooled and correlated
    against a 0/1 group indicator.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named ``pointbiserial_name``. The correlation is
        positive when the first category's group has the larger mean.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{c}'"])
        for c in list(x.metadata[categories].keys())
    ]

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    # scipy's pointbiserialr expects a binary membership vector and the pooled
    # observations (one entry per observation), not the two groups themselves.
    group_0, group_1 = data[0], data[1]
    indicator = [1] * len(group_0) + [0] * len(group_1)
    values = list(group_0) + list(group_1)
    t_stat, p_val = stats.pointbiserialr(indicator, values)

    dof = None
    test_result = TestResult(name=pointbiserial_name,
                             test_statistic=t_stat,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
def chi_square(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a chi-square test of independence on two categorical variables.

    A contingency table is built with one row per category of the explanatory
    variable and one column per category of the explained variable; each cell
    is the number of rows ``dataset.select`` returns for that category pair.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named ``chi_square_name`` with the statistic, p-value
        and degrees of freedom reported by ``stats.chi2_contingency``.

    Raises:
        ValueError: If there is not exactly one explanatory variable, or not
            exactly one explained variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    if len(xs) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")
    if len(ys) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")

    x = xs[0]
    y = ys[0]

    x_cat = list(x.metadata[categories].keys())
    y_cat = list(y.metadata[categories].keys())

    # contingency_table[i][j] = number of rows with x == x_cat[i] and y == y_cat[j]
    contingency_table = [
        [
            len(dataset.select(
                y.metadata[name],
                where=[f"{x.metadata[name]} == '{xc}'", f"{y.metadata[name]} == '{yc}'"]))
            for yc in y_cat
        ]
        for xc in x_cat
    ]

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    # Keep the degrees of freedom scipy computes instead of discarding them.
    test_statistic, p_val, dof, _ = stats.chi2_contingency(contingency_table, correction=False)
    test_result = TestResult(
        name=chi_square_name,
        test_statistic=test_statistic,
        p_value=p_val,
        prediction=prediction,
        dof=dof,
        alpha=combined_data.alpha,
        x=x,
        y=y)

    return test_result
def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.fisher_exact`` (two-sided) on a category-count table.

    The table has one row per category of the explanatory variable and one
    column per category of the explained variable; each cell is the number of
    rows ``dataset.select`` returns for that category pair.

    Returns a ``TestResult`` named ``fisher_exact_name`` whose test statistic
    is the odds ratio.
    """
    assert len(combined_data.vars) == 2

    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explanatory) == 1
    assert len(explained) == 1
    x, y = explanatory[0], explained[0]

    x_levels = [key for key, _ in x.metadata[categories].items()]
    y_levels = [key for key, _ in y.metadata[categories].items()]

    contingency_table = []
    contingency_table_key = []  # cell labels, in the same order as contingency_table
    for x_level in x_levels:
        counts = [
            len(dataset.select(
                y.metadata[name],
                where=[f"{x.metadata[name]} == '{x_level}'", f"{y.metadata[name]} == '{y_level}'"]))
            for y_level in y_levels
        ]
        labels = [
            str(x.metadata[name]) + ':' + str(x_level) + ' by ' + str(y.metadata[name]) + ':' + str(y_level)
            for y_level in y_levels
        ]

        assert len(labels) == len(counts)
        assert len(counts) == len(y_levels)
        contingency_table.append(counts)
        contingency_table_key.append(labels)

    # Use the first prediction, unwrapping one level of list nesting.
    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    odds_ratio, p_val = stats.fisher_exact(contingency_table, alternative='two-sided')

    return TestResult(
        name=fisher_exact_name,
        test_statistic=odds_ratio,
        p_value=p_val,
        prediction=prediction,
        dof=None,
        alpha=combined_data.alpha,
        x=x,
        y=y)
示例#8
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Compute the point-biserial correlation between a two-level x and y.

    The explained variable's column is selected once per category of the
    explanatory variable. The first two groups are pooled and correlated
    against a 0/1 group indicator, so the groups may differ in size.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named ``POINTBISERIAL_NAME``. The correlation is
        positive when the first category's group has the larger mean.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{c}'"])
        for c in list(x.metadata[categories].keys())
    ]

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    # scipy's pointbiserialr expects a binary membership vector and the pooled
    # observations (one entry per observation), not the two groups themselves.
    # Group 0 is coded 1 so the sign follows (mean of group 0 - mean of group 1).
    group_0, group_1 = data[0], data[1]
    indicator = np.concatenate(
        [np.ones(len(group_0)), np.zeros(len(group_1))])
    values = np.concatenate([group_0, group_1])
    corr, p_val = stats.pointbiserialr(indicator, values)

    dof = None
    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result
示例#9
0
def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.wilcoxon`` applied to the first two category groups.

    The explained variable's column is selected once per category of the first
    explanatory variable. ``predictions`` is not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    x, y = explanatory[0], explained[0]

    levels = [key for key, _ in x.metadata[categories].items()]
    groups = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{level}'"])
        for level in levels
    ]

    first_group, second_group = groups[0], groups[1]
    return stats.wilcoxon(first_group, second_group)
示例#10
0
def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.friedmanchisquare`` over the outcome grouped by category.

    One group is selected per category of every explanatory variable.
    ``predictions`` is not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert len(explained) == 1
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        levels = [key for key, _ in factor.metadata[categories].items()]
        samples.extend(
            dataset.select(outcome.metadata[name], where=[f"{factor.metadata[name]} == '{level}'"])
            for level in levels
        )

    return stats.friedmanchisquare(*samples)
示例#11
0
def has_equal_variance(dataset: Dataset, var_data: list, alpha):
    """Report whether the grouped outcome data passes the equal-variance check.

    Args:
        dataset: Data source queried through ``dataset.select``.
        var_data: Either a ``CombinedData`` (its explanatory/explained
            variables are used) or an iterable of variables that are sorted
            into explanatory/explained by their ``role``.
        alpha: Threshold the second element of ``compute_eq_variance``'s
            result is compared against.

    Returns:
        ``eq_var[1] > alpha`` where ``eq_var`` is the last result of
        ``compute_eq_variance``; False when no variance was computed (no
        categorical explanatory or no continuous explained variable).
    """
    xs = []
    ys = []

    if isinstance(var_data, CombinedData):
        xs = var_data.get_explanatory_variables()
        ys = var_data.get_explained_variables()
    else:
        for var in var_data:
            if var.role == iv_identifier or var.role == contributor_identifier:
                xs.append(var)
            if var.role == dv_identifier or var.role == outcome_identifier:
                ys.append(var)

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    grouped_data = []
    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                for c in list(x.metadata[categories].keys()):
                    grouped_data.append(
                        dataset.select(
                            y.metadata[name],
                            where=[f"{x.metadata[name]} == '{c}'"]))
                # NOTE(review): grouped_data keeps accumulating across (x, y)
                # pairs, so later calls see earlier pairs' groups too -- confirm
                # this is intended.
                eq_var = compute_eq_variance(grouped_data)

    if eq_var[0] is None and eq_var[1] is None:
        # Nothing was computed; report "not equal" instead of dropping into a
        # debugger, which would block any non-interactive caller.
        return False

    return eq_var[1] > alpha
示例#12
0
def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.kruskal`` over the outcome grouped by category.

    One group is selected per category of every explanatory variable.
    ``predictions`` is not used.

    Raises:
        ValueError: If an explanatory variable's categories entry is None.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Explanatory variable {x.metadata[name]} has no categories; "
                "cannot split the data into groups for the Kruskal-Wallis test"
            )
        for c in list(x.metadata[categories].keys()):
            data.append(
                dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"]))

    return stats.kruskal(*data)
示例#13
0
def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return scipy's point-biserial correlation between a two-level x and y.

    The explained variable's column is selected once per category of the
    explanatory variable. The first two groups are pooled and correlated
    against a 0/1 group indicator. ``predictions`` is not used.

    Returns:
        The result of ``stats.pointbiserialr``; the correlation is positive
        when the first category's group has the larger mean.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
        for c in list(x.metadata[categories].keys())
    ]

    # scipy's pointbiserialr expects a binary membership vector and the pooled
    # observations (one entry per observation), not the two groups themselves.
    group_0, group_1 = data[0], data[1]
    indicator = [1] * len(group_0) + [0] * len(group_1)
    values = list(group_0) + list(group_1)
    return stats.pointbiserialr(indicator, values)
示例#14
0
def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Report whether the grouped outcome data passes the equal-variance check.

    Args:
        dataset: Data source queried through ``dataset.select``.
        var_data: Supplies the explanatory and explained variables; must be a
            ``BivariateData`` for the variance to be computed.
        alpha: Threshold the second element of ``compute_eq_variance``'s
            result is compared against.

    Returns:
        ``eq_var[1] > alpha`` where ``eq_var`` is the last result of
        ``compute_eq_variance``; False when no variance was computed (no
        categorical explanatory or no continuous explained variable).

    Raises:
        ValueError: If groups were built but ``var_data`` is not a
            ``BivariateData``.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    grouped_data = []
    eq_var = (None, None)
    if cat_xs and cont_ys:
        for y in ys:
            for x in xs:
                for c in list(x.metadata[categories].keys()):
                    grouped_data.append(
                        dataset.select(
                            y.metadata[name],
                            where=[f"{x.metadata[name]} == '{c}'"]))
                if isinstance(var_data, BivariateData):
                    # Equal variance
                    eq_var = compute_eq_variance(grouped_data)
                else:
                    raise ValueError(
                        f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
                    )

    if eq_var[0] is None and eq_var[1] is None:
        # Nothing was computed; report "not equal" instead of dropping into a
        # debugger, which would block any non-interactive caller.
        return False

    return eq_var[1] > alpha
示例#15
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap the median of the outcome for every explanatory category.

    Returns a dict mapping each category key to the value returned by
    ``bs.bootstrap(..., stat_func=bs_stats.median)`` for that category's rows.
    ``predictions`` is not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    results = {}
    for outcome in explained:
        # Only a single explained variable is supported for now.
        assert len(explained) == 1

        # Main effects: one bootstrap per category of each explanatory variable.
        for factor in explanatory:
            levels = [key for key, _ in factor.metadata[categories].items()]
            results.update({
                level: bs.bootstrap(
                    dataset.select(
                        outcome.metadata[name],
                        where=[f"{factor.metadata[name]} == '{level}'"],
                    ).to_numpy(),
                    stat_func=bs_stats.median,
                )
                for level in levels
            })

    return results
示例#16
0
def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap per-category medians and compare their intervals for overlap.

    For every category of every explanatory variable, the explained variable's
    rows are bootstrapped with ``bs.bootstrap(..., stat_func=bs_stats.median)``.
    The ``(lower_bound, upper_bound)`` interval of each category of the first
    explanatory variable is then compared against the first category's
    interval.

    Args:
        dataset: Data source queried through ``dataset.select``.
        predictions: Optional sequence; its first element (or the first element
            of that element, when it is a list) is attached to the result.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` named "Bootstrap". ``test_statistic`` maps category to
        ``(lower_bound, upper_bound)``; ``p_value`` is a descriptive string
        (or None when there are fewer than two categories), not a number;
        ``table`` holds the raw bootstrap results.
    """
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            for c in list(x.metadata[categories].keys()):
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                calculations[c] = bs.bootstrap(cat_data.to_numpy(),
                                               stat_func=bs_stats.median)

    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first
    else:
        prediction = None

    x = xs[0]  # We should do this for the prediction, only....?
    test_statistic = {
        c: (calculations[c].lower_bound, calculations[c].upper_bound)
        for c in list(x.metadata[categories].keys())
    }

    alpha = combined_data.alpha
    p_val = None
    reference = None
    for bounds in test_statistic.values():
        if reference is None:
            # Explicit None sentinel: a lower bound of 0 is a valid value and
            # must not be mistaken for "no reference interval yet".
            reference = bounds
            continue

        ref_lb, ref_ub = reference
        # Two closed intervals overlap iff each one starts before the other
        # ends; this also covers one interval fully containing the other.
        if bounds[0] <= ref_ub and bounds[1] >= ref_lb:
            p_val = f'Greater than or equal to {alpha}'
        else:
            p_val = f'Less than {alpha}'
        # NOTE(review): with more than two categories only the last comparison
        # against the first interval is kept -- confirm this is intended.

    dof = None
    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha,
                             table=calculations)

    return test_result
示例#17
0
def get_data(dataset: Dataset, var: VarData):
    """Select the variable's column from ``dataset``, filtered by its query."""
    column = var.metadata[name]
    row_filter = f"{var.metadata[query]}"
    return dataset.select(column, where=row_filter)