Python Dataset примеры, tea.runtimeDataStructures.dataset.Dataset Python примеры использования

Пример #1

0

Показать файл

Файл: solver.py Проект: pkuleon/tea-lang

def greater_than_5_frequency(dataset: Dataset, var_data: CombinedData, alpha):
    """Check that every pair of category levels has at least five observations.

    With exactly one explanatory variable, that variable is crossed with the
    single explained variable. Otherwise the first two explanatory variables
    are crossed with each other.

    Args:
        dataset: Queried through ``dataset.select`` for each pair of levels.
        var_data: Supplies the explanatory and explained variables.
        alpha: Not used by this check.

    Returns:
        True when both crossed variables are categorical and every pair of
        levels selects at least five rows; False otherwise.

    Raises:
        ValueError: If there is one explanatory variable but the number of
            explained variables is not exactly one.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    if len(xs) == 1:
        if len(ys) != 1:
            raise ValueError(
                f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}"
            )
        first, second = xs[0], ys[0]
    else:
        # The original code referred to undefined names (`x`, `xc`) on this
        # path; both variables now go through the same, named code below.
        first, second = xs[0], xs[1]

    if not (first.is_categorical() and second.is_categorical()):
        return False

    first_levels = [level for level, _ in first.metadata[categories].items()]
    second_levels = [level for level, _ in second.metadata[categories].items()]

    for first_level in first_levels:
        for second_level in second_levels:
            cell = dataset.select(
                second.metadata[name],
                where=[
                    f"{first.metadata[name]} == '{first_level}'",
                    f"{second.metadata[name]} == '{second_level}'",
                ])

            # Every (first, second) group pair needs at least five rows.
            if len(cell) < 5:
                return False

    return True

Пример #2

0

Показать файл

Файл: evaluateHelperMethods.py Проект: andrewhead/tea-lang

def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a Kruskal-Wallis H-test on the explained variable, grouped by category.

    One group is selected for every category of every explanatory variable,
    and all groups are passed to ``stats.kruskal`` together.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` holding the statistic, p-value and degrees of freedom.

    Raises:
        ValueError: If an explanatory variable has no categories.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            raise ValueError(
                f"Kruskal-Wallis requires categorical explanatory variables, "
                f"but {x.metadata[name]} has no categories")
        for level in [k for k, _ in x.metadata[categories].items()]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{level}'"]))

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    t_stat, p_val = stats.kruskal(*data)
    # The H statistic is referred to a chi-square distribution with k - 1
    # degrees of freedom, where k is the number of groups compared.
    dof = len(data) - 1
    test_result = TestResult(
        name=kruskall_wallis_name,
        test_statistic=t_stat,
        p_value=p_val,
        prediction=prediction,
        dof=dof,
        alpha=combined_data.alpha,
        x=xs[0],  # TODO: Not sure if it's possible to have multiple x's?
        y=y)

    return test_result

Пример #3

0

Показать файл

def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a Friedman chi-square test on the explained variable, grouped by category.

    One group is selected for every category of every explanatory variable,
    and all groups are passed to ``stats.friedmanchisquare`` together.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` holding the statistic, p-value and degrees of freedom.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        for level in [k for k, _ in x.metadata[categories].items()]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{level}'"]))

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    test_statistic, p_val = stats.friedmanchisquare(*data)
    # The Friedman statistic is referred to a chi-square distribution with
    # k - 1 degrees of freedom, where k is the number of groups compared.
    dof = len(data) - 1
    # The result used to be mislabelled "Kruskall Wallis Test" (copy-paste).
    test_result = TestResult(name="Friedman Test",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=dof,
                             alpha=combined_data.alpha)

    return test_result

Пример #4

0

Показать файл

def wilcoxon_signed_rank(dataset: Dataset, predictions,
                         combined_data: CombinedData):
    """Run a Wilcoxon signed-rank test on the first two category groups.

    The explained variable is selected once per category of the first
    explanatory variable; the first two selections are handed to
    ``stats.wilcoxon``.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` describing the outcome of the test.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    factor, outcome = explanatory[0], explained[0]

    groups = [
        dataset.select(outcome.metadata[name],
                       where=[f"{factor.metadata[name]} == '{level}'"])
        for level in list(factor.metadata[categories].keys())
    ]

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    statistic, p_value = stats.wilcoxon(groups[0], groups[1])
    return TestResult(
        name=wilcoxon_signed_rank_name,
        test_statistic=statistic,
        p_value=p_value,
        prediction=prediction,
        dof=len(groups[0]),  # TODO This might not be correct
        alpha=combined_data.alpha,
        x=factor,
        y=outcome)

Пример #5

0

Показать файл

def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run ``stats.pointbiserialr`` on the first two category groups.

    The explained variable is selected once per category of the single
    explanatory variable; the first two selections are passed to scipy.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with the statistic and p-value; ``dof`` is None.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    assert (len(explanatory) == 1)
    assert (len(explained) == 1)
    factor, outcome = explanatory[0], explained[0]

    groups = [
        dataset.select(outcome.metadata[name],
                       where=[f"{factor.metadata[name]} == '{level}'"])
        for level in list(factor.metadata[categories].keys())
    ]

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    statistic, p_value = stats.pointbiserialr(groups[0], groups[1])
    return TestResult(name=pointbiserial_name,
                      test_statistic=statistic,
                      p_value=p_value,
                      prediction=prediction,
                      dof=None,
                      alpha=combined_data.alpha)

Пример #6

0

Показать файл

Файл: evaluateHelperMethods.py Проект: andrewhead/tea-lang

def chi_square(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run a chi-square test of independence on two categorical variables.

    A contingency table is built by counting the rows selected for every
    (explanatory level, explained level) pair and is passed to
    ``stats.chi2_contingency`` with ``correction=False``.

    Args:
        dataset: Queried through ``dataset.select`` for each pair of levels.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with the statistic, p-value and the degrees of
        freedom reported by scipy.

    Raises:
        ValueError: If there is not exactly one explanatory variable and
            exactly one explained variable.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    if len(xs) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explanatory variable, instead received: {len(xs)} -- {xs}")
    if len(ys) != 1:
        raise ValueError(f"Currently, chi square requires/only supports 1 explained variable, instead received: {len(ys)} -- {ys}")

    x = xs[0]
    y = ys[0]

    x_cat = [k for k, _ in x.metadata[categories].items()]
    y_cat = [k for k, _ in y.metadata[categories].items()]

    # Rows follow x_cat order, columns follow y_cat order.
    contingency_table = []
    for xc in x_cat:
        table_row = []
        for yc in y_cat:
            cell = dataset.select(
                y.metadata[name],
                where=[
                    f"{x.metadata[name]} == '{xc}'",
                    f"{y.metadata[name]} == '{yc}'",
                ])
            table_row.append(len(cell))
        contingency_table.append(table_row)

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    # Keep the dof scipy computes; it used to be overwritten with None.
    test_statistic, p_val, dof, _expected = stats.chi2_contingency(
        contingency_table, correction=False)
    test_result = TestResult(
        name=chi_square_name,
        test_statistic=test_statistic,
        p_value=p_val,
        prediction=prediction,
        dof=dof,
        alpha=combined_data.alpha,
        x=x,
        y=y)

    return test_result

Пример #7

0

Показать файл

Файл: evaluateHelperMethods.py Проект: andrewhead/tea-lang

def fishers_exact(dataset: Dataset, predictions, combined_data: CombinedData):
    """Run Fisher's exact test (two-sided) on two categorical variables.

    A contingency table is built by counting the rows selected for every
    (explanatory level, explained level) pair and is passed to
    ``stats.fisher_exact``.

    Args:
        dataset: Queried through ``dataset.select`` for each pair of levels.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``; must hold exactly
            two variables, one explanatory and one explained.

    Returns:
        A ``TestResult`` whose statistic is the odds ratio; ``dof`` is None.
    """
    assert (len(combined_data.vars) == 2)

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(xs) == 1)
    assert (len(ys) == 1)

    x = xs[0]
    y = ys[0]

    x_cat = [k for k, _ in x.metadata[categories].items()]
    y_cat = [k for k, _ in y.metadata[categories].items()]

    # Rows follow x_cat order, columns follow y_cat order. The unused
    # per-cell label lists and their always-true asserts were dropped.
    contingency_table = []
    for xc in x_cat:
        table_row = []
        for yc in y_cat:
            cell = dataset.select(
                y.metadata[name],
                where=[
                    f"{x.metadata[name]} == '{xc}'",
                    f"{y.metadata[name]} == '{yc}'",
                ])
            table_row.append(len(cell))
        contingency_table.append(table_row)

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    odds_ratio, p_val = stats.fisher_exact(contingency_table,
                                           alternative='two-sided')
    test_result = TestResult(
        name=fisher_exact_name,
        test_statistic=odds_ratio,
        p_value=p_val,
        prediction=prediction,
        dof=None,
        alpha=combined_data.alpha,
        x=x,
        y=y)

    return test_result

Пример #8

0

Показать файл

Файл: vardata_factory.py Проект: pkuleon/tea-lang

 def __create_variable_vardata(self, dataset: Dataset,
                               expr: Variable) -> VarData:
     """Wrap the dataset's metadata for a variable expression in a VarData.

     The metadata returned by ``dataset.get_variable_data`` is tagged in
     place with the variable's name and an empty query before wrapping.
     """
     variable_name = expr.name
     variable_metadata = dataset.get_variable_data(variable_name)
     # No filter has been applied to a bare variable yet, hence the empty query.
     variable_metadata['var_name'] = variable_name
     variable_metadata['query'] = ''
     wrapped = VarData(variable_metadata)
     return wrapped

Пример #9

0

Показать файл

def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Compute a point-biserial correlation from the first two category groups.

    The explained variable is selected once per category of the single
    explanatory variable. Equal-sized groups go to ``stats.pointbiserialr``;
    otherwise the coefficient is computed by hand and the p-value comes from
    an equal-variance independent t-test.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` with the coefficient and p-value; ``dof`` is None.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    assert (len(xs) == 1)
    assert (len(ys) == 1)
    x = xs[0]
    y = ys[0]

    data = [
        dataset.select(y.metadata[name],
                       where=[f"{x.metadata[name]} == '{level}'"])
        for level in [k for k, _ in x.metadata[categories].items()]
    ]

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    if len(data[0]) == len(data[1]):
        # scipy requires equally sized inputs.
        # NOTE(review): the two outcome groups are passed as scipy's (x, y)
        # rather than a binary indicator plus values -- confirm this is the
        # intended statistic.
        corr, p_val = stats.pointbiserialr(data[0], data[1])
    else:
        # Compute the point-biserial correlation by hand. np.concatenate
        # replaces Series.append, which pandas 2.0 removed (and which
        # returns None for plain lists).
        data_all = np.concatenate([data[0], data[1]])

        group_0_mean = np.mean(data[0])
        group_0_size = len(data[0])
        group_1_mean = np.mean(data[1])
        group_1_size = len(data[1])

        sample_size = group_0_size + group_1_size
        assert (sample_size == len(data_all))
        sample_std = stats.tstd(data_all)

        corr = (group_0_mean - group_1_mean) / sample_std * math.sqrt(
            (group_0_size * group_1_size) / (sample_size * (sample_size - 1)))
        _, p_val = stats.ttest_ind(data[0], data[1], equal_var=True)

    test_result = TestResult(name=POINTBISERIAL_NAME,
                             test_statistic=corr,
                             p_value=p_val,
                             prediction=prediction,
                             dof=None,
                             alpha=combined_data.alpha)

    return test_result

Пример #10

0

Показать файл

def wilcoxon_signed_rank(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.wilcoxon`` applied to the first two category groups.

    The explained variable is selected once per category of the first
    explanatory variable. ``predictions`` is accepted but not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    factor, outcome = explanatory[0], explained[0]

    groups = [
        dataset.select(outcome.metadata[name],
                       where=[f"{factor.metadata[name]} == '{level}'"])
        for level in list(factor.metadata[categories].keys())
    ]

    first_group, second_group = groups[0], groups[1]
    return stats.wilcoxon(first_group, second_group)

Пример #11

0

Показать файл

def friedman(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.friedmanchisquare`` applied to every category group.

    The single explained variable is selected once per category of each
    explanatory variable. ``predictions`` is accepted but not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()
    assert (len(explained) == 1)
    outcome = explained[0]

    samples = []
    for factor in explanatory:
        levels = list(factor.metadata[categories].keys())
        samples.extend(
            dataset.select(outcome.metadata[name],
                           where=[f"{factor.metadata[name]} == '{level}'"])
            for level in levels)

    return stats.friedmanchisquare(*samples)

Пример #12

0

Показать файл

Файл: solver.py Проект: pkuleon/tea-lang

def has_equal_variance(dataset: Dataset, var_data: list, alpha):
    """Decide whether the category groups pass the equal-variance check.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        var_data: Either a ``CombinedData`` or an iterable of variables whose
            ``role`` marks them as explanatory or explained.
        alpha: Threshold compared against the second element of the result of
            ``compute_eq_variance`` (presumably a p-value -- confirm).

    Returns:
        False when no variance result was computed (no categorical
        explanatory or no continuous explained variable); otherwise whether
        the second element of the variance result exceeds ``alpha``.
    """
    if isinstance(var_data, CombinedData):
        xs = var_data.get_explanatory_variables()
        ys = var_data.get_explained_variables()
    else:
        xs = []
        ys = []
        for var in var_data:
            if var.role in (iv_identifier, contributor_identifier):
                xs.append(var)
            if var.role in (dv_identifier, outcome_identifier):
                ys.append(var)

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    eq_var = (None, None)
    if cat_xs and cont_ys:
        # NOTE(review): groups accumulate across every (y, x) pair and the
        # loops cover all xs/ys, not only cat_xs/cont_ys -- confirm intended.
        grouped_data = []
        for y in ys:
            for x in xs:
                for level in [k for k, _ in x.metadata[categories].items()]:
                    grouped_data.append(
                        dataset.select(
                            y.metadata[name],
                            where=[f"{x.metadata[name]} == '{level}'"]))
                eq_var = compute_eq_variance(grouped_data)

    if eq_var[0] is None and eq_var[1] is None:
        # Nothing was computed. This path used to drop into pdb, which hangs
        # any non-interactive run; report "not equal" instead.
        return False

    return eq_var[1] > alpha

Пример #13

0

Показать файл

def kruskall_wallis(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.kruskal`` applied to every category group.

    The single explained variable is selected once per category of each
    explanatory variable. ``predictions`` is accepted but not used.

    Raises:
        ValueError: If an explanatory variable has no categories.
    """
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    data = []
    for x in xs:
        if x.metadata[categories] is None:
            # The message used to be empty, which made failures undiagnosable.
            raise ValueError(
                f"Kruskal-Wallis requires categorical explanatory variables, "
                f"but {x.metadata[name]} has no categories")
        for level in [k for k, _ in x.metadata[categories].items()]:
            data.append(
                dataset.select(y.metadata[name],
                               where=[f"{x.metadata[name]} == '{level}'"]))

    return stats.kruskal(*data)

Пример #14

0

Показать файл

def pointbiserial(dataset: Dataset, predictions, combined_data: CombinedData):
    """Return ``stats.pointbiserialr`` applied to the first two category groups.

    The single explained variable is selected once per category of the
    single explanatory variable. ``predictions`` is accepted but not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    assert (len(explanatory) == 1)
    assert (len(explained) == 1)
    factor, outcome = explanatory[0], explained[0]

    groups = [
        dataset.select(outcome.metadata[name],
                       where=[f"{factor.metadata[name]} == '{level}'"])
        for level in list(factor.metadata[categories].keys())
    ]

    first_group, second_group = groups[0], groups[1]
    return stats.pointbiserialr(first_group, second_group)

Пример #15

0

Показать файл

Файл: solver.py Проект: eiselmayer/tea-lang

def has_equal_variance(dataset: Dataset, var_data: CombinedData, alpha):
    """Decide whether the category groups pass the equal-variance check.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        var_data: Supplies the explanatory and explained variables.
        alpha: Threshold compared against the second element of the result of
            ``compute_eq_variance`` (presumably a p-value -- confirm).

    Returns:
        False when no variance result was computed (no categorical
        explanatory or no continuous explained variable); otherwise whether
        the second element of the variance result exceeds ``alpha``.

    Raises:
        ValueError: If groups were collected but ``var_data`` is not a
            ``BivariateData``.
    """
    xs = var_data.get_explanatory_variables()
    ys = var_data.get_explained_variables()

    cat_xs = [x for x in xs if x.is_categorical()]
    cont_ys = [y for y in ys if y.is_continuous()]

    eq_var = (None, None)
    if cat_xs and cont_ys:
        grouped_data = []
        for y in ys:
            for x in xs:
                for level in [k for k, _ in x.metadata[categories].items()]:
                    grouped_data.append(
                        dataset.select(
                            y.metadata[name],
                            where=[f"{x.metadata[name]} == '{level}'"]))
                if not isinstance(var_data, BivariateData):
                    raise ValueError(
                        f"var_data_data object is neither BivariateData nor MultivariateData: {type(var_data)}"
                    )
                eq_var = compute_eq_variance(grouped_data)

    if eq_var[0] is None and eq_var[1] is None:
        # Nothing was computed. This path used to drop into pdb, which hangs
        # any non-interactive run; report "not equal" instead.
        return False

    return eq_var[1] > alpha

Пример #16

0

Показать файл

def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap the median of the explained variable for every category.

    Returns a dict keyed by category level, holding the value returned by
    ``bs.bootstrap`` for that level. ``predictions`` is accepted but not used.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    per_level = {}
    for outcome in explained:
        # Only a single explained variable is supported for now.
        assert(len(explained) == 1)

        # Main effects
        for factor in explanatory:
            for level in list(factor.metadata[categories].keys()):
                selected = dataset.select(
                    outcome.metadata[name],
                    where=[f"{factor.metadata[name]} == '{level}'"])
                per_level[level] = bs.bootstrap(selected.to_numpy(),
                                                stat_func=bs_stats.median)

    return per_level

Пример #17

0

Показать файл

def get_data(dataset: Dataset, var: VarData):
    """Select a variable's column from the dataset, filtered by its stored query."""
    column = var.metadata[name]
    condition = f"{var.metadata[query]}"
    return dataset.select(column, where=condition)

Пример #18

0

Показать файл

def bootstrap(dataset: Dataset, predictions, combined_data: CombinedData):
    """Bootstrap per-category medians and compare their confidence intervals.

    The median of the explained variable is bootstrapped for every category
    of every explanatory variable. The intervals of the first explanatory
    variable's categories are then compared against the first category's
    interval to produce a textual p-value description.

    Args:
        dataset: Queried through ``dataset.select`` for each category.
        predictions: Optional sequence; only its first element (or the first
            element of that element, if it is a list) is recorded.
        combined_data: Supplies the variables and ``alpha``.

    Returns:
        A ``TestResult`` whose statistic maps each category to its
        ``(lower_bound, upper_bound)`` pair and whose ``table`` holds the raw
        bootstrap results. ``p_value`` stays None when there is only one
        category to compare.
    """
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            for level in [k for k, _ in x.metadata[categories].items()]:
                cat_data = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{level}'"])
                calculations[level] = bs.bootstrap(cat_data.to_numpy(),
                                                   stat_func=bs_stats.median)

    prediction = None
    if predictions:
        head = predictions[0]
        prediction = head[0] if isinstance(head, list) else head

    x = xs[0]  # We should do this for the prediction, only....?
    test_statistic = {
        level: (calculations[level].lower_bound,
                calculations[level].upper_bound)
        for level in [k for k, _ in x.metadata[categories].items()]
    }

    alpha = combined_data.alpha
    p_val = None
    reference = None
    for bounds in test_statistic.values():
        if reference is None:
            # Compare against None explicitly: a lower bound of 0 is falsy
            # and used to be mistaken for "no reference interval yet".
            reference = bounds
            continue
        ref_lb, ref_ub = reference
        # Closed intervals overlap when each starts before the other ends;
        # this also covers one interval fully containing the other.
        if bounds[0] <= ref_ub and bounds[1] >= ref_lb:
            p_val = f'Greater than or equal to {alpha}'
        else:
            p_val = f'Less than {alpha}'

    test_result = TestResult(name="Bootstrap",
                             test_statistic=test_statistic,
                             p_value=p_val,
                             prediction=prediction,
                             dof=None,
                             alpha=combined_data.alpha,
                             table=calculations)

    return test_result

Пример #19

0

Показать файл

Файл: build.py Проект: jtloong/tea-lang

def load_data_from_url(url: str, name: str):
    """Return the result of ``Dataset.load`` for the given URL and name."""
    loaded = Dataset.load(url, name)
    return loaded

Пример #20

0

Показать файл

Файл: build.py Проект: jtloong/tea-lang

def load_data(source_name: str, vars: list, pid: str):
    """Construct a ``Dataset`` from a source name, variable list and pid."""
    dataset = Dataset(source_name, vars, pid)
    return dataset

Пример #21

0

Показать файл

def load_data(source_name: Union[str, Path, pd.DataFrame], vars: list,
              pid: str):
    """Construct a ``Dataset`` from a source, variable list and pid.

    Per the annotation, the source may be a string, a ``Path`` or a
    ``DataFrame``; it is passed to ``Dataset`` unchanged.
    """
    dataset = Dataset(source_name, vars, pid)
    return dataset

Пример #22

0

Показать файл

Файл: evaluate.py Проект: jtloong/tea-lang

def _evaluate_ordering(dataset, expr, assumptions, design, compare, symbol,
                       label):
    """Evaluate one ordering comparison node (``<``, ``<=``, ``>``, ``>=``).

    Shared implementation for the LessThan / LessThanEqual / GreaterThan /
    GreaterThanEqual branches of ``evaluate``.

    Args:
        dataset: Dataset being analysed; only ``pid_col_name`` is read here.
        expr: Comparison node exposing ``lhs`` and ``rhs`` sub-expressions.
        assumptions: Forwarded unchanged to the recursive ``evaluate`` calls.
        design: Forwarded unchanged to the recursive ``evaluate`` calls.
        compare: Two-argument predicate implementing the operator.
        symbol: Operator text written into ``metadata['query']``.
        label: Human-readable operator name used in error messages.

    Returns:
        A ``VarData`` built from the rhs metadata with ``'query'`` set.

    Raises:
        ValueError: lhs has no metadata, lhs is NOMINAL, an ORDINAL lhs is
            compared against something that is neither ``str`` nor integer,
            or ``expr.rhs`` is neither a ``Literal`` nor a ``Variable``.
        Exception: lhs dtype is not NOMINAL/ORDINAL/INTERVAL/RATIO.
    """
    lhs = evaluate(dataset, expr.lhs, assumptions, design)
    rhs = evaluate(dataset, expr.rhs, assumptions, design)
    assert isinstance(lhs, VarData)
    assert isinstance(rhs, VarData)

    # NOTE(review): Equal/NotEqual start from lhs.metadata, whereas the
    # ordering comparisons have always started from rhs.metadata. Kept as
    # found -- confirm which side is intended.
    metadata = rhs.metadata

    if not lhs.metadata:
        raise ValueError(
            'Malformed Relation. Filter on Variables must have variable as rhs'
        )

    dtype = lhs.metadata['dtype']
    if dtype is DataType.NOMINAL:
        raise ValueError(f'Cannot compare nominal values with {label}')
    elif dtype is DataType.ORDINAL:
        # TODO May want to add a case should RHS and LHS both be variables
        comparison = rhs.dataframe.iloc[0]
        if isinstance(comparison, str):
            # Compare through the category mapping on both sides.
            categories = lhs.metadata['categories']  # OrderedDict
            ids = [
                i for i, x in enumerate(lhs.dataframe)
                if compare(categories[x], categories[comparison])
            ]
        elif np.issubdtype(comparison, np.integer):
            # Integer threshold: compare the mapped category value directly.
            categories = lhs.metadata['categories']  # OrderedDict
            ids = [
                i for i, x in enumerate(lhs.dataframe)
                if compare(categories[x], comparison)
            ]
        else:
            raise ValueError(
                f"Cannot compare ORDINAL variables to {type(rhs.dataframe.iloc[0])}"
            )
    elif dtype is DataType.INTERVAL or dtype is DataType.RATIO:
        comparison = rhs.dataframe.iloc[0]
        ids = [
            i for i, x in enumerate(lhs.dataframe) if compare(x, comparison)
        ]
    else:
        raise Exception(f"Invalid {label} Operation:{lhs} {symbol} {rhs}")

    # Translate positional hits into the series' own index labels, then
    # build the filtered series keyed by those labels.
    p_ids = [lhs.dataframe.index.values[i] for i in ids]
    dataframe = pd.Series(lhs.dataframe, p_ids)
    dataframe.index.name = dataset.pid_col_name
    # NOTE(review): the filtered series is computed but never returned;
    # only the query string below leaves this function.

    if isinstance(expr.rhs, Literal):
        # NOTE(review): unlike Equal, the literal's value is not embedded
        # in the query (it is always ''). Kept as found -- confirm.
        metadata['query'] = f" {symbol} ''"
    elif isinstance(expr.rhs, Variable):
        metadata['query'] = f" {symbol} {rhs.metadata['var_name']}"
    else:
        raise ValueError(f"Not implemented for {rhs}")

    return VarData(metadata)


def evaluate(dataset: Dataset,
             expr: Node,
             assumptions: Dict[str, str],
             design: Dict[str, str] = None):
    """Recursively evaluate an expression node against ``dataset``.

    Dispatches on the type of ``expr``:

    * ``Variable`` -- returns a ``VarData`` wrapping the variable's metadata
      (from ``dataset.get_variable_data``) plus ``var_name`` and an empty
      ``query``.
    * ``Literal`` -- returns a ``VarData`` holding a series filled with the
      literal value, indexed like ``dataset.data``.
    * ``Equal`` / ``NotEqual`` / ``LessThan`` / ``LessThanEqual`` /
      ``GreaterThan`` / ``GreaterThanEqual`` -- evaluates both operands and
      returns a ``VarData`` whose metadata carries the comparison as a
      ``query`` string.
    * ``Relate`` -- evaluates every variable, determines the study type,
      assigns roles, synthesizes and executes the statistical tests
      (falling back to ``'bootstrap'`` when none are valid) and returns a
      ``ResultData``; when more than one prediction is given each one is
      evaluated recursively and attached as a follow-up.
    * ``PositiveRelationship`` -- rewritten into a ``Relate`` over the two
      operand variables and evaluated.

    Args:
        dataset: Dataset to evaluate against.
        expr: Expression node to evaluate.
        assumptions: User assumptions; ``assumptions['alpha']`` is read (and
            converted with ``float``) for ``Relate`` expressions.
        design: Optional study design, forwarded to the helpers that take it.

    Returns:
        ``VarData`` for variables, literals and comparisons; ``ResultData``
        for relations; ``None`` for any other node type.

    Raises:
        ValueError: for malformed comparisons (see ``_evaluate_ordering``),
            unsupported right-hand sides, or a ``Relate`` variable that
            evaluates to a falsy value.
    """
    if isinstance(expr, Variable):
        metadata = dataset.get_variable_data(expr.name)  # (dtype, categories)
        metadata['var_name'] = expr.name
        metadata['query'] = ''
        return VarData(metadata)

    elif isinstance(expr, Literal):
        # Series filled with the literal value, aligned to the dataset index.
        data = pd.Series(
            [expr.value] * len(dataset.data),
            index=dataset.data.index)
        metadata = dict()
        metadata['var_name'] = ''  # because not a var in the dataset
        metadata['query'] = ''
        metadata['value'] = expr.value
        # NOTE(review): VarData receives (data, metadata) here but only
        # (metadata) everywhere else -- confirm the constructor signature.
        return VarData(data, metadata)

    elif isinstance(expr, Equal):
        # Fix: the recursive calls previously omitted the required
        # `assumptions` argument and raised TypeError.
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        assert isinstance(lhs, VarData)
        assert isinstance(rhs, VarData)

        # NOTE(review): computed but never returned.
        dataframe = lhs.dataframe[lhs.dataframe == rhs.dataframe]
        metadata = lhs.metadata
        if isinstance(expr.rhs, Literal):
            # override lhs metadata for query
            metadata['query'] = f" == \'{rhs.metadata['value']}\'"
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" == {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")

        return VarData(metadata)

    elif isinstance(expr, NotEqual):
        rhs = evaluate(dataset, expr.rhs, assumptions, design)
        lhs = evaluate(dataset, expr.lhs, assumptions, design)
        assert isinstance(rhs, VarData)
        assert isinstance(lhs, VarData)

        # NOTE(review): computed but never returned.
        dataframe = lhs.dataframe[lhs.dataframe != rhs.dataframe]
        metadata = lhs.metadata
        if isinstance(expr.rhs, Literal):
            # NOTE(review): unlike Equal, the literal's value is not
            # embedded in the query. Kept as found -- confirm.
            metadata['query'] = " != \'\'"  # override lhs metadata for query
        elif isinstance(expr.rhs, Variable):
            metadata['query'] = f" != {rhs.metadata['var_name']}"
        else:
            raise ValueError(f"Not implemented for {rhs}")
        return VarData(metadata)

    elif isinstance(expr, LessThan):
        return _evaluate_ordering(dataset, expr, assumptions, design,
                                  lambda a, b: a < b, '<', 'Less Than')

    elif isinstance(expr, LessThanEqual):
        return _evaluate_ordering(dataset, expr, assumptions, design,
                                  lambda a, b: a <= b, '<=',
                                  'Less Than Equal')

    elif isinstance(expr, GreaterThan):
        return _evaluate_ordering(dataset, expr, assumptions, design,
                                  lambda a, b: a > b, '>', 'Greater Than')

    elif isinstance(expr, GreaterThanEqual):
        return _evaluate_ordering(dataset, expr, assumptions, design,
                                  lambda a, b: a >= b, '>=',
                                  'Greater Than Equal')

    elif isinstance(expr, Relate):
        variables = []

        for v in expr.vars:
            # Fix: `design` used to be passed in the `assumptions` slot.
            eval_v = evaluate(dataset, v, assumptions, design)

            # evaluate() returns None for node types it does not recognise.
            if not eval_v:
                raise ValueError(
                    "The variables you are referencing are not defined as variables in your list of variables."
                )
            assert isinstance(eval_v, VarData)

            variables.append(eval_v)

        # What kind of study are we analyzing?
        study_type = determine_study_type(variables, design)

        # Assign roles to variables we are analyzing
        variables = assign_roles(variables, study_type, design)

        # Exactly two variables -> bivariate analysis, otherwise multivariate.
        if len(variables) == 2:
            combined_data = BivariateData(variables,
                                          study_type,
                                          alpha=float(assumptions['alpha']))
        else:
            combined_data = MultivariateData(variables,
                                             study_type,
                                             alpha=float(assumptions['alpha']))

        # Add paired property
        add_paired_property(dataset, combined_data, study_type,
                            design)  # check sample sizes are identical

        # Infer stats tests (mingled with)
        tests = synthesize_tests(dataset, assumptions, combined_data)

        # Execute and store results from each valid test
        if not tests:
            tests.append('bootstrap')  # Default to bootstrap

        results = {}
        for test in tests:
            results[test] = execute_test(dataset, design, expr.predictions,
                                         combined_data, test)

        res_data = ResultData(results, combined_data)

        follow_up = []

        # There are multiple hypotheses to follow-up and correct for
        if expr.predictions and len(expr.predictions) > 1:
            for pred in expr.predictions:
                # Each prediction is itself an expression node; evaluate it
                # recursively and keep its result as a follow-up.
                follow_up.append(evaluate(dataset, pred, assumptions, design))

        res_data.add_follow_up(follow_up)
        # TODO: use a handle here to more generally/modularly support
        # corrections for multiple comparisons; needs a more generic data
        # structure.
        return res_data

    elif isinstance(expr, PositiveRelationship):
        # Rewrite as a Relate over the two operand variables.
        pos_relate_expr = Relate([expr.lhs.var, expr.rhs.var])
        return evaluate(dataset, pos_relate_expr, assumptions, design)

    # Unrecognised node type: callers (see the Relate branch) treat a falsy
    # result as "not defined".
    return None

Python Dataset примеры использования