예제 #1
def test_contrast():
    """Exercise categorical contrast codings through ``make_matrix``.

    The factor ``a`` (levels a1/a2/a3, observed as a1, a3, a1, a2) is coded
    with the default coding, with ``Sum`` (passed both as a class and as an
    instance), with ``Sum(omit=0)``, with a raw list-of-lists contrast and
    with an explicit ``ContrastMatrix``.  Every resulting matrix is compared
    against hard-coded expected rows; the ``Sum`` expectations are the ones
    marked below as output from R.
    """
    from patsy.contrasts import ContrastMatrix, Sum
    values = ["a1", "a3", "a1", "a2"]

    # No intercept in model, full-rank coding of 'a'
    m = make_matrix({"a": C(values)},
                    3, [["a"]],
                    column_names=["a[a1]", "a[a2]", "a[a3]"])

    assert np.allclose(m, [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]])

    # The contrast may be given as the class itself or as an instance.
    for s in (Sum, Sum()):
        m = make_matrix({"a": C(values, s)},
                        3, [["a"]],
                        column_names=["a[mean]", "a[S.a1]", "a[S.a2]"])
        # Output from R
        assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]])

    m = make_matrix({"a": C(values, Sum(omit=0))},
                    3, [["a"]],
                    column_names=["a[mean]", "a[S.a2]", "a[S.a3]"])
    # Output from R
    assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]])

    # Intercept in model, non-full-rank coding of 'a'
    m = make_matrix({"a": C(values)},
                    3, [[], ["a"]],
                    column_names=["Intercept", "a[T.a2]", "a[T.a3]"])

    assert np.allclose(m, [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]])

    for s in (Sum, Sum()):
        m = make_matrix({"a": C(values, s)},
                        3, [[], ["a"]],
                        column_names=["Intercept", "a[S.a1]", "a[S.a2]"])
        # Output from R
        assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]])

    m = make_matrix({"a": C(values, Sum(omit=0))},
                    3, [[], ["a"]],
                    column_names=["Intercept", "a[S.a2]", "a[S.a3]"])
    # Output from R
    assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]])

    # Weird ad hoc less-than-full-rank coding of 'a'
    m = make_matrix({"a": C(values, [[7, 12], [2, 13], [8, -1]])},
                    2, [["a"]],
                    column_names=["a[custom0]", "a[custom1]"])
    assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]])

    # Same ad hoc matrix, wrapped in a ContrastMatrix so that the column
    # suffixes ("[foo]", "[bar]") are supplied explicitly.
    m = make_matrix({"a": C(values,
                            ContrastMatrix([[7, 12], [2, 13], [8, -1]],
                                           ["[foo]", "[bar]"]))},
                    2, [["a"]],
                    column_names=["a[foo]", "a[bar]"])
    assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]])
예제 #2
    def fit_sum_coding(col, values, handle_missing, handle_unknown):
        """Build the sum (deviation) coding lookup table for one column.

        Parameters
        ----------
        col
            Column identifier; ``str(col)`` prefixes the generated column
            names (``<col>_0``, ``<col>_1``, ...).
        values
            pandas Series of the encoded level values.  It is filtered with
            ``values > 0`` and looked up with ``values.loc[np.nan]``, so it
            presumably maps original categories (index) to ordinal codes
            (values) -- verify against the caller.
        handle_missing : str
            ``'value'`` drops non-positive codes before coding and adds an
            all-zero row labelled ``-2``; ``'return_nan'`` sets the row(s)
            selected by ``values.loc[np.nan]`` to NaN.
        handle_unknown : str
            ``'indicator'`` codes ``-1`` as an extra level; ``'return_nan'``
            adds a NaN row labelled ``-1``; ``'value'`` adds an all-zero row
            labelled ``-1``.

        Returns
        -------
        pandas.DataFrame
            One row per encoded level (index) and one column per contrast.
            When fewer than two levels remain, a frame with that index and
            no columns is returned.
        """
        if handle_missing == 'value':
            values = values[values > 0]

        # Series.get_values() was removed in pandas 1.0; .values yields the
        # same ndarray and works on old and new pandas alike.
        values_to_encode = values.values

        if len(values) < 2:
            return pd.DataFrame(index=values_to_encode)

        if handle_unknown == 'indicator':
            values_to_encode = np.append(values_to_encode, -1)

        sum_contrast_matrix = Sum().code_without_intercept(
            values_to_encode.tolist())
        # Rows are labelled by level so the df.loc[...] assignments below
        # address (or append) rows by level value.
        df = pd.DataFrame(
            data=sum_contrast_matrix.matrix,
            index=values_to_encode,
            columns=[
                str(col) + '_%d' % (i, )
                for i in range(len(sum_contrast_matrix.column_suffixes))
            ])

        if handle_unknown == 'return_nan':
            df.loc[-1] = np.nan
        elif handle_unknown == 'value':
            df.loc[-1] = np.zeros(len(values_to_encode) - 1)

        if handle_missing == 'return_nan':
            df.loc[values.loc[np.nan]] = np.nan
        elif handle_missing == 'value':
            df.loc[-2] = np.zeros(len(values_to_encode) - 1)

        return df
예제 #3
# Assemble a regression design matrix ``Xm`` from the time vector ``t``.
# Every visible variant starts with a column of ones; the ``trend`` variant
# adds ``t`` itself and the ``YAM`` variant adds sum-coded year columns.
# NOTE(review): this listing is truncated -- each ``np.concatenate`` call is
# cut off before its closing bracket and a branch header (likely ``else:``)
# seems to be missing ahead of the third call.  The remaining columns are
# presumably derived from ``Fix`` (otherwise unused here); confirm against the
# complete source before relying on this function.
def MakeLinearSinus(t, k=[1, 2, 3], trend=False, YAM=False):
    # Number of samples.
    N = len(t)
    # Period length; presumably days per year -- TODO confirm.
    freq = 365
    # ``k`` repeated on N rows -> shape (N, len(k)).
    K = np.repeat(np.array([k]), N, axis=0)
    # Angular step per unit of ``t`` for one full period of ``freq``.
    fix = 2 * np.pi / freq
    # ``t`` as a column, scaled by ``fix`` and by every multiplier in ``k``.
    Fix = t.reshape(N, 1) * fix * K
    if trend:
        # Ones column, then ``t`` as a column (rest of the list is missing).
        Xm = np.concatenate([
            np.array([1] * N).reshape(N, 1),
            t.reshape(N, 1),
    elif YAM:
        # Integer part of t / 365 for every sample.
        year = ((t / 365).astype("int"))
        # Sum-code the distinct year values, then pick one contrast row per
        # sample using the year number itself as the row position.
        # NOTE(review): assumes the year numbers are valid row positions of
        # the contrast matrix -- confirm.
        year = Sum().code_without_intercept(list(set(year))).matrix[year, :]
        #year=sm.tools.categorical(year, drop=True)
        # Ones column followed by the year columns (rest of the list is missing).
        Xm = np.concatenate(
            [np.array([1] * N).reshape(N, 1), year,
        # Ones column only is visible here (rest of the list is missing).
        Xm = np.concatenate(
            [np.array([1] * N).reshape(N, 1),
    return Xm
예제 #4
    def fit_sum_coding(values):
        """Return the sum-coding lookup table for ``values``.

        With fewer than two levels an empty DataFrame is returned.  Otherwise
        the rows of the sum contrast matrix are labelled 1..len(values) and
        an extra all-zero row labelled 0 is appended; the columns carry the
        contrast's column suffixes.
        """
        level_count = len(values)
        if level_count < 2:
            return pd.DataFrame()

        coding = Sum().code_without_intercept(values)
        table = pd.DataFrame(
            data=coding.matrix,
            columns=coding.column_suffixes,
        )
        # Shift the row labels so they start at 1, freeing label 0.
        table.index = table.index + 1
        # Row 0 is all zeros: one entry per contrast column.
        table.loc[0] = np.zeros(level_count - 1)
        return table
예제 #5
def remove_batch_effect(X, batches, coefs=None):
    """Python version of limma::removeBatchEffect.

    This should duplicate the original R code for the case where there is
    only a single vector of batches.

    For now, batches needs to be integer indexes.

    Parameters
    ----------
    X : array-like, n x p
        Data to correct (n samples, p features).
    batches : array-like of length n
        Integer batch index of every sample.  The values are used directly
        as row positions into the sum-coding contrast matrix, so they must
        be valid positions for the number of distinct batches.
    coefs : array, p x m, optional
        Previously fitted batch coefficients, where p is the number of
        features in the original dataset and m is the number of columns of
        the design matrix (the layout of ``LinearRegression.coef_``).  When
        omitted, the coefficients are fitted on ``X``.

    Returns
    -------
    tuple
        ``(corrected, coefs)``: ``X`` minus the fitted batch contribution,
        and the coefficients that were used.
    """
    import numpy as np
    from patsy.contrasts import Sum

    # use sum coding to code batches, this is what limma does
    # https://www.statsmodels.org/dev/examples/notebooks/generated/contrasts.html#Sum-(Deviation)-Coding
    # this is a bit easier/more intuitive in R, due to its built-in factor
    # type, but we can sort of emulate it here with pandas categorical data
    batches_df = pd.Series(batches, dtype='category')
    contrast = Sum().code_without_intercept(list(batches_df.cat.categories))
    # np.asarray lets plain sequences be passed as well as arrays
    design = contrast.matrix[np.asarray(batches).astype(int), :]

    # if coefficients are provided, just use them to correct the provided data
    # otherwise fit the model and correct the provided data
    if coefs is None:
        from sklearn.linear_model import LinearRegression
        # X is an n x p matrix
        # design is an n x m matrix of sum-coded batch indicators
        # we want to find a p x m matrix of coefficients
        reg = LinearRegression().fit(design, X)
        # per sklearn documentation, for multiple targets the coef_ is
        # always an (n_targets, n_features) array (i.e. p x m)
        assert reg.coef_.shape == (X.shape[1], design.shape[1])
        coefs = reg.coef_

    # (n x m) @ (m x p) gives the per-sample batch contribution to subtract
    return X - (design.astype(float) @ coefs.T), coefs
예제 #6

# Fit an OLS model of `write` on `race`, coding `race` with the Simple contrast.
mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()

# ### Sum (Deviation) Coding

# Sum coding compares the mean of the dependent variable for a given level
# to the overall mean of the dependent variable over all the levels. That
# is, it uses contrasts between each of the first k-1 levels and level k. In
# this example, level 1 is compared to all the others, level 2 to all the
# others, and level 3 to all the others.

from patsy.contrasts import Sum
# Sum-coding contrast matrix for `levels`, without an intercept column.
contrast = Sum().code_without_intercept(levels)

# Fit the same formula again, this time with `race` sum-coded.
mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()

# This corresponds to a parameterization that forces all the coefficients
# to sum to zero. Notice that the intercept here is the grand mean, where the
# grand mean is the mean of means of the dependent variable by each level.


# ### Backward Difference Coding

# In backward difference coding, the mean of the dependent variable for a
예제 #7
def contrasting():
    global c
    if c:
        #to account for multiple contrast variables
        contrastvars = []
        if "," in c:
            contrastvars = c.split(",")
        for i in range(len(contrastvars)):
            contrastvars[i] = contrastvars[i].strip()
            if " " in contrastvars[i]:
                contrastvars[i] = contrastvars[i].replace(" ", "_")
            if "/" in contrastvars[i]:  #to account for URLs
                splitted = contrastvars[i].split("/")
                contrastvars[i] = splitted[len(splitted) - 1]
            splitted = c.split("/")  #to account for URLs
            c = splitted[len(splitted) - 1]

        ind_vars_no_contrast_var = ''
        index = 1
        for i in range(len(full_model_variable_list)):
            if "/" in full_model_variable_list[i]:
                splitted = full_model_variable_list[i].split("/")
                full_model_variable_list[i] = splitted[len(splitted) - 1]
            if " " in full_model_variable_list[i]:
                full_model_variable_list[i] = full_model_variable_list[
                    i].replace(" ", "_")
        for var in full_model_variable_list:
            if var != c and not (var in contrastvars):
                if index == 1:
                    ind_vars_no_contrast_var = var
                    index += 1
                    ind_vars_no_contrast_var = ind_vars_no_contrast_var + " + " + var
        if len(contrastvars) > 0:
            contraststring = ' + '.join(contrastvars)
            if " " in c:
                c = c.replace(" ", "_")
            contraststring = c
        # With contrast (treatment coding)
            "\n\nTreatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
        ctrst = Treatment(reference=0).code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Treatment)",
        res = mod.fit()
        print("With contrast (treatment coding)")
        if (o is not None):
            # concatenate data frames
            f = open(o, "a")
            f.write("\n" + full_model)

                "\n\n\n\nTreatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
            f.write("With contrast (treatment coding)")
        # Defining the Simple class
        def _name_levels(prefix, levels):
            """Return one ``[<prefix><level>]`` label per entry of ``levels``."""
            labels = []
            for level in levels:
                labels.append("[%s%s]" % (prefix, level))
            return labels

        class Simple(object):
            """Contrast object exposing the two ``code_*_intercept`` methods.

            For n levels the underlying matrix has n rows and n - 1 columns:
            every cell is -1/n except cell (i + 1, i), which is (n - 1)/n.
            """

            def _simple_contrast(self, levels):
                """Return the n x (n - 1) contrast array for ``levels``."""
                count = len(levels)
                # Start with -1/n everywhere ...
                matrix = np.full((count, count - 1), -1. / count)
                # ... then raise the diagonal of the block below the first row.
                below_first = np.arange(1, count)
                column_ids = np.arange(count - 1)
                matrix[below_first, column_ids] = (count - 1.) / count
                return matrix

            def code_with_intercept(self, levels):
                """Prepend a column of ones and wrap in a ContrastMatrix."""
                intercept = np.ones(len(levels))
                coded = np.column_stack(
                    (intercept, self._simple_contrast(levels)))
                suffixes = _name_levels("Simp.", levels)
                return ContrastMatrix(coded, suffixes)

            def code_without_intercept(self, levels):
                """Wrap the bare contrast; columns are named after all but the last level."""
                coded = self._simple_contrast(levels)
                suffixes = _name_levels("Simp.", levels[:-1])
                return ContrastMatrix(coded, suffixes)

        ctrst = Simple().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Simple)",
        res = mod.fit()
            "\n\nSimple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."
        if (o is not None):
            # concatenate data frames
            f = open(o, "a")
                "\n\n\nSimple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."

        #With contrast (sum/deviation coding)
        ctrst = Sum().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Sum)",
        res = mod.fit()
            "\n\nSum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."
        if (o is not None):
            # concatenate data frames
            f = open(o, "a")
                "\n\n\nSum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."

        #With contrast (backward difference coding)
        ctrst = Diff().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Diff)",
        res = mod.fit()
            "\n\nBackward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."
        if (o is not None):
            # concatenate data frames
            f = open(o, "a")
                "\n\n\nBackward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."

        #With contrast (Helmert coding)
        ctrst = Helmert().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Helmert)",
        res = mod.fit()
            "\n\nHelmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."
        if (o is not None):
            # concatenate data frames
            f = open(o, "a")
                "\n\n\nHelmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."