예제 #1
0
def test_contrast():
    """Exercise categorical coding of factor ``a`` through ``make_matrix``.

    The same four observations are coded with the default contrast, with
    ``Sum`` (given both as a class and as an instance), with ``Sum(omit=0)``
    and with two hand-written contrasts.  Each resulting matrix is compared
    against fixed expected rows.
    """
    from patsy.contrasts import ContrastMatrix, Sum

    values = ["a1", "a3", "a1", "a2"]

    # Expected rows that are checked more than once below.  The two Sum
    # tables are reference output from R.
    expect_sum = [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]]
    expect_sum_omit0 = [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]
    expect_custom = [[7, 12], [8, -1], [7, 12], [2, 13]]

    # ---- Model without an intercept: 'a' is coded full-rank ----
    full_rank = make_matrix(
        {"a": C(values)}, 3, [["a"]],
        column_names=["a[a1]", "a[a2]", "a[a3]"])
    assert np.allclose(full_rank,
                       [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]])

    # Passing the class or an instance must give the same coding.
    for coding in (Sum, Sum()):
        coded = make_matrix(
            {"a": C(values, coding)}, 3, [["a"]],
            column_names=["a[mean]", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(coded, expect_sum)

    coded = make_matrix(
        {"a": C(values, Sum(omit=0))}, 3, [["a"]],
        column_names=["a[mean]", "a[S.a2]", "a[S.a3]"])
    assert np.allclose(coded, expect_sum_omit0)

    # ---- Model with an intercept: 'a' is coded with reduced rank ----
    reduced = make_matrix(
        {"a": C(values)}, 3, [[], ["a"]],
        column_names=["Intercept", "a[T.a2]", "a[T.a3]"])
    assert np.allclose(reduced,
                       [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]])

    for coding in (Sum, Sum()):
        coded = make_matrix(
            {"a": C(values, coding)}, 3, [[], ["a"]],
            column_names=["Intercept", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(coded, expect_sum)

    coded = make_matrix(
        {"a": C(values, Sum(omit=0))}, 3, [[], ["a"]],
        column_names=["Intercept", "a[S.a2]", "a[S.a3]"])
    assert np.allclose(coded, expect_sum_omit0)

    # ---- Ad hoc, less-than-full-rank codings of 'a' ----
    # A bare nested list gets automatically generated column suffixes.
    adhoc = make_matrix(
        {"a": C(values, [[7, 12], [2, 13], [8, -1]])}, 2, [["a"]],
        column_names=["a[custom0]", "a[custom1]"])
    assert np.allclose(adhoc, expect_custom)

    # An explicit ContrastMatrix supplies its own column suffixes.
    named_contrast = ContrastMatrix([[7, 12], [2, 13], [8, -1]],
                                    ["[foo]", "[bar]"])
    adhoc = make_matrix(
        {"a": C(values, named_contrast)}, 2, [["a"]],
        column_names=["a[foo]", "a[bar]"])
    assert np.allclose(adhoc, expect_custom)
예제 #2
0
    def fit_sum_coding(col, values, handle_missing, handle_unknown):
        """Build the sum-contrast lookup table for one column.

        Parameters
        ----------
        col :
            Column identifier; ``str(col)`` prefixes the generated column
            names (``<col>_0``, ``<col>_1``, ...).
        values :
            Series-like object whose entries are the codes to encode.  It is
            filtered with a boolean mask and read with ``.loc``.
            NOTE(review): presumably a pandas Series mapping category ->
            ordinal code; confirm against the caller.
        handle_missing : str
            ``'value'`` drops non-positive codes before coding and adds an
            all-zero row labelled ``-2``; ``'return_nan'`` sets the row
            labelled ``values.loc[np.nan]`` to NaN.
        handle_unknown : str
            ``'indicator'`` codes ``-1`` as one more level; ``'return_nan'``
            adds a NaN row labelled ``-1``; ``'value'`` adds an all-zero row
            labelled ``-1``.

        Returns
        -------
        pandas.DataFrame
            One row per encoded code.  When fewer than two codes remain, the
            frame has those codes as index and no columns.
        """
        if handle_missing == 'value':
            values = values[values > 0]

        # ``Series.get_values()`` was removed in pandas 1.0; ``np.asarray``
        # yields the same dense ndarray on every pandas version.
        values_to_encode = np.asarray(values)

        if len(values) < 2:
            return pd.DataFrame(index=values_to_encode)

        if handle_unknown == 'indicator':
            values_to_encode = np.append(values_to_encode, -1)

        sum_contrast_matrix = Sum().code_without_intercept(
            values_to_encode.tolist())
        n_columns = len(sum_contrast_matrix.column_suffixes)
        df = pd.DataFrame(
            data=sum_contrast_matrix.matrix,
            index=values_to_encode,
            columns=['%s_%d' % (col, i) for i in range(n_columns)])

        if handle_unknown == 'return_nan':
            df.loc[-1] = np.nan
        elif handle_unknown == 'value':
            df.loc[-1] = np.zeros(len(values_to_encode) - 1)

        if handle_missing == 'return_nan':
            df.loc[values.loc[np.nan]] = np.nan
        elif handle_missing == 'value':
            df.loc[-2] = np.zeros(len(values_to_encode) - 1)

        return df
예제 #3
0
def MakeLinearSinus(t, k=(1, 2, 3), trend=False, YAM=False):
    """Build a harmonic design matrix with a base period of 365 samples.

    Parameters
    ----------
    t : numpy.ndarray
        Sample positions; reshaped to a column of length ``len(t)``.
    k : sequence of numbers, optional
        Harmonic multipliers.  One sine and one cosine column is produced
        per entry.
    trend : bool, optional
        When true, ``t`` itself is inserted after the intercept column.
        Takes precedence over ``YAM``.
    YAM : bool, optional
        When true (and ``trend`` is false), sum-coded columns for the
        integer part of ``t / 365`` are inserted after the intercept column.

    Returns
    -------
    numpy.ndarray
        Columns in order: intercept, optional trend or year block,
        ``sin`` terms, ``cos`` terms.
    """
    N = len(t)
    freq = 365
    t_col = t.reshape(N, 1)
    K = np.repeat(np.array([k]), N, axis=0)
    fix = 2 * np.pi / freq
    # Argument of the harmonics for every (sample, multiplier) pair.
    Fix = t_col * fix * K

    columns = [np.ones((N, 1))]
    if trend:
        columns.append(t_col)
    elif YAM:
        year = (t / 365).astype("int")
        # Map each year to its position among the sorted distinct years.
        # Indexing the contrast matrix by the raw year value only works
        # when the years happen to be exactly 0..n-1.
        year_levels, year_codes = np.unique(year, return_inverse=True)
        contrast = Sum().code_without_intercept(year_levels.tolist())
        columns.append(contrast.matrix[year_codes.ravel(), :])
    columns.append(np.sin(Fix))
    columns.append(np.cos(Fix))
    return np.concatenate(columns, axis=1)
예제 #4
0
    def fit_sum_coding(values):
        """Return the sum-contrast table for ``values`` as a DataFrame.

        With fewer than two values an empty DataFrame is returned.
        Otherwise the contrast rows are relabelled to start at 1 and an
        all-zero row labelled 0 is appended.
        """
        n_levels = len(values)
        if n_levels < 2:
            return pd.DataFrame()

        coding = Sum().code_without_intercept(values)
        table = pd.DataFrame(data=coding.matrix,
                             columns=coding.column_suffixes)
        # Shift the default 0-based row labels up by one so label 0 is free.
        table.index = table.index + 1
        table.loc[0] = np.zeros(n_levels - 1)
        return table
예제 #5
0
def remove_batch_effect(X, batches, coefs=None):
    """Python version of limma::removeBatchEffect.

    This should duplicate the original R code here (for the case
    where there is only a single vector of batches):
    https://rdrr.io/bioc/limma/src/R/removeBatchEffect.R

    Parameters
    ----------
    X :
        n x p data matrix (n samples, p features).
    batches :
        Length-n sequence of batch labels.  Labels are mapped to rows of the
        sum-coding matrix through their position among the sorted distinct
        labels, so integer indexes 0..k-1 behave exactly as before.
    coefs : optional
        Previously fitted coefficients of shape (p, m), where m is the
        number of columns of the design matrix (number of batches minus
        one).  When omitted, they are fitted from ``X``.

    Returns
    -------
    tuple
        ``(corrected_X, coefs)`` with ``coefs`` of shape (p, m).

    Raises
    ------
    ValueError
        If ``batches`` contains missing labels.
    """
    from patsy.contrasts import Sum

    # use sum coding to code batches, this is what limma does
    # https://www.statsmodels.org/dev/examples/notebooks/generated/contrasts.html#Sum-(Deviation)-Coding
    # this is a bit easier/more intuitive in R, due to its built-in factor
    # type, but we can sort of emulate it here with pandas categorical data
    batches_df = pd.Series(batches, dtype='category')
    batch_codes = batches_df.cat.codes.values
    if (batch_codes < 0).any():
        # pandas codes missing labels as -1, which would silently select the
        # last contrast row.
        raise ValueError("batches must not contain missing values")

    contrast = Sum().code_without_intercept(list(batches_df.cat.categories))
    # Row i of the contrast matrix codes the i-th category.
    design = contrast.matrix[batch_codes, :]

    # if coefficients are provided, just use them to correct the provided data
    # otherwise fit the model and correct the provided data
    if coefs is None:
        from sklearn.linear_model import LinearRegression
        # X is an n x p matrix
        # design is an n x m matrix of sum-coded batch indicators
        # we want to find a p x m matrix of coefficients
        reg = LinearRegression().fit(design, X)
        # per sklearn documentation, for multiple targets the coef_ is
        # always an (n_targets, n_features) array (i.e. p x m)
        assert reg.coef_.shape == (X.shape[1], design.shape[1])
        coefs = reg.coef_

    return X - (design.astype(float) @ coefs.T), coefs
예제 #6
0
# Show the contrast matrix built earlier in the script.
print(contrast.matrix)

# Fit `write` on `race` using the `Simple` coding and print the fit summary.
mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()
print(res.summary())

# ### Sum (Deviation) Coding

# Sum coding compares the mean of the dependent variable for a given level
# to the overall mean of the dependent variable over all the levels. That
# is, it uses contrasts between each of the first k-1 levels and level k. In
# this example, level 1 is compared to all the others, level 2 to all the
# others, and level 3 to all the others.

from patsy.contrasts import Sum
# Rebind `contrast` to the sum-coding matrix for `levels` and display it.
contrast = Sum().code_without_intercept(levels)
print(contrast.matrix)

# Refit the same model with `race` sum-coded; `mod` and `res` are rebound.
mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print(res.summary())

# This corresponds to a parameterization that forces all the coefficients
# to sum to zero. Notice that the intercept here is the grand mean where the
# grand mean is the mean of means of the dependent variable by each level.

# Mean of the per-race means of `write`. The value is neither stored nor
# printed by this statement.
hsb2.groupby('race')['write'].mean().mean()

# ### Backward Difference Coding

# In backward difference coding, the mean of the dependent variable for a
예제 #7
0
def contrasting():
    """Fit the model under five contrast codings and report each fit.

    Does nothing when the module-level contrast specification ``c`` is
    falsy.  Otherwise it:

    1. Parses ``c``: a comma-separated value is split into several contrast
       variables (each stripped, spaces replaced by ``_``, reduced to the
       text after the last ``/``).  A single value is reduced to the text
       after its last ``/`` and stored back into the global ``c``.
    2. Normalizes every entry of ``full_model_variable_list`` in place the
       same way.
    3. Fits ``dep_var`` on the remaining variables plus the contrast term
       with Treatment, Simple, Sum, Diff and Helmert coding, printing a
       description and the fit summary each time.
    4. When the module-level ``o`` is not ``None``, appends the same text
       to the file at that path.

    Reads the module-level names ``c``, ``full_model_variable_list``,
    ``dep_var``, ``df_final``, ``levels``, ``o`` and ``full_model``.
    """
    global c
    if c:
        # to account for multiple contrast variables
        contrastvars = []
        if "," in c:
            for raw_name in c.split(","):
                cleaned = raw_name.strip().replace(" ", "_")
                # keep only the last path segment, to account for URLs
                contrastvars.append(cleaned.split("/")[-1])
        else:
            # Single contrast variable.  This used to be a for/else clause
            # that also ran for comma-separated input and clobbered ``c``.
            c = c.split("/")[-1]  # to account for URLs

        # Normalize the model variables in place: last URL segment, with
        # spaces replaced by underscores.
        for i, var in enumerate(full_model_variable_list):
            full_model_variable_list[i] = var.split("/")[-1].replace(" ", "_")

        ind_vars_no_contrast_var = " + ".join(
            var for var in full_model_variable_list
            if var != c and var not in contrastvars)

        if contrastvars:
            contraststring = ' + '.join(contrastvars)
        else:
            c = c.replace(" ", "_")
            contraststring = c

        # Shared left part of every formula; only the coding name differs.
        # NOTE(review): the ols() calls stay inline in this function because
        # the formula names the locally defined ``Simple`` class, which
        # presumably has to be resolvable from the calling scope.
        formula_head = (dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                        contraststring + ", ")

        treatment_note = "Treatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
        simple_note = "Simple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."
        sum_note = "Sum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."
        diff_note = "Backward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."
        helmert_note = "Helmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."

        # With contrast (treatment coding)
        print("\n\n" + treatment_note)
        # NOTE(review): ``ctrst`` is not used by any fit in this function;
        # the calls are kept so the function behaves as before.
        ctrst = Treatment(reference=0).code_without_intercept(levels)
        mod = ols(formula_head + "Treatment)", data=df_final)
        res = mod.fit()
        print("With contrast (treatment coding)")
        print(res.summary())
        if o is not None:
            # Append this section to the output file.  ``with`` closes the
            # handle even if a write fails.
            with open(o, "a") as f:
                f.write("\n" + full_model)
                f.write(
                    "\n\n***********************************************************************************************************"
                )
                f.write("\n\n\n\n" + treatment_note)
                f.write("With contrast (treatment coding)")
                f.write(res.summary().as_text())

        # Defining the Simple class
        def _name_levels(prefix, levels):
            return ["[%s%s]" % (prefix, level) for level in levels]

        class Simple(object):
            def _simple_contrast(self, levels):
                nlevels = len(levels)
                contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
                contr[1:][np.diag_indices(nlevels -
                                          1)] = (nlevels - 1.) / nlevels
                return contr

            def code_with_intercept(self, levels):
                matrix = np.column_stack(
                    (np.ones(len(levels)), self._simple_contrast(levels)))
                return ContrastMatrix(matrix, _name_levels("Simp.", levels))

            def code_without_intercept(self, levels):
                matrix = self._simple_contrast(levels)
                return ContrastMatrix(matrix,
                                      _name_levels("Simp.", levels[:-1]))

        # With contrast (simple coding)
        ctrst = Simple().code_without_intercept(levels)
        mod = ols(formula_head + "Simple)", data=df_final)
        res = mod.fit()
        print("\n\n" + simple_note)
        print(res.summary())
        if o is not None:
            with open(o, "a") as f:
                f.write("\n\n\n" + simple_note)
                f.write(res.summary().as_text())

        # With contrast (sum/deviation coding)
        ctrst = Sum().code_without_intercept(levels)
        mod = ols(formula_head + "Sum)", data=df_final)
        res = mod.fit()
        print("\n\n" + sum_note)
        print(res.summary())
        if o is not None:
            with open(o, "a") as f:
                f.write("\n\n\n" + sum_note)
                f.write(res.summary().as_text())

        # With contrast (backward difference coding)
        ctrst = Diff().code_without_intercept(levels)
        mod = ols(formula_head + "Diff)", data=df_final)
        res = mod.fit()
        print("\n\n" + diff_note)
        print(res.summary())
        if o is not None:
            with open(o, "a") as f:
                f.write("\n\n\n" + diff_note)
                f.write(res.summary().as_text())

        # With contrast (Helmert coding)
        ctrst = Helmert().code_without_intercept(levels)
        mod = ols(formula_head + "Helmert)", data=df_final)
        res = mod.fit()
        print("\n\n" + helmert_note)
        print(res.summary())
        if o is not None:
            with open(o, "a") as f:
                f.write("\n\n\n" + helmert_note)
                f.write(res.summary().as_text())