Example No. 1
    def test_weightstats_3(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        w1, w2 = self.w1, self.w2

        d1w_2d = DescrStatsW(x1_2d, weights=w1)
        d2w_2d = DescrStatsW(x2_2d, weights=w2)
        x1r_2d = d1w_2d.asrepeats()
        x2r_2d = d2w_2d.asrepeats()

        assert_almost_equal(x2r_2d.mean(0), d2w_2d.mean, 14)
        assert_almost_equal(x2r_2d.var(0), d2w_2d.var, 14)
        assert_almost_equal(x2r_2d.std(0), d2w_2d.std, 14)
        assert_almost_equal(np.cov(x2r_2d.T, bias=1), d2w_2d.cov, 14)
        assert_almost_equal(np.corrcoef(x2r_2d.T), d2w_2d.corrcoef, 14)

#        print d1w_2d.ttest_mean(3)
#        #scipy.stats.ttest is also vectorized
#        print stats.ttest_1samp(x1r_2d, 3)
        t, p, d = d1w_2d.ttest_mean(3)
        assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
        # print [stats.ttest_1samp(xi, 3) for xi in x1r_2d.T]
        cm = CompareMeans(d1w_2d, d2w_2d)
        ressm = cm.ttest_ind()
        resss = stats.ttest_ind(x1r_2d, x2r_2d)
        assert_almost_equal(ressm[:2], resss, 14)
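
# These assertions all check one property: with integer weights, DescrStatsW
# statistics agree with plain numpy statistics on the data expanded by
# repetition. A minimal sketch of that frequency-weight equivalence (added
# illustration, not part of the original test):
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

x = np.array([1.0, 2.0, 5.0])
w = np.array([2, 3, 1])
d = DescrStatsW(x, weights=w)          # default ddof=0
xr = np.repeat(x, w)                   # same expansion as d.asrepeats()
assert np.allclose(d.mean, xr.mean())
assert np.allclose(d.var, xr.var())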
Example No. 2
def test_ztest_ztost():
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1)*21./20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    # d2 = DescrStatsW(x1, np.array(w1)*21./20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check: is this difference expected? see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
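
# The idea behind the comparison: a 0/1 variable with case weights [5, 15] is a
# sample with 15 successes out of 20, so the weighted mean is the proportion and
# the z-test on the mean approximates the proportion z-test. A minimal check of
# the first part (added illustration, not part of the test):
from statsmodels.stats.weightstats import DescrStatsW

d = DescrStatsW([0, 1], weights=[5, 15])
assert d.mean == 15 / 20.0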
Example No. 3
    def setup_class(self):
        np.random.seed(9876789)
        n1, n2 = 20,30
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1,4, n1)
        w2 = np.random.randint(1,4, n2)

        self.x1, self.x2 = x1, x2
        self.w1, self.w2 = w1, w2
        self.d1w = DescrStatsW(x1, weights=w1, ddof=0)
        self.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        self.x1r = self.d1w.asrepeats()
        self.x2r = self.d2w.asrepeats()
Example No. 4
    def test_weightstats_3(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        w1, w2 = self.w1, self.w2

        d1w_2d = DescrStatsW(x1_2d, weights=w1)
        d2w_2d = DescrStatsW(x2_2d, weights=w2)
        x1r_2d = d1w_2d.asrepeats()
        x2r_2d = d2w_2d.asrepeats()
#        print d1w_2d.ttest_mean(3)
#        #scipy.stats.ttest is also vectorized
#        print stats.ttest_1samp(x1r_2d, 3)
        t,p,d = d1w_2d.ttest_mean(3)
        assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
        #print [stats.ttest_1samp(xi, 3) for xi in x1r_2d.T]
        ressm = CompareMeans(d1w_2d, d2w_2d).ttest_ind()
        resss = stats.ttest_ind(x1r_2d, x2r_2d)
        assert_almost_equal(ressm[:2], resss, 14)
Example No. 5
    def test_weightstats_2(self):
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        d1 = DescrStatsW(x1)
        d1w = DescrStatsW(x1, weights=w1)
        d2w = DescrStatsW(x2, weights=w2)
        x1r = d1w.asrepeats()
        x2r = d2w.asrepeats()
#        print 'random weights'
#        print ttest_ind(x1, x2, weights=(w1, w2))
#        print stats.ttest_ind(x1r, x2r)
        assert_almost_equal(ttest_ind(x1, x2, weights=(w1, w2))[:2],
                            stats.ttest_ind(x1r, x2r), 14)
        #not the same as new version with random weights/replication
#        assert x1r.shape[0] == d1w.sum_weights
#        assert x2r.shape[0] == d2w.sum_weights
        assert_almost_equal(x2r.var(), d2w.var, 14)
        assert_almost_equal(x2r.std(), d2w.std, 14)


        #one-sample tests
#        print d1.ttest_mean(3)
#        print stats.ttest_1samp(x1, 3)
#        print d1w.ttest_mean(3)
#        print stats.ttest_1samp(x1r, 3)
        assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
        assert_almost_equal(d1w.ttest_mean(3)[:2], stats.ttest_1samp(x1r, 3), 11)
Example No. 6
    def test_weightstats_2(self):
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        d1 = DescrStatsW(x1)
        d1w = DescrStatsW(x1, weights=w1)
        d2w = DescrStatsW(x2, weights=w2)
        x1r = d1w.asrepeats()
        x2r = d2w.asrepeats()
#        print 'random weights'
#        print ttest_ind(x1, x2, weights=(w1, w2))
#        print stats.ttest_ind(x1r, x2r)
        assert_almost_equal(ttest_ind(x1, x2, weights=(w1, w2))[:2],
                            stats.ttest_ind(x1r, x2r), 14)
        # not the same as new version with random weights/replication
#        assert x1r.shape[0] == d1w.sum_weights
#        assert x2r.shape[0] == d2w.sum_weights

        assert_almost_equal(x2r.mean(0), d2w.mean, 14)
        assert_almost_equal(x2r.var(), d2w.var, 14)
        assert_almost_equal(x2r.std(), d2w.std, 14)
        # note: the following is for 1d
        assert_almost_equal(np.cov(x2r, bias=1), d2w.cov, 14)
        # assert_almost_equal(np.corrcoef(np.x2r), d2w.corrcoef, 19)
        # TODO: exception in corrcoef (scalar case)

        # one-sample tests
#        print d1.ttest_mean(3)
#        print stats.ttest_1samp(x1, 3)
#        print d1w.ttest_mean(3)
#        print stats.ttest_1samp(x1r, 3)
        assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
        assert_almost_equal(d1w.ttest_mean(3)[:2],
                            stats.ttest_1samp(x1r, 3), 11)
Example No. 7
class TestWeightstats2d_ddof(CheckWeightstats2dMixin):

    @classmethod
    def setup_class(self):
        np.random.seed(9876789)
        n1, n2 = 20,20
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1,4, n1)
        w2 = np.random.randint(1,4, n2)

        self.x1, self.x2 = x1, x2
        self.w1, self.w2 = w1, w2
        self.d1w = DescrStatsW(x1, weights=w1, ddof=1)
        self.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        self.x1r = self.d1w.asrepeats()
        self.x2r = self.d2w.asrepeats()
Example No. 8
def weighted_std_from_stats(matrix, axis=0, halflife=90):
    Tn = matrix.shape[axis]  # number of time periods
    w = create_weight_by_halflife(n=Tn, halflife=halflife)

    weighted_stats = DescrStatsW(matrix, weights=w, ddof=0)
    return weighted_stats.std
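
# The helper create_weight_by_halflife is not shown in this snippet. A common
# choice is exponential-decay weights that halve every `halflife` periods; a
# minimal sketch under that assumption (the actual helper may differ):
import numpy as np

def create_weight_by_halflife(n, halflife=90):
    # oldest observation (index 0) gets the smallest weight,
    # the newest gets weight 1, halving every `halflife` steps
    decay = 0.5 ** (1.0 / halflife)
    return decay ** np.arange(n - 1, -1, -1)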
Example No. 9
def compute_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column=None,
    pos_label=None,
):
    """
    Computes a variety of inter-rater reliability scores, including Cohen's kappa, Krippendorff's alpha, precision,
    and recall. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value
        - (Optional) A column with document weights

    This function will return a :py:class:`pandas.DataFrame` with agreement scores between the two specified coders.

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param coder1: The value in ``coder_column`` for rows corresponding to the first coder
    :type coder1: str or int
    :param coder2: The value in ``coder_column`` for rows corresponding to the second coder
    :type coder2: str or int
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :param weight_column: The column that contains sampling weights
    :type weight_column: str
    :param pos_label: The value indicating a positive label (optional)
    :type pos_label: str or int
    :return: A dictionary of scores
    :rtype: dict

    .. note:: If using a multi-class (non-binary) code, some scores may come back null or not compute as expected. \
        We recommend running the function separately for each specific code value as a binary flag by providing \
        each unique value to the ``pos_label`` argument. If ``pos_label`` is not provided for multi-class codes, \
        this function will attempt to compute scores based on support-weighted averages.

    Usage::

        from pewanalytics.stats.irr import compute_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': None,
         'coder1_mean_unweighted': 1.0,
         'coder1_std_unweighted': 0.5773502691896257,
         'coder2_mean_unweighted': 1.3333333333333333,
         'coder2_std_unweighted': 0.6666666666666666,
         'alpha_unweighted': 0.5454545454545454,
         'accuracy': 0.6666666666666666,
         'f1': 0.5555555555555555,
         'precision': 0.5,
         'recall': 0.6666666666666666,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.6123724356957946,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="0")
         {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '0',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.3333333333333333,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 1.0,
         'cohens_kappa': 1.0,
         'accuracy': 1.0,
         'f1': 1.0,
         'precision': 1.0,
         'recall': 1.0,
         'precision_recall_min': 1.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': 1.0,
         'pct_agree_unweighted': 1.0}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="1")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '1',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.0,
         'coder2_std_unweighted': 0.0,
         'alpha_unweighted': 0.0,
         'cohens_kappa': 0.0,
         'accuracy': 0.6666666666666666,
         'f1': 0.0,
         'precision': 0.0,
         'recall': 0.0,
         'precision_recall_min': 0.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="2")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '2',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.6666666666666666,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 0.4444444444444444,
         'cohens_kappa': 0.3999999999999999,
         'accuracy': 0.6666666666666666,
         'f1': 0.6666666666666666,
         'precision': 0.5,
         'recall': 1.0,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.5,
         'roc_auc': 0.75,
         'pct_agree_unweighted': 0.6666666666666666}


    """

    old_np_settings = np.seterr(all="raise")

    coder_df = copy.deepcopy(coder_df)
    if pos_label:
        coder_df[outcome_column] = (
            coder_df[outcome_column] == pos_label).astype(int)
    coder1_df = coder_df[coder_df[coder_column] == coder1]
    coder1_df.index = coder1_df[document_column]
    coder2_df = coder_df[coder_df[coder_column] == coder2]
    coder2_df.index = coder2_df[document_column]
    coder1_df = coder1_df[coder1_df.index.isin(coder2_df.index)]
    coder2_df = coder2_df[coder2_df.index.isin(
        coder1_df.index)].loc[coder1_df.index]

    row = {
        "coder1": coder1,
        "coder2": coder2,
        "n": len(coder1_df),
        "outcome_column": outcome_column,
        "pos_label": pos_label,
    }

    for labelsetname, labelset in [
        ("coder1", coder1_df[outcome_column]),
        ("coder2", coder2_df[outcome_column]),
    ]:

        if weight_column:
            try:
                weighted_stats = DescrStatsW(labelset,
                                             weights=coder1_df[weight_column])
                if weighted_stats:
                    row["{}_mean".format(labelsetname)] = weighted_stats.mean
                    row["{}_std".format(
                        labelsetname)] = weighted_stats.std_mean
            except (TypeError, ValueError):
                try:
                    weighted_stats = DescrStatsW(
                        labelset.astype(int), weights=coder1_df[weight_column])
                    if weighted_stats:
                        row["{}_mean".format(
                            labelsetname)] = weighted_stats.mean
                        row["{}_std".format(
                            labelsetname)] = weighted_stats.std_mean
                except (TypeError, ValueError):
                    pass

        try:
            unweighted_stats = DescrStatsW(labelset,
                                           weights=[1.0 for x in labelset])
            if unweighted_stats:
                row["{}_mean_unweighted".format(
                    labelsetname)] = unweighted_stats.mean
                row["{}_std_unweighted".format(
                    labelsetname)] = unweighted_stats.std_mean
        except (TypeError, ValueError):
            try:
                unweighted_stats = DescrStatsW(labelset.astype(int),
                                               weights=[1.0 for x in labelset])
                if unweighted_stats:
                    row["{}_mean_unweighted".format(
                        labelsetname)] = unweighted_stats.mean
                    row["{}_std_unweighted".format(
                        labelsetname)] = unweighted_stats.std_mean
            except (TypeError, ValueError):
                pass

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None
    row["alpha_unweighted"] = alpha

    labels = np.unique(coder_df[outcome_column])
    if len(labels) <= 2:

        try:
            row["cohens_kappa"] = cohen_kappa_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                labels=labels,
            )
        except FloatingPointError:
            row["cohens_kappa"] = 1.0

    try:
        row["accuracy"] = accuracy_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["accuracy"] = None

    try:
        row["f1"] = f1_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["f1"] = None

    try:
        row["precision"] = precision_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["precision"] = None

    try:
        row["recall"] = recall_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["recall"] = None

    if is_not_null(row["precision"]) and is_not_null(row["recall"]):
        row["precision_recall_min"] = min([row["precision"], row["recall"]])
    else:
        row["precision_recall_min"] = None

    try:
        row["matthews_corrcoef"] = matthews_corrcoef(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["matthews_corrcoef"] = None
    except FloatingPointError:
        row["matthews_corrcoef"] = 1.0

    try:

        row["roc_auc"] = (roc_auc_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else None,
        ) if len(np.unique(coder1_df[outcome_column])) > 1
                          and len(np.unique(coder2_df[outcome_column])) > 1
                          else None)
    except TypeError:
        try:
            row["roc_auc"] = (roc_auc_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                average="weighted" if not pos_label else None,
            ) if len(np.unique(coder1_df[outcome_column])) > 1
                              and len(np.unique(coder2_df[outcome_column])) > 1
                              else None)
        except (ValueError, TypeError):
            row["roc_auc"] = None
    except (ValueError, TypeError):
        row["roc_auc"] = None

    row["pct_agree_unweighted"] = np.average([
        1 if c[0] == c[1] else 0
        for c in zip(coder1_df[outcome_column], coder2_df[outcome_column])
    ])

    for k, v in row.items():
        if type(v) == tuple:
            row[k] = v[0]
            # For some weird reason, some of the sklearn scorers return 1-tuples sometimes

    np.seterr(**old_np_settings)

    return row
Example No. 10
def varlist_initial(indata, target, weight, varlist):
    ...  # body not included in this excerpt
#1 getting started
import statsmodels
dir(statsmodels.base)
print(statsmodels.base._model_params_doc)
base._missing_param_doc


"""
问题2:
逻辑回归加权重解决方案:
1)好像logit函数本身不支持。
2)目前可用下列GLM大类函数计算,但是有警告
3)其他解决手段? 1-用scipy 2-看看statsmodels的作者有什么手段??
"""
import statsmodels.api as sm
import numpy as np
import pandas as pd
spector_data = sm.datasets.spector.load_pandas()
spector_data_df = spector_data.data
spector_data_df['wt'] = spector_data_df.apply(lambda x: np.random.randint(1,10), axis=1)
spector_data_df.to_csv("D:\\Analysis\\SEMMA_project\\spector_data.csv")
spector_data_df = sm.add_constant(spector_data_df)

spector_data.exog = sm.add_constant(spector_data.exog)
trainingdata_x = spector_data.exog
trainingdata_y = spector_data.endog

# In the form below, the weights have no effect
res = sm.Logit(spector_data_df['GRADE'], \
               spector_data_df[['const', 'PSI']], \
                   freq_weights=spector_data_df['wt']).fit()
    
print(res.summary())
print(res.summary2())

# This form matches the SAS results, but raises a warning:
#__main__:3: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.
# Use an instance of a link class instead.
logmodel=sm.GLM(spector_data_df['GRADE'], \
                spector_data_df[['const', 'PSI']], \
                    family=sm.families.Binomial(sm.families.links.logit),\
                        freq_weights=spector_data_df['wt']).fit()

print(logmodel.summary())
print(logmodel.summary2())


trainingdata_y = pd.DataFrame()
trainingdata_y['Successes'] = spector_data.endog.apply(lambda x: x*np.random.randint(1,10) \
                      if x == 1 else 0)
trainingdata_y['Failures'] = spector_data.endog.apply(lambda x: np.random.randint(1,10) \
                      if x == 0 else 0)
    
df['true_cum']=df['a'].map(lambda x: if_true(x)).cumsum()

import statsmodels.api as sm
logmodel=sm.GLM(trainingdata_y[['Successes', 'Failures']], \
                trainingdata_x, \
                    family=sm.families.Binomial(sm.families.links.logit)).fit()
print(logmodel.summary())
print(logmodel.summary2())

trainingdata_x['wt'] = trainingdata_x.apply(lambda x: np.random.randint(1,10), axis=1)

logmodel=sm.GLM(trainingdata_y, \
                trainingdata_x[['const', 'GPA', 'TUCE', 'PSI']], \
                    family=sm.families.Binomial(sm.families.links.logit),\
                        freq_weights=trainingdata_x['wt']).fit()

print(logmodel.summary())
print(logmodel.summary2())

trainingdata_x.wt.sum()





"""
问题3:corr的权重计算法,和筛选法:

1)需和sas比对,看是否存在样本计算修正的问题(/n-1)
2)如何查看核输出结果。

"""
from statsmodels.stats.weightstats import DescrStatsW
def corr_check(indata_x, corr_threshold=0.75, weights=None):
    mask = list(indata_x.columns)
    if 'const' in mask:
        mask.remove('const')
    if weights in mask:
        mask.remove(weights)
    d1_wt = DescrStatsW(indata_x[mask], weights=indata_x[weights])
    d1_wt_corr = d1_wt.corrcoef  # correlation matrix
    corr_check = 'pass'
    for i in range(d1_wt_corr.shape[0]):
        for j in range(i + 1, d1_wt_corr.shape[0]):
            if d1_wt_corr[i, j] > corr_threshold:
                var_i = mask[i]
                var_j = mask[j]
                print('correlation of %s and %s is higher than %.2f !' % (var_i, var_j, corr_threshold))
                corr_check = 'fail'
    print(corr_check)

# example 1
np.random.seed(0)
x1_2d = 1.0 + np.random.randn(20, 3)
w1 = np.random.randint(1, 4, 20)
d1 = DescrStatsW(x1_2d, weights=w1)
d1.mean
d1.var
d1.std_mean


# example 2
mask = list(data1.columns)
mask.remove('wt')
d1_wt = DescrStatsW(data1[mask], weights=data1['wt'])

d1_wt_corr = d1_wt.corrcoef  # correlation matrix

d1_wt_corr[d1_wt_corr > 0.2]

corr_threshold = 0.2
corr_threshold = 0.3
corr_threshold = 0.5

corr_check = 0
for i in range(d1_wt_corr.shape[0]):
    for j in range(i + 1, d1_wt_corr.shape[0]):
        if d1_wt_corr[i, j] > corr_threshold:
            var_i = mask[i]
            var_j = mask[j]
            print('correlation of %s and %s is higher than %.2f !' % (var_i, var_j, corr_threshold))
            corr_check = 1
print(corr_check)
        
var1 = mask[0]
var2 = mask[1]
print('correlation of %s and %s is higher than %.2f !' % (var1, var2, corr_threshold))

"""
"""

%matplotlib inline
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std









np.random.seed(9876789)
nsample = 100
x = np.linspace(0, 10, 100)
X1 = np.column_stack((x)).T
X2 = np.column_stack((x, x**2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
X1 = sm.add_constant(X1)
X2 = sm.add_constant(X2)
y = np.dot(X2, beta) + e
model1 = sm.OLS(y, X1)
model2 = sm.OLS(y, X2)
results1 = model1.fit()
results2 = model2.fit()
results2.compare_lm_test(results1)

print(results.summary())
dir(results)
dir(model)
sm.regression.linear_model.RegressionResults.compare_lm_test(model)
sm.regression.linear_model.RegressionResults.compare_f_test(results)

results.compare_lm_test(results)

X, y = load_iris(return_X_y=True)


print(sm.datasets.__doc__)

dir(sm.datasets)

data = sm.datasets.anes96.load_pandas()

df = sm.datasets.anes96.load_pandas().data

from patsy import dmatrices
y, X = dmatrices('vote ~ logpopul + TVnews + selfLR + ClinLR + age + educ + income',
                 data=df, return_type='dataframe')

mod = sm.Logit(y, X)
res = mod.fit()
print(res.summary())
print(res.summary2())
print(res.wald_test.__doc__)

dir(res)
dir(mod)
print(mod.score(res.params))
print(mod.score_obs(res.params))

df.to_csv("D:\\Analysis\\SEMMA_project\\anes96.csv")


y, X = dmatrices('vote ~ selfLR',\
                 data=df, return_type='dataframe')

y, X = dmatrices('vote ~ selfLR + ClinLR',\
             data=df, return_type='dataframe')
  
mod = sm.Logit(y, X)
res = mod.fit()
print(res.summary())
print(res.summary2())

r = np.zeros_like(res.params)
r[1:] = [1]

A = np.identity(len(res.params))
A = A[1:,:]
print(res.wald_test(A))
print(res.t_test(A))
# this Wald test works
print(res.wald_test_terms(skip_single=False))
print(mod.score(res.params))

dir(res.wald_test_terms(skip_single=False))
res.wald_test_terms(skip_single=False).col_names
res.wald_test_terms(skip_single=False).statistic
res.wald_test_terms(skip_single=False).summary_frame
res.wald_test_terms(skip_single=False).table.loc['selfLR', 'statistic']
res.wald_test_terms(skip_single=False).dist_args

res.wald_test_terms(skip_single=False).table.loc['ClinLR', 'statistic']

# LM statistic
# 1) With regressors x1..xn, test x1..xq of them; H0: those q coefficients are all zero.
# 2) Regress y on the remaining (restricted) regressors and keep the residuals u.
# 3) Regress u on all of x1..xn, record R^2 and the sample size n.
# 4) LM = n * R^2 ~ chi-square(q); reject H0 if it exceeds the critical value c. (See the sketch below.)
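# A minimal sketch of those four steps (an illustration added here, not part of
# the original notes), reusing the spector data loaded above: test whether TUCE
# and PSI add anything beyond GPA, so q = 2.
from scipy.stats import chi2
y_lm = spector_data_df['GRADE']
X_restricted = spector_data_df[['const', 'GPA']]            # step 2: restricted regressors
u = sm.OLS(y_lm, X_restricted).fit().resid                   # restricted residuals
X_full = spector_data_df[['const', 'GPA', 'TUCE', 'PSI']]    # step 3: all regressors
aux = sm.OLS(u, X_full).fit()                                # auxiliary regression
lm_stat = aux.nobs * aux.rsquared                            # step 4: LM = n * R^2
print(lm_stat, chi2.sf(lm_stat, df=2))                       # compare against chi-square(2)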

# information matrix = negative Hessian; Logit's information() is not implemented
U = mod.score(res.params)
info_matrix = -1 * mod.hessian(res.params)
I = np.linalg.inv(info_matrix)

Score1 = np.dot(U.T,I)
Score = np.dot(Score1,U)

U_VAR1 = mod.score(res.params)[2:]
stats.diagnostic.linear_lm(resid= , exog=[selfLR + ClinLR])

dir(mod.information.__doc__)
print(res.llr_pvalue)

print(res.llnull)
print(res.llf)

print(r)
dir(mod.score(res.params))
dir(res.wald_test(A))
print(res.wald_test(A).conf_int)

print(res.t_test(r))
print(res.wald_test(r))
print(res.llr)
res.compare_lm_test(res)

dir(sm.regression.linear_model.RegressionResults.compare_lm_test())
dir(sm.test)
dir(sm.stats)
dir(res.wald_test)

res.wald_test()
sm.webdoc()
sm.webdoc('glm')


RegressionResults.compare_lm_test(restricted, demean=True, use_lr=False)

statsmodels.regression.linear_model.RegressionResults.compare_lm_test


import numpy
import statsmodels.api as sm

# Random data with two (identical) groups with ten members each
# and there are 1000 repetitions of this setup
data = numpy.random.random( (20, 1000) )
model  = sm.add_constant(numpy.array([0]*10 + [1]*10))
restricted_model = numpy.ones((20,1))

fit = sm.OLS(data, model).fit()
print(fit.summary())
restricted_fit = sm.OLS(data, restricted_model).fit()

# The following raises a ValueError exception
# but should instead have the same results as the method shown below
fs, ps, dfs = fit.compare_f_test(restricted_fit)

## The current way you have to run this, running one at a time:
fs, ps, dfs = numpy.empty(1000), numpy.empty(1000), numpy.empty(1000)
for i in range(1000):
  fit = sm.OLS(data[:,i], model).fit()
  restricted_fit = sm.OLS(data[:,i], restricted_model).fit()
  fs[i], ps[i], dfs[i] = fit.compare_f_test(restricted_fit)


statsmodels.stats.diagnostic.linear_lm
statsmodels.stats.diagnostic.linear_lm(resid, exog, func=None)

exog = np.array(X.columns)
exog = data.exog.T
dir(res.resid_dev)
print(res.resid_dev.__doc__)
lm, lm_pval, ftest = sm.stats.diagnostic.linear_lm(res.resid_dev, exog, func=None)

# default used inside linear_lm when func is None:
if func is None:
    def func(x):
        return np.power(x, 2)
Example No. 11
def test_weightstats_2d_w2():
    x1 = [[1]]
    w1 = [[1]]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0, 0.5, 1.0]) == 1).all().all()
Example No. 12
def mean_var(throu, delay):
    thr_, delay_ = np.array(throu), np.array(delay)
    weighted_stats = DescrStatsW(thr_, weights=delay_)
    return weighted_stats.mean, weighted_stats.std
Example No. 13
 def _normalize(self, style_factor: np.ndarray):
     weighted_stats = DescrStatsW(style_factor,
                                  weights=self._mkt_cap.flatten())
     weighted_mu = weighted_stats.mean
     factor_std = np.std(style_factor, axis=0, ddof=1)
     return (style_factor - weighted_mu) / factor_std
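
# A stand-alone sketch of the same normalisation, assuming only a market-cap
# vector aligned with the rows of the style factor (names are illustrative):
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

def normalize_style_factor(style_factor, mkt_cap):
    # de-mean by the cap-weighted mean, scale by the equal-weighted std
    weighted_mu = DescrStatsW(style_factor, weights=mkt_cap.flatten()).mean
    factor_std = np.std(style_factor, axis=0, ddof=1)
    return (style_factor - weighted_mu) / factor_std

rng = np.random.default_rng(0)
factor = rng.normal(size=(100, 3))       # 100 assets, 3 style factors
mkt_cap = rng.uniform(1.0, 10.0, size=100)
normalized = normalize_style_factor(factor, mkt_cap)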
Example No. 14
def weighted(x):
    stats = DescrStatsW(x["quantity"], x["sold"])
    return {"median": stats.quantile(0.5)[0.5], "std": stats.std}
Example No. 15
def print_statistics(data, distances, name, mask, gasTransferParameterisation=False):
    if len(data.xcoords) == 0 or len(mask) == 0:
        print name, "- No data available!\n";
        return (name, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan);

    eks = data.ekmanVals[mask,:];
    geo = data.geostrophicVals[mask,:];
    stokes = data.stokesVals[mask,:];
    
    #distances = np.array(distances);
    if (len(distances) != len(eks)):
        raise ValueError("distances must be the same length as data values");
    
    weights = np.transpose(np.array([distances]*eks.shape[1]));
    
    if gasTransferParameterisation:
        ks = data.kVals[mask,:];
        notnan = np.where(np.isnan(ks)==False);
        kStats = DescrStatsW(ks[notnan], weights=weights[notnan], ddof=0);
        kMean = kStats.mean;
        kSD = kStats.std;
        
        print name, "Gas transfer velocity stats";
        print "\tk: ", kMean, "+/-", kSD;
    
    totals = np.abs(eks) + np.abs(geo) + np.abs(stokes);
    eksProps = np.abs(eks) / totals;
    geoProps = np.abs(geo) / totals;
    stokesProps = np.abs(stokes) / totals;
    
    notnan = np.where(np.isnan(eksProps)==False);
    eksStats = DescrStatsW(eksProps[notnan], weights=weights[notnan], ddof=0);
    eksMean = eksStats.mean;
    eksSD = eksStats.std;
    
    notnan = np.where(np.isnan(geoProps)==False);
    geoStats = DescrStatsW(geoProps[notnan], weights=weights[notnan], ddof=0);
    geoMean = geoStats.mean;
    geoSD = geoStats.std;
    
    notnan = np.where(np.isnan(stokesProps)==False);
    stokesStats = DescrStatsW(stokesProps[notnan], weights=weights[notnan], ddof=0);
    stokesMean = stokesStats.mean;
    stokesSD = stokesStats.std;
    
    print name, "mean proportions"
    print "\tEkman: ", eksMean, "+/-", eksSD;
    print "\tGeostrophic: ", geoMean, "+/-", geoSD;
    print "\tStokes: ", stokesMean, "+/-", stokesSD;
    print "\t", "total:", eksMean+geoMean+stokesMean;
    
    totalOntoShelf = eks+geo+stokes;
    notnan = np.where(np.isnan(totalOntoShelf)==False);
    totalOntoShelfStats = DescrStatsW(totalOntoShelf[notnan], weights=weights[notnan], ddof=0);
    totalMean = totalOntoShelfStats.mean;
    totalSD = totalOntoShelfStats.std;
    
    eksPercent = eks / totalOntoShelf * 100.0;
    geoPercent = geo / totalOntoShelf * 100.0;
    stokesPercent = stokes / totalOntoShelf * 100.0;
    
    notnan = np.where(np.isnan(eksPercent)==False);
    eksPercentStats = DescrStatsW(eksPercent[notnan], weights=weights[notnan], ddof=0);
    eksPercentMean = eksPercentStats.mean;
    eksPercentSD = eksPercentStats.std;
    
    
    notnan = np.where(np.isnan(geoPercent)==False);
    geoPercentStats = DescrStatsW(geoPercent[notnan], weights=weights[notnan], ddof=0);
    geoPercentMean = geoPercentStats.mean;
    geoPercentSD = geoPercentStats.std;
    
    
    notnan = np.where(np.isnan(stokesPercent)==False);
    stokesPercentStats = DescrStatsW(stokesPercent[notnan], weights=weights[notnan], ddof=0);
    stokesPercentMean = stokesPercentStats.mean;
    stokesPercentSD = stokesPercentStats.std;
    
    
    print name, "percentage total onto-shelf current";
    print "\tEkman: ", eksPercentMean, "+/-", eksPercentSD;
    print "\tGeostrophic: ", geoPercentMean, "+/-", geoPercentSD;
    print "\tStokes: ", stokesPercentMean, "+/-", stokesPercentSD;
    print "\t", "total onto-shelf current (m/s):", totalMean, "+/-", totalSD;
    print "";
    
    if gasTransferParameterisation:
        return (name, totalMean, totalSD, eksMean, eksSD, geoMean, geoSD, stokesMean, stokesSD, eksPercentMean, eksPercentSD, geoPercentMean, geoPercentSD, stokesPercentMean, stokesPercentSD, kMean, kSD);
    else:
        return (name, totalMean, totalSD, eksMean, eksSD, geoMean, geoSD, stokesMean, stokesSD, eksPercentMean, eksPercentSD, geoPercentMean, geoPercentSD, stokesPercentMean, stokesPercentSD, np.nan, np.nan);
Example No. 16
def conf_int(a):
    print ('Confidence Intervall')
    print (DescrStatsW(a).tconfint_mean())
Example No. 17
def derive(data, params):
    """
    Derives connectivity from the data. A lot of data is inherently built with edges
     (e.g. communication between two individuals).
    However other networks are derived from the covariance of time series
     (e.g. brain networks between two regions).

    Covariance based metrics deriving time-resolved networks can be done in multiple ways.
     There are other methods apart from covariance based.

    Derive a weight vector for each time point and then the correlation coefficient
     for each time point.

    Parameters
    ----------

    data : array 
        Time series data to perform connectivity derivation on. (Default dimensions are: time as rows, nodes as columns. Change params['dimord'] if you want it the other way; see below.)
    
    params : dict 
        Parameters for each method (see below).

    Necessary parameters
    ====================

    method : str
        method: "distance","slidingwindow", "taperedslidingwindow",
     "jackknife", "multiplytemporalderivative". Alternatively, method can be a weight matrix of size time x time.

    **Different methods have method-specific parameters (see below)**

    Params for all methods (optional)
    =================================

    postpro : "no" (default). Other alternatives are: "fisher", "boxcox", "standardize"
     and any combination separated by a + (e.g. "fisher+boxcox").
      See postpro_pipeline for more information.
    dimord : str
        Dimension order: 'node,time' (default) or 'time,node'. People like to represent their data differently and this is an easy way to be sure that you are inputting the data in the correct way.
    analysis_id : str or int
        Added to identify a specific analysis. The generated report will be placed in './report/' + analysis_id + '/derivation_report.html'.
    report : bool
        False by default. If True, a report is saved in ./report/[analysis_id]/derivation_report.html.
    report_path : str 
        String where the report is saved. Default is ./report/[analysis_id]/derivation_report.html 

    Methods specific parameters 
    =========================== 

    method == "distance"
    ~~~~~~~~~~~~~~~~~~~

    The distance method uses 1/distance as weights, scaled between 0 and 1.
    W[t,t] is excluded from the scaling and then set to 1.

    params['distance']: str 
        Distance metric (e.g. 'euclidean'). See teneto.utils.getDistanceFunction for more info

    When method == "slidingwindow"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    params['windowsize'] : int 
        Size of window.

    When method == "taperedslidingwindow"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    params['windowsize'] : int 
        Size of window.
    params['distribution'] : str 
        Scipy distribution (e.g. 'norm','expon'). Any distribution here: https://docs.scipy.org/doc/scipy/reference/stats.html
    params['distribution_params'] : list 
        Each parameter, excluding the data "x" (in their scipy function order) to generate pdf.

        NOTE
        !!!!!!!!!!
        The data x should be considered to be centered at 0 and have a length equal to the window size
         (i.e. a window size of 5 entails x is [-2, -1, 0, 1, 2]; a window size of 6 entails [-2.5, -1.5, -0.5, 0.5, 1.5, 2.5]).
        Given x params['distribution_params'] contains the remaining parameters.

        e.g. normal distribution requires pdf(x, loc, scale) where loc=mean and scale=std.
         This means that the mean and std have to be provided in distribution_params.

        Say we have a gaussian distribution, a window size of 21 and params['distribution_params'] is [0,5].
         This will lead to a gaussian with its peak in the middle of each window and a standard deviation of 5.

        Instead, if we set params['distribution_params'] to [10, 5], this will lead to a half gaussian with its peak at the final time point and a standard deviation of 5.
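
        A quick way to preview a taper is to evaluate the pdf over the centered window directly (a sketch of the idea, calling scipy outside this function):

            import numpy as np
            from scipy import stats
            windowsize = 21
            x = np.arange(windowsize) - (windowsize - 1) / 2.0  # centered at 0
            taper = stats.norm.pdf(x, 0, 5)                     # distribution_params = [0, 5]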

    When method == "temporalderivative"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    params['windowsize'] : int
        Size of window.

    When method == "jackknife"
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    No parameters are necessary. 
    
    Optional parameters: 
    
    params['weight-var'] : array, (optional)
        NxN array to weight the JC estimates (standardized-JC*W). If weightby is selected, do not standardize in postpro.
    params['weight-mean'] : array, (optional)
        NxN array to weight the JC estimates (standardized-JC+W). If weightby is selected, do not standardize in postpro.


    Returns
    -------    

    G : array 
        Connectivity estimates (nodes x nodes x time)


    READ MORE
    ---------
    About the general weighted pearson approach used for most methods, see: 
    Thompson & Fransson (2019) A common framework for the problem of deriving estimates of dynamic functional brain connectivity. 
    Neuroimage. (https://doi.org/10.1016/j.neuroimage.2017.12.057)

    SEE ALSO
    --------

    *postpro_pipeline*, *gen_report*

    """
    report = {}

    if 'dimord' not in params.keys():
        params['dimord'] = 'node,time'

    if 'report' not in params.keys():
        params['report'] = False

    if 'analysis_id' not in params.keys():
        params['analysis_id'] = ''

    if 'postpro' not in params.keys():
        params['postpro'] = 'no'

    if params['report'] == 'yes' or params['report'] == True:

        if 'analysis_id' not in params.keys():
            params['analysis_id'] = ''

        if 'report_path' not in params.keys():
            params['report_path'] = './report/' + params['analysis_id']

        if 'report_filename' not in params.keys():
            params['report_filename'] = 'derivation_report.html'

    if params['dimord'] == 'node,time':
        data = data.transpose()

    if isinstance(params['method'], str):
        if params['method'] == 'jackknife':
            weights, report = weightfun_jackknife(data.shape[0], report)
            relation = 'weight'
        elif params['method'] == 'sliding window' or params[
                'method'] == 'slidingwindow':
            weights, report = weightfun_sliding_window(data.shape[0], params,
                                                       report)
            relation = 'weight'
        elif params['method'] == 'tapered sliding window' or params[
                'method'] == 'taperedslidingwindow':
            weights, report = weightfun_tapered_sliding_window(
                data.shape[0], params, report)
            relation = 'weight'
        elif params['method'] == 'distance' or params[
                'method'] == "spatial distance" or params[
                    'method'] == "node distance" or params[
                        'method'] == "nodedistance" or params[
                            'method'] == "spatialdistance":
            weights, report = weightfun_spatial_distance(data, params, report)
            relation = 'weight'
        elif params['method'] == 'mtd' or params[
                'method'] == 'multiply temporal derivative' or params[
                    'method'] == 'multiplytemporalderivative' or params[
                        'method'] == 'temporal derivative' or params[
                            'method'] == "temporalderivative":
            R, report = temporal_derivative(data, params, report)
            relation = 'coupling'
        else:
            raise ValueError(
                'Unrecognized method. See derive_with_weighted_pearson documentation for predefined methods or enter your own weight matrix'
            )
    else:
        try:
            weights = np.array(params['method'])
            relation = 'weight'
        except:
            raise ValueError(
                'Unrecognized method. See documentation for predefined methods'
            )
        if weights.shape[0] != weights.shape[1]:
            raise ValueError("weight matrix should be square")
        if weights.shape[0] != data.shape[0]:
            raise ValueError("weight matrix must equal number of time points")

    if relation == 'weight':
        # Loop over each weight vector and calculate pearson correlation.
        # Note, should see if this can be made quicker in future.
        R = np.array([
            DescrStatsW(data, weights[i, :]).corrcoef
            for i in range(0, weights.shape[0])
        ])
        # Make node,node,time
        R = R.transpose([1, 2, 0])

    # Correct jackknife direction
    if params['method'] == 'jackknife':
        # Correct inversion
        R = R * -1
        jc_z = 0
        if 'weight-var' in params.keys():
            R = np.transpose(R, [2, 0, 1])
            R = (R - R.mean(axis=0)) / R.std(axis=0)
            jc_z = 1
            R = R * params['weight-var']
            R = R.transpose([1, 2, 0])
        if 'weight-mean' in params.keys():
            R = np.transpose(R, [2, 0, 1])
            if jc_z == 0:
                R = (R - R.mean(axis=0)) / R.std(axis=0)
            R = R + params['weight-mean']
            R = np.transpose(R, [1, 2, 0])
        R = teneto.utils.set_diagonal(R, 1)

    if params['postpro'] != 'no':
        R, report = teneto.derive.postpro_pipeline(R, params['postpro'],
                                                   report)
        R = teneto.utils.set_diagonal(R, 1)

    if params['report'] == 'yes' or params['report'] == True:
        teneto.derive.gen_report(report, params['report_path'],
                                 params['report_filename'])
    return R
Example No. 18
def main(job_no, suffix, dir):
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        pass
    LOGGER.log_message('Name = ' + np.__name__ + ', version = ' +
                       np.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + pd.__name__ + ', version = ' +
                       pd.__version__,
                       label="Imported module".ljust(30))
    LOGGER.log_message('Name = ' + statsmodels.__name__ + ', version = ' +
                       statsmodels.__version__,
                       label="Imported module".ljust(30))

    result = pd.DataFrame(
        columns=['kmer', 'variance', 'Marginalise over central base?'])

    #Find variance due to CpG
    filename = dir + '/var_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as var_counts:
        var_counts = pickle.load(var_counts)
    filename = dir + '/context_counts_1' + suffix + '.pklz'
    infile = open(filename, 'r')
    LOGGER.input_file(infile.name)
    infile.close()
    with gzip.open(filename, 'rb') as context_counts:
        context_counts = pickle.load(context_counts)

    cpg_contexts = context_counts.loc['CG', 'C'] + context_counts.loc['TG', 'C'] + context_counts.loc['AG', 'C'] + \
                   context_counts.loc['GG', 'C'] + \
                   context_counts.loc['CC', 'G'] + context_counts.loc['CT', 'G'] + context_counts.loc['CA', 'G'] + \
                   context_counts.loc['CG', 'G']
    CpG_ratio = cpg_contexts / context_counts.values.sum()
    non_cpg_contexts = context_counts.values.sum() - cpg_contexts
    print('Total CpG sites           : ', cpg_contexts)
    print('Total intronic sites      : ', context_counts.values.sum())
    print('Proportion CpG sites      : ', CpG_ratio)
    var_counts[
        'C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts[
        'G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    CpG_vars = var_counts.loc['CG', 'C'] + var_counts.loc['TG', 'C'] + var_counts.loc['AG', 'C'] + \
               var_counts.loc['GG', 'C'] + \
               var_counts.loc['CC', 'G'] + var_counts.loc['CT', 'G'] + var_counts.loc['CA', 'G'] + \
               var_counts.loc['CG', 'G']
    print('Total CpG variants        : ', CpG_vars)
    non_CpG_vars = var_counts.values.sum() - CpG_vars
    m1 = CpG_vars / cpg_contexts
    m0 = non_CpG_vars / non_cpg_contexts
    m_ave = var_counts.values.sum() / context_counts.values.sum()
    print('SNV density at CpG sites  : ', m1)
    print('SNV density at other sites: ', m0)
    print('Average SNV density       : ', m_ave)
    t1 = CpG_ratio * (m1 - m_ave)**2
    t2 = (1 - CpG_ratio) * (m0 - m_ave)**2
    print('Variance due to CpG sites : ', t1 + t2)
    LOGGER.log_message("%.2e" % (t1 + t2),
                       label="Variance due to CpG".ljust(50))

    #Deal with the 1-mer case.
    var_counts[
        'C'] = var_counts['C->T'] + var_counts['C->A'] + var_counts['C->G']
    var_counts[
        'T'] = var_counts['T->C'] + var_counts['T->A'] + var_counts['T->G']
    var_counts[
        'A'] = var_counts['A->T'] + var_counts['A->C'] + var_counts['A->G']
    var_counts[
        'G'] = var_counts['G->T'] + var_counts['G->A'] + var_counts['G->C']
    variant_counts = var_counts.sum(axis=0)
    variant_counts = variant_counts[variant_counts.index.isin(
        ['C', 'T', 'A', 'G'])]
    con_counts = context_counts.sum(axis=0)
    mut_rates = variant_counts / con_counts
    w = DescrStatsW(mut_rates, weights=con_counts, ddof=0)
    row = np.array([1, w.var, 'no'])
    row = pd.Series(row, index=result.columns, name=0)
    result = result.append(row)
    row = np.array([1, 0.0, 'yes'])
    row = pd.Series(row, index=result.columns, name=1)
    result = result.append(row)

    i = 2
    for kmer_variable in [1, 2, 3]:
        filename = dir + '/var_counts_' + str(kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as var_counts:
            var_counts = pickle.load(var_counts)
        filename = dir + '/context_counts_' + str(
            kmer_variable) + suffix + '.pklz'
        infile = open(filename, 'r')
        LOGGER.input_file(infile.name)
        infile.close()
        with gzip.open(filename, 'rb') as context_counts:
            context_counts = pickle.load(context_counts)

        #Reformat context counts by repeating columns to match snv_densities dataframe.
        extended_context_counts, cols = probpoly_bayes.reformat_context_counts(
            context_counts, var_counts)
        extended_context_counts.set_index(context_counts.index, inplace=True)
        snv_densities = var_counts / extended_context_counts

        #Calculate variance across mutation types, marginalising over the central base.
        context_ratios = context_counts.div(context_counts.sum(axis=1), axis=0)
        extended_context_ratios, cols = probpoly_bayes.reformat_context_counts(
            context_ratios, snv_densities)
        extended_context_ratios.set_index(context_ratios.index, inplace=True)
        con_weighted = (snv_densities * extended_context_ratios).sum(axis=1)
        u = DescrStatsW(con_weighted,
                        weights=context_counts.sum(axis=1),
                        ddof=0)
        print(
            'Marginalised variance due to ' + str(2 * kmer_variable + 1) +
            ' -mers = ', u.var)
        row = np.array([2 * kmer_variable + 1, u.var, 'yes'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

        #Calculate variance conditioned on kmer, not marginalising over the central base.
        #Firstly we reorganise the SNV densities table so that rows correspond to kmers
        # (including central base) and columns correspond to the derived base.
        contexts_generator = product('ACGT', repeat=2 * kmer_variable + 1)
        contexts = tuple(''.join(context) for context in contexts_generator)
        kmer_densities = np.zeros((len(contexts), 4))
        kmer_densities = pd.DataFrame(kmer_densities,
                                      index=contexts,
                                      columns=['C', 'T', 'A', 'G'])
        for context in snv_densities.index:
            for mut in snv_densities.columns:
                ref = mut[0]
                derived = mut[3]
                kmer = context[0:kmer_variable] + ref + context[
                    kmer_variable:2 * kmer_variable]
                kmer_densities.loc[kmer, derived] = snv_densities.loc[context,
                                                                      mut]

        #We also reorganise context counts into counts of kmers.
        kmer_counts = np.zeros((len(contexts)))
        kmer_counts = pd.Series(kmer_counts, index=contexts)
        for kmer in kmer_counts.index:
            context = kmer[0:kmer_variable] + kmer[kmer_variable +
                                                   1:2 * kmer_variable + 1]
            ref = kmer[kmer_variable]
            kmer_counts[kmer] = context_counts.loc[context, ref]

        #Calculate the weighted variance over the full kmer.
        v = DescrStatsW(kmer_densities.sum(axis=1),
                        weights=kmer_counts,
                        ddof=0)
        print(
            'Unmarginalised variance due to ' + str(2 * kmer_variable + 1) +
            ' -mers = ', v.var)
        row = np.array([2 * kmer_variable + 1, v.var, 'no'])
        row = pd.Series(row, index=result.columns, name=i)
        result = result.append(row)
        i += 1

    print(result)
    filename = dir + "/aggregated_results" + job_no + ".csv"
    result.to_csv(filename, sep=',')
    outfile = open(filename, 'r')
    LOGGER.output_file(outfile.name)
    outfile.close()

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="Run duration (minutes)".ljust(50))
Example No. 19
    def bootstrap_correlation_difference(self,
                                         x_data_1,
                                         x_data_2,
                                         y_data_1,
                                         y_data_2,
                                         weights=None,
                                         corr_type='pearson',
                                         test_value=0,
                                         ci_factor=1.96,
                                         two_tailed=True,
                                         detect_inliers=True,
                                         outlier_num_stds=3,
                                         generate_diagnostic=False):
        """
        
        This function performs either pearson or spearman correlation on two sets of data.
        Within each fold, it does so for two sets of data (same permutation, different variables), and subtracts the correlation coefficients.
        Finally, it tests whether the correlations are different from each other (whether difference in correlations is different from 0).

        :param x_data_1: input x_data
        :type x_data_1: 1-D array 
        :param y_data_1: input y_data
        :type y_data_1: 1-D array with same shape as x_data
        :param x_data_2: input x_data
        :type x_data_2: 1-D array 
        :param y_data_2: input y_data
        :type y_data_2: 1-D array with same shape as x_data
        :param weights: possible input weights when calculating mean
        :type weights: 1-D array with same n_observations as data
        :param reps: amount of bootstrap repetitions
        :type reps: int
        :param test_value: value to test bootstrap distribution against
        :type test_value: float
        :param ci_factor: z-score to multiply std of bootstrap distr with
        :type ci_factor: float
        :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
        :type two_tailed: bool
        :param detect_inliers: uses outlier detection when True
        :type detect_inliers: bool
        :param outlier_num_stds: number of stds from median in outlier detection
        :type outlier_num_stds: float
        :param generate_diagnostics: if True, make plot of the bootstrap distr
        :type generate_diagnostics: bool

        :return center: central estimate of bootstrap distribution
        :type center: float
        :return ci: ci of central estimate
        :type ci: float
        :return p-val: p-val for difference with test_value
        :type p-val: float

        """

        # set weights to one if none provided
        if weights is None:
            weights = np.ones(len(x_data_1))

        if detect_inliers:
            x_1_inliers = self.detect_inliers_mad(x_data_1, outlier_num_stds)
            y_1_inliers = self.detect_inliers_mad(y_data_1, outlier_num_stds)
            x_2_inliers = self.detect_inliers_mad(x_data_2, outlier_num_stds)
            y_2_inliers = self.detect_inliers_mad(y_data_2, outlier_num_stds)
            all_inliers = x_1_inliers * x_2_inliers * y_1_inliers * y_2_inliers
            x_data_1 = x_data_1[all_inliers]
            y_data_1 = y_data_1[all_inliers]
            x_data_2 = x_data_2[all_inliers]
            y_data_2 = y_data_2[all_inliers]
            weights = weights[all_inliers]

        N = len(x_data_1)

        # get random ints for random indices
        permute_indices = np.random.randint(0,
                                            len(x_data_1),
                                            size=(len(x_data_1),
                                                  int(self.reps))).T

        # rank transform data if spearman is requested
        if corr_type == 'spearman':
            x_data_1 = stats.rankdata(x_data_1)
            y_data_1 = stats.rankdata(y_data_1)
            x_data_2 = stats.rankdata(x_data_2)
            y_data_2 = stats.rankdata(y_data_2)

        bootstrap_distr = []
        bootstrap_distr_z = []
        # loop over permutes
        for fold in permute_indices:
            r_1 = DescrStatsW(data=np.vstack([x_data_1[fold],
                                              y_data_1[fold]]).T,
                              weights=weights[fold]).corrcoef[0, 1]
            r_2 = DescrStatsW(data=np.vstack([x_data_2[fold],
                                              y_data_2[fold]]).T,
                              weights=weights[fold]).corrcoef[0, 1]
            bootstrap_distr.append(r_1 - r_2)

            # fisher transform correlations
            zr1 = np.arctanh(r_1)
            zr2 = np.arctanh(r_2)
            bootstrap_distr_z.append(zr1 - zr2)

        # calculate p-val
        p = self.p_val_from_bootstrap_dist(bootstrap_distr_z, test_value,
                                           two_tailed)

        # compute central corr on all data
        r1 = DescrStatsW(data=np.vstack([x_data_1, y_data_1]).T,
                         weights=weights).corrcoef[0, 1]
        r2 = DescrStatsW(data=np.vstack([x_data_2, y_data_2]).T,
                         weights=weights).corrcoef[0, 1]
        r_diff = r1 - r2

        # return standard deviation of bootstrap distro as CI
        r_diff_ci = np.std(bootstrap_distr) * ci_factor

        if generate_diagnostic:
            f = pl.figure(figsize=(5, 5))
            s = f.add_subplot(111)
            pl.title('input data')
            pl.hist(bootstrap_distr, 100)
            pl.axvline(r_diff, color='k', label='center', lw=5)
            pl.axvline(r_diff + r_diff_ci, color='r', label='ci', lw=5)
            pl.axvline(r_diff - r_diff_ci, color='r', label='ci', lw=5)
            pl.legend(loc='best')
            sn.despine(offset=2)
            pl.savefig(
                '/home/vanes/temp/plots/corr_bootstrap_distr_outlier_detection_%s_%d.pdf'
                % (detect_inliers, np.random.randint(1e8)))
            pl.close()

        return r_diff, r_diff_ci, p, N
Exemplo n.º 20
0
    def bootstrap_correlation(self,
                              x_data,
                              y_data,
                              weights=None,
                              corr_type='pearson',
                              test_value=0,
                              ci_factor=1.96,
                              two_tailed=True,
                              detect_inliers=True,
                              outlier_num_stds=3,
                              generate_diagnostic=False):
        """
        Computes the (weighted) correlation between x_data and y_data and bootstraps it.
        Returns the correlation, a bootstrapped CI, a p-value against test_value, and N.

        :param x_data: input x_data
        :type x_data: 1-D array 
        :param y_data: input y_data
        :type y_data: 1-D array with same shape as x_data
        :param weights: possible input weights when calculating mean
        :type weights: 1-D array with same n_observations as data
        :param reps: number of bootstrap repetitions (taken from self.reps)
        :type reps: int
        :param test_value: value to test bootstrap distribution against
        :type test_value: float
        :param ci_factor: z-score to multiply std of bootstrap distr with
        :type ci_factor: float
        :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
        :type two_tailed: bool
        :param detect_inliers: uses outlier detection when True
        :type detect_inliers: bool
        :param outlier_num_stds: number of stds from median in outlier detection
        :type outlier_num_stds: float
        :param generate_diagnostic: if True, make plot of the bootstrap distr
        :type generate_diagnostic: bool

        :return center: central estimate of bootstrap distribution
        :type center: float
        :return ci: ci of central estimate
        :type ci: float
        :return p-val: p-val for difference with test_value
        :type p-val: float

        """

        # set weights to one if none provided
        if weights is None:
            weights = np.ones(len(x_data))

        if detect_inliers:
            x_inliers = self.detect_inliers_mad(x_data, outlier_num_stds)
            y_inliers = self.detect_inliers_mad(y_data, outlier_num_stds)
            x_data = x_data[x_inliers * y_inliers]
            y_data = y_data[x_inliers * y_inliers]
            weights = weights[x_inliers * y_inliers]

        N = len(x_data)

        # get random ints for random indices
        permute_indices = np.random.randint(0,
                                            len(x_data),
                                            size=(len(x_data),
                                                  int(self.reps))).T

        # rank transform data if spearman is requested
        if corr_type == 'spearman':
            x_data = stats.rankdata(x_data)
            y_data = stats.rankdata(y_data)

        bootstrap_distr = []
        bootstrap_distr_z = []
        # loop over permutes
        for fold in permute_indices:
            r = DescrStatsW(data=np.vstack([x_data[fold], y_data[fold]]).T,
                            weights=weights[fold]).corrcoef[0, 1]
            z = np.arctanh(r)
            bootstrap_distr.append(r)
            bootstrap_distr_z.append(z)

        # calculate p-val
        p = self.p_val_from_bootstrap_dist(bootstrap_distr_z, test_value,
                                           two_tailed)

        # compute central corr on all data
        corr = DescrStatsW(data=np.vstack([x_data, y_data]).T,
                           weights=weights).corrcoef[0, 1]

        # return standard deviation of bootstrap distro as CI
        # corr_ci = np.std(bootstrap_distr)*ci_factor
        corr_ci = self.get_ci(bootstrap_distr, ci_factor)

        if generate_diagnostic:
            f = pl.figure(figsize=(5, 5))
            s = f.add_subplot(111)
            pl.title('input data')
            pl.hist(bootstrap_distr, 100)
            pl.axvline(corr, color='k', label='center', lw=5)
            pl.axvline(corr + corr_ci, color='r', label='ci', lw=5)
            pl.axvline(corr - corr_ci, color='r', label='ci', lw=5)
            pl.legend(loc='best')
            sn.despine(offset=2)
            pl.savefig(
                '/home/vanes/temp/plots/corr_bootstrap_distr_outlier_detection_%s_%d.pdf'
                % (detect_inliers, np.random.randint(1e8)))
            pl.close()

        return corr, corr_ci, p, N
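For context, a minimal standalone sketch of the same idea with hypothetical data and weights (a fixed number of repetitions stands in for self.reps): resample index vectors, recompute the weighted correlation with DescrStatsW.corrcoef on each draw, and Fisher-transform before comparing against zero.

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

rng = np.random.default_rng(0)
n, reps = 200, 2000
x = rng.normal(size=n)
y = 0.4 * x + rng.normal(size=n)                      # hypothetical correlated data
w = rng.integers(1, 4, size=n).astype(float)          # hypothetical observation weights

boot_z = np.empty(reps)
for i in range(reps):
    idx = rng.integers(0, n, size=n)                  # resample observations with replacement
    r = DescrStatsW(np.column_stack([x[idx], y[idx]]),
                    weights=w[idx]).corrcoef[0, 1]
    boot_z[i] = np.arctanh(r)                         # Fisher z stabilises the variance

corr = DescrStatsW(np.column_stack([x, y]), weights=w).corrcoef[0, 1]
ci_low, ci_high = np.tanh(np.percentile(boot_z, [2.5, 97.5]))
p_two_sided = 2 * min((boot_z > 0).mean(), (boot_z < 0).mean())
print(corr, (ci_low, ci_high), p_two_sided)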
Exemplo n.º 21
0
    def bootstrap(self,
                  data,
                  center_estimate='mean',
                  weights=None,
                  test_value=0,
                  ci_factor=1.96,
                  two_tailed=True,
                  detect_inliers=True,
                  outlier_num_stds=3,
                  generate_diagnostic=False,
                  return_d=False):
        """
        Bootstraps the (weighted) mean, the median, or the std of the data, returning the
        central estimate, a CI derived from the bootstrap distribution, a p-value against
        test_value, and N (and optionally Cohen's d).

        :param data: input data
        :type data: array of shape: [n_variables,n_observations]
        :param center_estimate: central measure: mean or median
        :type center_estimate: string
        :param weights: possible input weights when calculating mean
        :type weights: 1-D array with same n_observations as data
        :param reps: number of bootstrap repetitions (taken from self.reps)
        :type reps: int
        :param test_value: value to test bootstrap distribution against
        :type test_value: float
        :param ci_factor: z-score to multiply std of bootstrap distr with
        :type ci_factor: float
        :param two_tailed: when True, returns two-sided p-val, else returns one-sided p-val
        :type two_tailed: bool
        :param detect_inliers: uses outlier detection when True
        :type detect_inliers: bool
        :param outlier_num_stds: number of stds from median in outlier detection
        :type outlier_num_stds: float
        :param generate_diagnostic: if True, make plot of the bootstrap distr
        :type generate_diagnostic: bool
        :param return_d: if True, also return Cohen's d per variable
        :type return_d: bool

        :return center: central estimate of bootstrap distribution
        :type center: float
        :return ci: ci of central estimate
        :type ci: float
        :return p-val: p-val for difference with test_value
        :type p-val: float

        """

        means = []
        ps = []
        Ns = []
        cis = []
        cohen_d = []
        # put single array in iterable container
        if np.ndim(data) == 1:
            data = [data]

        # loop over different variables
        for di in range(np.shape(data)[0]):

            # if there's no data in here, or when all values are nans, set results to nans:
            if (len(data[di][np.invert(np.isnan(data[di]))]) == 0):
                means.append(np.nan)
                cis.append(np.nan)
                ps.append(np.nan)
                cohen_d.append(np.nan)
                Ns.append(np.nan)

            # else do bootstrap
            else:

                # set weights to one if none provided
                if weights is None:
                    these_weights = np.ones(len(data[di]))
                else:
                    these_weights = copy.copy(weights)

                # remove nan values from data and weights
                valid_values = np.invert(np.isnan(data[di]))
                these_data = data[di][valid_values]
                these_weights = these_weights[valid_values]

                # remove outliers from data and weights
                if detect_inliers:
                    inliers = self.detect_inliers_mad(
                        these_data, outlier_num_stds
                    )  #,weights=weights,center_estimate=center_estimate)
                    # inliers = self.detect_inliers_std(these_data,outlier_num_stds,weights=weights)
                    these_data = these_data[inliers]
                    these_weights = these_weights[inliers]

                Ns.append(len(these_data))

                # get random ints for random indices
                permute_indices = np.random.randint(0,
                                                    len(these_data),
                                                    size=(len(these_data),
                                                          int(self.reps)))

                # now average over all these random draws
                if center_estimate == 'mean':
                    # in weighted fashion in case of average
                    bootstrap_distr = np.average(
                        these_data[permute_indices],
                        weights=these_weights[permute_indices],
                        axis=0)
                elif center_estimate == 'median':
                    # or regular median
                    bootstrap_distr = np.median(these_data[permute_indices],
                                                axis=0)
                elif center_estimate == 'std':
                    bootstrap_distr = np.std(these_data[permute_indices],
                                             axis=0)

                # calculate p-val
                ps.append(
                    self.p_val_from_bootstrap_dist(bootstrap_distr, test_value,
                                                   two_tailed))

                # get ci
                cis.append(self.get_ci(bootstrap_distr, ci_factor))

                # return standard deviation of bootstrap distro for plotting
                if center_estimate == 'mean':
                    means.append(np.average(these_data, weights=these_weights))
                elif center_estimate == 'median':
                    means.append(np.median(these_data))
                elif center_estimate == 'std':
                    means.append(np.std(these_data))
                # ses.append(np.std(bootstrap_distr)*ci_factor)

                if generate_diagnostic:
                    f = pl.figure(figsize=(5, 5))
                    s = f.add_subplot(111)
                    pl.title('input data')
                    pl.hist(bootstrap_distr, 100)
                    pl.axvline(np.average(these_data, weights=these_weights),
                               color='k',
                               label='center',
                               lw=5)
                    pl.axvline(np.average(these_data, weights=these_weights) +
                               np.std(bootstrap_distr) * ci_factor,
                               color='r',
                               label='ci',
                               lw=5)
                    pl.axvline(np.average(these_data, weights=these_weights) -
                               np.std(bootstrap_distr) * ci_factor,
                               color='r',
                               label='ci',
                               lw=5)
                    pl.legend(loc='best')
                    sn.despine(offset=2)
                    pl.savefig(
                        '/home/vanes/temp/plots/bootstrap_distr_outlier_detection_%s_%d.pdf'
                        % (detect_inliers, np.random.randint(1e8)))
                    pl.close()

                # calculate cohen's d:
                cohen_d.append(
                    (np.average(these_data, weights=these_weights) -
                     test_value) /
                    DescrStatsW(data=these_data, weights=these_weights).std)

        if return_d:
            return np.squeeze(means), np.squeeze(cis), np.squeeze(
                ps), np.squeeze(Ns), np.squeeze(cohen_d)
        else:
            return np.squeeze(means), np.squeeze(cis), np.squeeze(
                ps), np.squeeze(Ns)
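A minimal sketch of the resampling step that drives the method above, stripped of the class machinery (all data and settings below are hypothetical): draw an index matrix with one column per repetition, average along the observation axis, and read the CI and p-value off the bootstrap distribution.

import numpy as np

rng = np.random.default_rng(1)
data = rng.normal(loc=0.3, scale=1.0, size=150)       # hypothetical sample
weights = np.ones_like(data)                          # uniform weights for simplicity
reps, ci_factor, test_value = 1000, 1.96, 0.0

idx = rng.integers(0, len(data), size=(len(data), reps))
boot_means = np.average(data[idx], weights=weights[idx], axis=0)

center = np.average(data, weights=weights)
ci = np.std(boot_means) * ci_factor                   # symmetric normal-approximation CI
p_two_sided = 2 * min((boot_means > test_value).mean(),
                      (boot_means < test_value).mean())
print(center, center - ci, center + ci, p_two_sided)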
Exemplo n.º 22
0
def tabulate_march_inequality(year):
    """
    #
    For years 1964-2009 (year is March year, not earnings year), tabulate:

    These inequality metrics:

    - 90/50, 50/10, 90/10, Vln
    - 60/50, 70/50, 80/50, 95/50, 97/50
    - 50/3, 50/5, 50/20, 50/30, 50/40

    For these samples

    - Males
    - Females
    - Both

    For these wage measures

    - All hourly

    For these conditioning variables

    - raw wage inequality
    - residual wage inequality

    Also note:

    - Always dropping allocators where possible

    D. Autor, 2/24/2004
    D. Autor, 6/15/2004 - Updated for consistency of controls for quantile simulation methods
    M. Anderson, 12/13/2005 - Updated for new quantiles and years
    D. Autor, 9/5/2006. Updated for 2005 March
    M. Wasserman, 10/14/2009 Updated for 2007/8 March
    #
    """

    df = tabulate_march_basic(year)
    df = df.eval("""
            lnwinc = log(winc_ws) + log(gdp)
            lnhinc = log(hinc_ws) + log(gdp)
        """)

    # Full-time and hourly samples
    df = df.eval("ftfy = fulltime*fullyear")
    df.ftfy.describe().to_frame().T
    df = df.eval("""
            ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1)
            hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1)
        """)
    # @ ftsamp: weekly real wage not NaN + ftfy + above weekly real wage limit
    # @ hrsamp: hourly real wage not NaN + above hourly real wage limit

    df.loc[df.ftsamp == 0, "lnwinc"] = np.nan
    df.loc[df.hrsamp == 0, "lnhinc"] = np.nan
    df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T
    df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T
    df = df.query("ftsamp == 1 | hrsamp == 1")

    # Generate experience categories
    df = df.assign(expcat=(df.exp/3).astype(int) + 1)
    df.loc[df.expcat == 17, "expcat"] = 16
    assert df.eval("1<= expcat <= 16").all()

    df.groupby("expcat")["exp"].agg(["mean", "min", "max"])

    # interaction terms - 80 of these
    # @ move to residual wage part

    # Drop reference group's interaction term: HSG with 0-2 years of experience
    # @ similarly skip for now

    df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc", "hrsamp", "ftsamp", "edcat", "expcat"])

    ######################################################################
    # Summarize raw inequality
    ######################################################################

    pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97])
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    tot_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_mf"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_m"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_f"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_mf"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_m"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_f"] = [wq.mean, wq.var]

    df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False)

    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################

    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    y, X = dmatrices('lnwinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_mf"] = [wq.mean, wq.var]  # @ mean is not necessary but to be consistent
    res_pct["res_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_m"] = [wq.mean, wq.var]
    res_pct["res_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_f"] = [wq.mean, wq.var]
    res_pct["res_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1")
    y, X = dmatrices('lnhinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_mf"] = [wq.mean, wq.var]
    res_pct["res_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==0")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_m"] = [wq.mean, wq.var]
    res_pct["res_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==1")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_f"] = [wq.mean, wq.var]
    res_pct["res_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)

    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # @ tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################

    # @ simply generate more percentiles under full-time samples
    # @ note here year is march census year thus minus one to be earnings year

    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year-1)  # @ tidy data

    # @ the code then combine 1963-2008 generated files
    # @ we remove this as not sure necessary
    # @ actually this part can be combined with #Summarize raw inequality#

    return df_stat, tot_pct
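The docstring above lists ratio metrics (90/50, 50/10, 90/10, ...) that are not computed inside the function itself; a caller could derive them from the returned tidy table. A sketch, assuming the integer percentile column labels produced above and an arbitrary March year:

df_stat, tot_pct = tabulate_march_inequality(1990)    # hypothetical March year

# lnwinc/lnhinc are log wages, so percentile differences are log percentile ratios
df_stat["p90_50"] = df_stat[90] - df_stat[50]
df_stat["p50_10"] = df_stat[50] - df_stat[10]
df_stat["p90_10"] = df_stat[90] - df_stat[10]
print(df_stat[["sample", "year", "p90_50", "p50_10", "p90_10", "vln"]])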
Exemplo n.º 23
0
EA_err_2 = np.array([0.8, 0.7])
x_03 = np.arange(5, 7, 1)
EA_3 = np.array([0.4716198, 0.4716227]) * 1e6
EA_err_3 = np.array([0.3, 1.4])
x_04 = np.arange(7, 10, 1)
EA_4 = np.array([0.4716217, 0.4716233, 0.4716227]) * 1e6
EA_err_4 = np.array([0.36, 0.56, 0.26])
x_05 = np.arange(10, 14, 1)
EA_5 = np.array([0.4716183, 0.4716189, 0.4716212, 0.4716210]) * 1e6
EA_err_5 = np.array([0.9, 0.3, 0.4, 0.8])

EA_T = EA_2.tolist() + EA_3.tolist() + EA_4.tolist() + EA_5.tolist()
EA_err_T = (EA_err_2.tolist() + EA_err_3.tolist()
            + EA_err_4.tolist() + EA_err_5.tolist())
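# note: the weighted statistics below use the quoted uncertainties (EA_err_*) directly as
# weights; inverse-variance weights (1 / err**2) are the more common convention for
# combining measurements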
np.average(EA_T, weights=EA_err_T)
w_stats = DescrStatsW(EA_T, weights=EA_err_T, ddof=0)

EA_6 = np.array([0.4716115, 0.471626]) * 1e6
EA_err_6 = np.array([1, 25])
x_06 = np.arange(14, 16, 1)

plt.figure(40)
# plt.scatter(x_01,EA_1, color = 'blue',label = 'Feb-11, Feb-12')
# plt.errorbar(x_01, EA_1, yerr=EA_err_1, fmt='.k',color='blue', capthick=0.5,capsize=5,elinewidth=0.5)
plt.scatter(x_02, EA_2, color='black', label='Feb-16')
plt.errorbar(x_02,
             EA_2,
             yerr=EA_err_2,
             fmt='.k',
             color='black',
             capthick=0.5,
Exemplo n.º 24
0
    def test_weightstats_ddof_tests(self):
        # explicit test that ttest and confint are independent of ddof
        # one sample case
        x1_2d = self.x1_2d
        w1 = self.w1

        d1w_d0 = DescrStatsW(x1_2d, weights=w1, ddof=0)
        d1w_d1 = DescrStatsW(x1_2d, weights=w1, ddof=1)
        d1w_d2 = DescrStatsW(x1_2d, weights=w1, ddof=2)

        # check confint independent of user ddof
        res0 = d1w_d0.ttest_mean()
        res1 = d1w_d1.ttest_mean()
        res2 = d1w_d2.ttest_mean()
        # concatenate into one array with np.r_
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        res0 = d1w_d0.ttest_mean(0.5)
        res1 = d1w_d1.ttest_mean(0.5)
        res2 = d1w_d2.ttest_mean(0.5)
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        # check confint independent of user ddof
        res0 = d1w_d0.tconfint_mean()
        res1 = d1w_d1.tconfint_mean()
        res2 = d1w_d2.tconfint_mean()
        assert_almost_equal(res1, res0, 14)
        assert_almost_equal(res2, res0, 14)
Exemplo n.º 25
0
    def _create_histogram_distribution(self,
                                       df,
                                       min_x=None,
                                       max_x=None,
                                       extend_x_proportion_percentage=20,
                                       postfix_label=None,
                                       obs_weights=None,
                                       denormalised=True):

        # get min/max values for our histogram
        min_hist_x = df.min()
        max_hist_x = df.max()

        extend_x_proportion_percentage = 1.0 + (
            float(extend_x_proportion_percentage) / 100.0)

        # extend axes for PDF, so just outside histogram
        if min_x is not None:
            min_x = min(min_x, min_hist_x) * extend_x_proportion_percentage
        else:
            min_x = min_hist_x

        if max_x is not None:
            max_x = max(max_x, max_hist_x) * extend_x_proportion_percentage
        else:
            max_x = max_hist_x

        # use raw counts for a denormalised histogram, a normalised density otherwise
        density = not denormalised

        vals = df.T.values.astype(np.float64)

        # Create a histogram with 10 buckets
        hist, bins = np.histogram(vals,
                                  bins=10,
                                  range=[float(min_hist_x),
                                         float(max_hist_x)],
                                  density=density,
                                  weights=obs_weights)
        bin_cent = (bins[1:] + bins[:-1]) * 0.5

        number_of_elements = len(df.values)

        dist_space = np.linspace(min_x, max_x, 100)

        if postfix_label is None:
            postfix_label = ''
        else:
            postfix_label = ": " + postfix_label

        if number_of_elements > 1:

            # Create a best fit PDF using Gaussian KDE model (forcibly cast to float64)
            if obs_weights is None:
                kde = gaussian_kde(vals)
            else:
                kde = gaussian_weighted_kde(vals,
                                            weights=obs_weights.values.astype(
                                                np.float64))

            # Sometimes need to transpose so the dimensions are consistent
            try:
                pdf_fit = kde(dist_space)
            except Exception:
                pdf_fit = kde(dist_space.T)

            if obs_weights is None:
                # Calculated normal PDF
                weighted_stats = DescrStatsW(df.values, ddof=0)
            else:
                weighted_stats = DescrStatsW(df.values,
                                             weights=obs_weights.T.values,
                                             ddof=0)

            mu = weighted_stats.mean
            std = weighted_stats.std

            normal_pdf_fit = norm.pdf(dist_space, mu, std)

            # Scale pdf_fit (and normal PDF) by total/bin size
            if denormalised:
                bin_width = abs(bins[1] - bins[0])
                N = np.sum(hist)
                pdf_fit = pdf_fit * (bin_width * N)
                normal_pdf_fit = normal_pdf_fit * (bin_width * N)

            df_hist = pd.DataFrame(index=bin_cent,
                                   data=hist,
                                   columns=['Histogram' + postfix_label])
            df_pdf = pd.DataFrame(index=dist_space,
                                  data=pdf_fit,
                                  columns=['KDE-PDF' + postfix_label])
            df_pdf['Norm-PDF' + postfix_label] = normal_pdf_fit
        else:
            return pd.DataFrame(), pd.DataFrame()

        return df_hist, df_pdf
Exemplo n.º 26
0
 def get_descriptives(cls, ddof=0):
     cls.descriptive = DescrStatsW(cls.data, cls.weights, ddof)
diagnostico_b = df.query("diagnosis == 'B'")


# Performing the z-test for the mean (comparing the results)
ztest(diagnostico_m['mean_radius'], value = diagnostico_m['mean_radius'].mean())
ztest(diagnostico_m['mean_radius'], value = diagnostico_b['mean_radius'].mean())


# Generating the confidence interval
zconfint(diagnostico_m['mean_radius'])
zconfint(diagnostico_b['mean_radius'])


"""----------------------------------------------------------------------------
        T Test
"""
diagnostico_m = df.query("diagnosis == 'M'")
diagnostico_b = df.query("diagnosis == 'B'")

# Applying the test
resultados_m = DescrStatsW(diagnostico_m['mean_radius'])
resultados_b = DescrStatsW(diagnostico_b['mean_radius'])

# Generating the confidence interval
resultados_m.tconfint_mean()
resultados_b.tconfint_mean()




Exemplo n.º 28
0
def test_weightstats_len_1():
    x1 = [1]
    w1 = [1]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0.0, 0.5, 1.0]) == 1).all()
Exemplo n.º 29
0
    def fit(self):
        """Calculate the augmented inverse probability weights and effect measures from the predicted exposure
        probabilities and predicted outcome values.

        Note
        ----
        Exposure and outcome models must be specified prior to `fit()`

        Returns
        -------
        For binary outcomes, gains `risk_difference`, `risk_difference_ci`, and `risk_ratio` attributes. For continuous
        outcomes, gains `average_treatment_effect` and `average_treatment_effect_ci` attributes
        """
        if (self._fit_exposure_ is False) or (self._fit_outcome_ is False):
            raise ValueError(
                'The exposure and outcome models must be specified before the doubly robust estimate can '
                'be generated')

        if self._miss_flag and not self._fit_missing_:
            warnings.warn(
                "All missing outcome data is assumed to be missing completely at random. To relax this "
                "assumption to outcome data is missing at random please use the `missing_model()` "
                "function", UserWarning)

        # Observed exposure and outcome, and predicted outcomes under each exposure level
        a_obs = self.df[self.exposure]
        y_obs = self.df[self.outcome]
        py_a1 = self.df['_pY1_']
        py_a0 = self.df['_pY0_']

        if self._fit_missing_:
            ps_g1 = self.df['_g1_'] * self.df['_ipmw_a1_']
            ps_g0 = self.df['_g0_'] * self.df['_ipmw_a0_']
        else:
            ps_g1 = self.df['_g1_']
            ps_g0 = self.df['_g0_']

        # Doubly robust estimator under all treated
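        # The np.where expressions here and below are the AIPW pseudo-outcomes. With no
        # missing-outcome weighting (so ps_g0 = 1 - ps_g1), dr_a1 is the usual
        # influence-function form A*Y/g1 - (A - g1)/g1 * E[Y|A=1,W]: for treated rows
        # Y/g1 minus the correction (g0/g1)*E[Y|A=1,W], and for untreated rows just the
        # outcome-model prediction (symmetrically for dr_a0).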
        dr_a1 = np.where(a_obs == 1,
                         (y_obs / ps_g1) - ((py_a1 * ps_g0) / ps_g1), py_a1)

        # Doubly robust estimator under all untreated
        dr_a0 = np.where(a_obs == 1, py_a0,
                         (y_obs / ps_g0 - ((py_a0 * ps_g1) / ps_g0)))

        # Generating estimates for the risk difference and risk ratio
        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

        if self._weight_ is None:
            if self._continuous_outcome:
                self.average_treatment_effect = np.nanmean(dr_a1) - np.nanmean(
                    dr_a0)
                var_ic = np.nanvar(
                    (dr_a1 - dr_a0) - self.average_treatment_effect,
                    ddof=1) / self.df.shape[0]
                self.average_treatment_effect_se = np.sqrt(var_ic)
                self.average_treatment_effect_ci = [
                    self.average_treatment_effect - zalpha * np.sqrt(var_ic),
                    self.average_treatment_effect + zalpha * np.sqrt(var_ic)
                ]

            else:
                self.risk_difference = np.nanmean(dr_a1) - np.nanmean(dr_a0)
                self.risk_ratio = np.nanmean(dr_a1) / np.nanmean(dr_a0)
                var_ic = np.nanvar((dr_a1 - dr_a0) - self.risk_difference,
                                   ddof=1) / self.df.shape[0]
                self.risk_difference_se = np.sqrt(var_ic)
                self.risk_difference_ci = [
                    self.risk_difference - zalpha * np.sqrt(var_ic),
                    self.risk_difference + zalpha * np.sqrt(var_ic)
                ]
        else:
            dr_m1 = DescrStatsW(dr_a1, weights=self.df[self._weight_]).mean
            dr_m0 = DescrStatsW(dr_a0, weights=self.df[self._weight_]).mean

            if self._continuous_outcome:
                self.average_treatment_effect = dr_m1 - dr_m0
            else:
                self.risk_difference = dr_m1 - dr_m0
                self.risk_ratio = dr_m1 / dr_m0
Exemplo n.º 30
0
def test_weightstats_2d_w1():
    x1 = [[1], [2]]
    w1 = [[1], [2]]
    d1 = DescrStatsW(x1, w1)
    print(len(np.array(w1).shape))
    assert (d1.quantile([0.5, 1.0]) == 2).all().all()
Exemplo n.º 31
0
def test_ztest_ztost():
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1) * 21. / 20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    # d2 = DescrStatsW(x1, np.array(w1)*21./20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check this is this difference expected?, see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
Exemplo n.º 32
0
    def _standardized_difference_(df, treatment, var_type, weight, weighted=True):
        """Background function to calculate the standardized mean difference between the treat and untreated for a
        specified variable. Useful for checking whether a confounder was balanced between the two treatment groups
        by the specified IPTW model SMD based on: Austin PC 2011; https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3144483/
        """
        def _categorical_cov_(a, b):
            """Turns out, pandas and numpy don't have the correct covariance matrix I need for categorical variables.
            The covariance matrix is defined as

            S = [S_{kl}] = (P_{1k}*(1-P_{1k}) + P_{2k}*(1-P_{2k})) / 2      if k == l
                           -(P_{1k}*P_{1l} + P_{2k}*P_{2l}) / 2             if k != l
            """
            cv2 = []
            for i, v in enumerate(a):
                cv1 = []
                if i == 0:
                    pass
                else:
                    for j, w in enumerate(b):
                        if j == 0:
                            pass
                        elif i == j:
                            cv1.append((v * (1 - v) + w * (1 - w)) / 2)
                        else:
                            cv1.append((a[i] * a[j] + b[i] * b[j]) / -2)
                    cv2.append(cv1)

            return np.array(cv2)

        # Pulling out relevant data
        dft = df.loc[(df[treatment] == 1) & (df[weight].notnull())].copy()
        dfn = df.loc[(df[treatment] == 0) & (df[weight].notnull())].copy()
        vcols = list(df.columns)
        vcols.remove(treatment)
        vcols.remove(weight)

        if var_type == 'binary':
            if weighted:
                dwt = DescrStatsW(dft[vcols], weights=dft[weight])
                wt = dwt.mean
                dwn = DescrStatsW(dfn[vcols], weights=dfn[weight])
                wn = dwn.mean
            else:
                wt = np.mean(dft[vcols].dropna(), axis=0)
                wn = np.mean(dfn[vcols].dropna(), axis=0)
            return float((wt - wn) / np.sqrt((wt * (1 - wt) + wn * (1 - wn)) / 2))

        elif var_type == 'continuous':
            if weighted:
                dwt = DescrStatsW(dft[vcols], weights=dft[weight], ddof=1)
                wmt = dwt.mean
                wst = dwt.std
                dwn = DescrStatsW(dfn[vcols], weights=dfn[weight], ddof=1)
                wmn = dwn.mean
                wsn = dwn.std
            else:
                dwt = DescrStatsW(dft[vcols], ddof=1)
                wmt = dwt.mean
                wst = dwt.std
                dwn = DescrStatsW(dfn[vcols], ddof=1)
                wmn = dwn.mean
                wsn = dwn.std
            return float((wmt - wmn) / np.sqrt((wst ** 2 + wsn ** 2) / 2))

        elif var_type == 'categorical':
            if weighted:
                wt = np.average(dft[vcols], weights=dft[weight], axis=0)
                wn = np.average(dfn[vcols], weights=dfn[weight], axis=0)
            else:
                wt = np.mean(dft[vcols], axis=0)
                wn = np.mean(dfn[vcols], axis=0)

            t_c = wt - wn
            s_inv = np.linalg.inv(_categorical_cov_(a=wt, b=wn))
            return float(np.sqrt(np.dot(np.transpose(t_c[1:]), np.dot(s_inv, t_c[1:]))))

        else:
            raise ValueError('Not supported')
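A small worked check of the categorical branch above, using hypothetical three-level proportions: build the pooled covariance matrix from the same formula as _categorical_cov_ (reference category dropped) and take the Mahalanobis-type distance.

import numpy as np

p_treat = np.array([0.5, 0.3, 0.2])     # hypothetical category shares, treated group
p_untr = np.array([0.4, 0.4, 0.2])      # hypothetical category shares, untreated group

k = len(p_treat) - 1                    # number of non-reference categories
S = np.empty((k, k))
for i in range(1, k + 1):
    for j in range(1, k + 1):
        if i == j:
            S[i - 1, j - 1] = (p_treat[i] * (1 - p_treat[i])
                               + p_untr[i] * (1 - p_untr[i])) / 2
        else:
            S[i - 1, j - 1] = -(p_treat[i] * p_treat[j]
                                + p_untr[i] * p_untr[j]) / 2

diff = (p_treat - p_untr)[1:]
smd = float(np.sqrt(diff @ np.linalg.inv(S) @ diff))
print(smd)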
Exemplo n.º 33
0
    def test_weightstats_ddof_tests(self):
        # explicit test that ttest and confint are independent of ddof
        # one sample case
        x1_2d = self.x1_2d
        w1 = self.w1

        d1w_d0 = DescrStatsW(x1_2d, weights=w1, ddof=0)
        d1w_d1 = DescrStatsW(x1_2d, weights=w1, ddof=1)
        d1w_d2 = DescrStatsW(x1_2d, weights=w1, ddof=2)

        # check confint independent of user ddof
        res0 = d1w_d0.ttest_mean()
        res1 = d1w_d1.ttest_mean()
        res2 = d1w_d2.ttest_mean()
        # concatenate into one array with np.r_
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        res0 = d1w_d0.ttest_mean(0.5)
        res1 = d1w_d1.ttest_mean(0.5)
        res2 = d1w_d2.ttest_mean(0.5)
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        # check confint independent of user ddof
        res0 = d1w_d0.tconfint_mean()
        res1 = d1w_d1.tconfint_mean()
        res2 = d1w_d2.tconfint_mean()
        assert_almost_equal(res1, res0, 14)
        assert_almost_equal(res2, res0, 14)
Exemplo n.º 34
0
def getMean(array):
    weights = np.ones_like(array)
    stats = DescrStatsW(array, weights=weights, ddof=0)
    return stats.mean
Exemplo n.º 35
0
    def create_stats(self):
        """Compute statistical properties of column variable

        This function computes the statistical properties of values in the
        specified column.  It is called by other functions that use the
        resulting figures to create a statistical overview.
        """

        # reset stats containers
        self.stat_vars = []
        self.stat_vals = {}
        self.print_lines = []
        self.latex_table = []

        # determine column properties
        col_props = self.get_col_props()

        # get value counts
        cnt, var_cnt, dist_cnt = (len(self.col), len(self.col_nn),
                                  self.col.nunique())
        if self.weights_nn is not None:
            cnt, var_cnt = int(sum(self.weights)), int(sum(self.weights_nn))
        for stat_var, stat_val in zip(('count', 'filled', 'distinct'),
                                      (cnt, var_cnt, dist_cnt)):
            self.stat_vars.append(stat_var)
            self.stat_vals[stat_var] = (stat_val, '{:d}'.format(stat_val))
        n_nan = self.col.isnull().sum()
        if n_nan:
            self.stat_vars.append('nan')
            self.stat_vals['nan'] = (n_nan, '{:d}'.format(n_nan))
        # add value counts to print lines
        self.print_lines.append(
            '{}:'.format(self.label if self.label else self.name))
        ratio = (var_cnt / cnt) * 100 if cnt != 0 else 0
        self.print_lines.append('{0:d} entries ({1:.0f}%)'.format(
            var_cnt, ratio))
        self.print_lines.append('{0:d} unique entries'.format(dist_cnt))

        # convert time stamps to integers
        if col_props['is_ts']:
            col_num = self.col_nn.astype(int)
        else:
            col_num = self.col_nn

        # get additional statistics for numeric variables
        if col_props['is_num'] and len(col_num):
            stat_vars = ('mean', 'std', 'min', 'max', 'p01', 'p05', 'p16',
                         'p50', 'p84', 'p95', 'p99')
            quant_probs = (0, 1, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)
            #stat_vals = (col_num.mean(), col_num.std(), col_num.min(), col_num.max())\
            #            + tuple(col_num.quantile((0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99)))
            # two lines below also work if weights are None
            des = DescrStatsW(col_num, self.weights_nn)
            stat_vals = (des.mean, des.std) + tuple(
                weighted_quantile(col_num, self.weights_nn, quant_probs))
            self.stat_vars += stat_vars
            for stat_var, stat_val in zip(stat_vars, stat_vals):
                if not col_props['is_ts']:
                    # value entry for floats and integers
                    self.stat_vals[stat_var] = (stat_val,
                                                '{:+g}'.format(stat_val))
                else:
                    if stat_var != 'std':
                        # display time stamps as date/time strings
                        self.stat_vals[stat_var] = (pd.Timestamp(
                            int(stat_val)), str(pd.Timestamp(int(stat_val))))
                    else:
                        # display time-stamp range as number of days
                        stat_val /= NUM_NS_DAY
                        self.stat_vals[stat_var] = (stat_val,
                                                    '{:g}'.format(stat_val))

            # append statistics to print lines
            name_len = max(len(n) for n in stat_vars)
            for stat_var in stat_vars:
                self.print_lines.append(
                    '{{0:{:d}s}} : {{1:s}}'.format(name_len).format(
                        stat_var, self.stat_vals[stat_var][1]))
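As an aside, DescrStatsW also exposes a weighted quantile method (used elsewhere in these examples), which could serve as an alternative to the external weighted_quantile helper; weighted-quantile definitions differ, so exact agreement is not guaranteed. A minimal sketch with hypothetical values:

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])     # hypothetical column values
weights = np.array([1.0, 1.0, 2.0, 1.0, 1.0])    # hypothetical observation weights

des = DescrStatsW(values, weights=weights)
print(des.mean, des.std)
print(des.quantile([0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], return_pandas=False))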
Exemplo n.º 36
0
porcentagem = (sum(intervalos) / total) * 100
porcentagemStr = '%.2f' % (porcentagem) + "%"

intervalos2 = DF.loc[DF['Ponto médio'] < 160, ['Nº de Mulheres']].values
qtdMulheres = sum(intervalos2)

intervalos3 = DF.loc[DF['Ponto médio'] >= 100, ['Nº de Mulheres']].values
qtdMulheres = sum(intervalos3)
porcentagem2 = (sum(intervalos3) / total) * 100
porcentagemStr2 = '%.2f' % (porcentagem2) + "%"

#DF['Nº de Mulheres'].idxmax()

pontoM = DF['Ponto médio'].values
numMulheres = DF['Nº de Mulheres'].values
calcP = DescrStatsW(pontoM, numMulheres)
desvioP = calcP.std
desvioStr = '%.3f' % (desvioP)

mediaPonderada = calcP.mean
coeficientV = (desvioP / mediaPonderada) * 100
coeficientStr = '%.3f' % (coeficientV) + '%'

# histogram for question 1

NumeroMulheres = DF['Nº de Mulheres']
pressaoS = DF['Ponto médio']

# NumeroMulheres = DF['Nº de Mulheres']
# pressaoS = DF['Ponto médio']
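A quick sanity check of the grouped-data computation above, with hypothetical class midpoints and integer counts: frequency-weighted DescrStatsW statistics (ddof=0) match the ordinary statistics of the expanded data.

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

midpoints = np.array([110.0, 130.0, 150.0, 170.0])   # hypothetical class midpoints
counts = np.array([5, 12, 8, 3])                     # hypothetical frequencies

d = DescrStatsW(midpoints, weights=counts, ddof=0)
expanded = np.repeat(midpoints, counts)

assert np.isclose(d.mean, expanded.mean())
assert np.isclose(d.std, expanded.std())
print(d.mean, d.std, d.std / d.mean * 100)           # mean, std, coefficient of variation (%)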
Exemplo n.º 37
0
    def _standardized_difference(self, variable, var_type, weighted=True):
        """Calculates the standardized mean difference between the treat/exposed and untreated/unexposed for a
        specified variable. Useful for checking whether a confounder was balanced between the two treatment groups
        by the specified IPTW model SMD based on: Austin PC 2011; https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3144483/

        For efficiency, it is recommended you use standardized_mean_differences(). That function calculates the
        standardized mean differences for all variables included in the denominator

        Parameters
        ---------------
        variable : str, list
            Label for variable to calculate the standardized difference. If categorical variables, it should be a list
            of variable labels
        var_type : str
            Variable type. Options are 'binary' 'continuous' or 'categorical'. For categorical variable should be a
            list of columns labels
        weighted : bool, optional
            Whether to return the weighted standardized mean difference or the unweighted. Default is to return the
            weighted.

        Returns
        --------------
        float
            The (weighted or unweighted) standardized mean difference for the specified variable
        """
        # Pulling out relevant data
        dft = variable.loc[(variable[self.ex] == 1)
                           & (variable['iptw'].notnull())].copy()
        dfn = variable.loc[(variable[self.ex] == 0)
                           & (variable['iptw'].notnull())].copy()
        # removing self.ex and 'iptw' from vars to calculate for
        vcols = list(variable.columns)
        vcols.remove(self.ex)
        vcols.remove('iptw')

        if var_type == 'binary':
            if weighted:
                dwt = DescrStatsW(dft[vcols], weights=dft['iptw'])
                wt = dwt.mean
                dwn = DescrStatsW(dfn[vcols], weights=dfn['iptw'])
                wn = dwn.mean
            else:
                wt = np.mean(dft[vcols].dropna(), axis=0)
                wn = np.mean(dfn[vcols].dropna(), axis=0)
            return float((wt - wn) / np.sqrt(
                (wt * (1 - wt) + wn * (1 - wn)) / 2))

        if var_type == 'continuous':
            if weighted:
                dwt = DescrStatsW(dft[vcols], weights=dft['iptw'], ddof=1)
                wmt = dwt.mean
                wst = dwt.std
                dwn = DescrStatsW(dfn[vcols], weights=dfn['iptw'], ddof=1)
                wmn = dwn.mean
                wsn = dwn.std
            else:
                dwt = DescrStatsW(dft[vcols], ddof=1)
                wmt = dwt.mean
                wst = dwt.std
                dwn = DescrStatsW(dfn[vcols], ddof=1)
                wmn = dwn.mean
                wsn = dwn.std
            return float((wmt - wmn) / np.sqrt((wst**2 + wsn**2) / 2))

        if var_type == 'categorical':
            if weighted:
                wt = np.average(dft[vcols], weights=dft['iptw'], axis=0)
                wn = np.average(dfn[vcols], weights=dfn['iptw'], axis=0)
            else:
                wt = np.average(dft[vcols], axis=0)
                wn = np.mean(dfn[vcols], axis=0)

            t_c = wt - wn
            s_inv = np.linalg.inv(
                self._categorical_cov(treated=wt, untreated=wn))
            return float(
                np.sqrt(np.dot(np.transpose(t_c[1:]), np.dot(s_inv, t_c[1:]))))
Exemplo n.º 38
0
import numpy as np

np.random.seed(75243)
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)

medias = [temp[0:i].mean() for i in range(1, len(temp))]

plt.plot(medias)

from statsmodels.stats.weightstats import zconfint

zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

from statsmodels.stats.weightstats import DescrStatsW

descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()

"""# Vamos ver o filme 1..."""

filmes = pd.read_csv("movies.csv")
filmes.query("movieId==1")

notas1 = notas.query("movieId == 1")
notas1.head()

ax = sns.distplot(notas1.rating)
ax.set(xlabel="Nota", ylabel="Densidade")
ax.set_title("Distribuição das notas para o Toy Story")

ax = sns.boxplot(notas1.rating)