def test_when_some_vars_are_in_the_same_time_window():
    """An edge between two nodes with the same time suffix gets no arrow."""
    node_x, node_y = 'X_t=2', 'Y_t=2'
    graph = MarkedPatternGraph(
        nodes=[node_x, node_y],
        undirected_edges=[(node_x, node_y)],
    )

    orienter = TimeEdgeOrienter(graph)
    orienter.orient()

    # Orienting must not have produced any unmarked arrow.
    assert graph.get_unmarked_arrows() == set()
示例#2
0
def test_equals_same():
    """Two graphs built from identical arguments compare equal."""
    var_names = ['a', 'b', 'c', 'd', 'e']

    def build_graph():
        # Fresh edge containers on every call; only var_names is shared.
        return MarkedPatternGraph(
            nodes=var_names,
            marked_arrows=[('c', 'MI_b')],
            undirected_edges=[
                ('a', 'b'),
                ('b', 'c'),
                ('e', 'd'),
                ('d', 'c'),
                ('b', 'd'),  # extraneous edge
            ],
            unmarked_arrows=[('a', 'e')],
            bidirectional_edges=[('a', 'b')])

    first = build_graph()
    second = build_graph()

    assert first == second
示例#3
0
def test_equals_undirected_edges_diff():
    """Graphs that differ by one undirected edge compare unequal."""
    var_names = ['a', 'b', 'c', 'd', 'e']
    shared_edges = [('a', 'b'), ('b', 'c'), ('e', 'd'), ('d', 'c')]

    with_extra_edge = MarkedPatternGraph(
        nodes=var_names,
        marked_arrows=[('c', 'MI_b')],
        # Same edges as the other graph, plus b-d.
        undirected_edges=shared_edges + [('b', 'd')],
        unmarked_arrows=[],
        bidirectional_edges=[],
    )

    without_extra_edge = MarkedPatternGraph(
        nodes=var_names,
        marked_arrows=[('c', 'MI_b')],
        undirected_edges=list(shared_edges),
        unmarked_arrows=[],
        bidirectional_edges=[],
    )

    assert with_extra_edge != without_extra_edge
示例#4
0
def test_has_path_when_there_is_a_longer_one():
    """x and y are connected only through z (x - z, then z -> y)."""
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z'],
        marked_arrows=[],
        undirected_edges=[('x', 'z')],
        unmarked_arrows=[('z', 'y')],
        bidirectional_edges=[],
    )

    path_found = graph.has_path(('x', 'y'))

    assert path_found == True
示例#5
0
def test_has_path_when_there_are_none():
    """With no edges of any kind, x and y have no path between them."""
    graph = MarkedPatternGraph(
        nodes=['x', 'y'],
        marked_arrows=[],
        undirected_edges=[],
        unmarked_arrows=[],
        bidirectional_edges=[],
    )

    path_found = graph.has_path(('x', 'y'))

    assert path_found == False
示例#6
0
def test_bidirectional_edges():
    """Arrowheads added at both ends of a-b leave one bidirectional edge."""
    graph = MarkedPatternGraph(nodes=['a', 'b'])
    graph.add_undirected_edge(('a', 'b'))

    # One arrowhead per direction.
    for arrow in (('a', 'b'), ('b', 'a')):
        graph.add_arrowhead(arrow)

    # The edge must no longer be reported under any other edge category.
    assert set(graph.get_undirected_edges()) == set()
    assert set(graph.get_unmarked_arrows()) == set()
    assert set(graph.get_marked_arrows()) == set()
    assert set(graph.get_bidirectional_edges()) == {frozenset({'a', 'b'})}
示例#7
0
def test_has_arrowhead_with_marked_arrowhead():
    """A marked arrowhead is reported by both arrowhead queries."""
    edge = ('a', 'b')
    graph = MarkedPatternGraph(nodes=['a', 'b'])
    graph.add_undirected_edge(('a', 'b'))

    # A plain undirected edge carries no arrowhead yet.
    before = graph.has_arrowhead(edge)
    assert before == False

    graph.add_marked_arrowhead(('a', 'b'))

    after = graph.has_arrowhead(edge)
    assert after == True
    after_marked = graph.has_marked_arrowhead(edge)
    assert after_marked == True
示例#8
0
def test_remove_undirected_edge_when_not_exist():
    """Removing an edge that was never added leaves the graph without edges."""
    graph = MarkedPatternGraph(nodes=['a', 'b'])
    graph.remove_undirected_edge(('a', 'b'))

    # Every edge accessor is expected to come back empty.
    edge_getters = (
        graph.get_undirected_edges,
        graph.get_unmarked_arrows,
        graph.get_marked_arrows,
        graph.get_bidirectional_edges,
        graph.get_edges,
    )
    for getter in edge_getters:
        assert set(getter()) == set()
def test_immorality_across_time():
    """The arrow coming from the later-labelled node becomes bidirectional."""
    early, middle, late = 'X_t=1', 'Y_t=2', 'X_t=3'

    # Before: X_t=1 --> Y_t=2 <-- X_t=3
    graph = MarkedPatternGraph(
        nodes=[early, middle, late],
        unmarked_arrows=[(early, middle), (late, middle)],
    )

    orienter = TimeEdgeOrienter(graph)
    orienter.orient()

    # After: X_t=1 --> Y_t=2 <--> X_t=3
    assert graph.get_unmarked_arrows() == {(early, middle)}

    expected_bidirectional = {frozenset({late, middle})}
    assert graph.get_bidirectional_edges() == expected_bidirectional
示例#10
0
def test_long_chains_collider_bias_with_MI(
        df_long_chains_and_collider_with_MI,
        df_long_chains_and_collider_without_MI):
    """Compare P(b | d) in complete data with P(b | d) after correction.

    The same four conditional frequencies are checked twice: first on the
    fixture without missing data, then on the output of
    DensityRatioWeightedCorrection applied to the fixture with missing data.
    """
    size = 10000
    var_names = ['a', 'b', 'c', 'd', 'e']
    p_b = 0.175

    graph = MarkedPatternGraph(
        nodes=var_names,
        marked_arrows=[('c', 'MI_b')],
        undirected_edges=[
            ('a', 'b'),
            ('b', 'c'),
            ('e', 'd'),
            ('d', 'c'),
            ('b', 'd'),  # extraneous edge
        ])

    df_no_missing = df_long_chains_and_collider_without_MI(size=size)
    # Helper column so that .count() yields a usable 'count' series.
    df_no_missing['count'] = 0

    assert df_no_missing['b'].mean() == approx(p_b, abs=0.01)

    joint_counts = df_no_missing.groupby(['b', 'd']).count()
    counts_per_d = df_no_missing.groupby('d').count()
    no_missing_counts = (joint_counts / counts_per_d)['count']

    # B & D are marginally independent
    expected_without_missing = [
        (False, False, 1 - p_b),
        (False, True, 1 - p_b),
        (True, False, p_b),
        (True, True, p_b),
    ]
    for b_value, d_value, expected in expected_without_missing:
        observed = no_missing_counts.xs(
            [b_value, d_value], level=['b', 'd']).values[0]
        assert observed == approx(expected, abs=0.02)

    correction = DensityRatioWeightedCorrection(
        data=df_long_chains_and_collider_with_MI(size=size),
        var_names=['b', 'd'],
        graph=graph)
    corrected_df = correction.correct()

    corrected_df['count'] = 0

    corrected_joint = corrected_df.groupby(['b', 'd']).count()
    corrected_per_d = corrected_df.groupby('d').count()
    corrected_df_counts = (corrected_joint / corrected_per_d)['count']

    # B & D are marginally independent
    expected_after_correction = [
        (0, False, 1 - p_b),
        (0, True, 1 - p_b),
        (1, False, p_b),
        (1, True, p_b),
    ]
    for b_value, d_value, expected in expected_after_correction:
        observed = corrected_df_counts.xs(
            [b_value, d_value], level=['b', 'd']).values[0]
        assert observed == approx(expected, abs=0.02)
def test_firing_squad_example():
    """The finder reports no potentially extraneous edge for this graph."""
    edge_pairs = (
        ('captain', 'rifle_person_1'),
        ('captain', 'rifle_person_2'),
        ('rifle_person_1', 'death'),
        ('rifle_person_2', 'death'),
    )
    undirected_edges = [frozenset(pair) for pair in edge_pairs]

    marked_pattern_graph = MarkedPatternGraph(
        nodes=[
            'captain',
            'rifle_person_1',
            'rifle_person_2',
            'death',
            'MI_captain',
        ],
        marked_arrows=[('death', 'MI_captain')],
        undirected_edges=undirected_edges,
    )

    finder = PotentiallyExtraneousEdgesFinder(
        marked_pattern_graph=marked_pattern_graph)
    found_edges = finder.find()

    assert found_edges == set()
示例#12
0
def test_deterministic_cause_of_missingness():
    """Smoke test: correction runs when x fully determines missingness of z.

    Every row with ``x == 1`` has ``z`` set to NaN.  The test passes as long
    as ``DensityRatioWeightedCorrection(...).correct()`` raises nothing.
    """
    size = 1000
    x = np.random.binomial(n=1, p=0.6, size=size)
    y = np.random.binomial(n=1, p=0.3, size=size)
    z = np.random.binomial(n=1, p=0.3, size=size)

    # Positions where x == 1; with the default integer index built below
    # these positions are also the row labels.
    missing = np.where(x == 1)[0]

    df = pd.DataFrame({
        'x': x,
        'y': y,
        'z': z,
    })

    # Use .loc rather than .at: .at is the single-cell accessor, whereas
    # `missing` is an array of many row labels.
    df.loc[missing, 'z'] = np.nan

    graph = MarkedPatternGraph(nodes=['x', 'y', 'z', 'MI_z'],
                               marked_arrows=[('x', 'MI_z')])

    # Reaching the end of the test without an exception is the whole check.
    DensityRatioWeightedCorrection(data=df,
                                   var_names=['x', 'y', 'z'],
                                   graph=graph).correct()
示例#13
0
def test_long_chains_and_collider_with_MI(df_long_chains_and_collider_with_MI):
    """Of three candidate edges, only b-d is reported as removable."""
    df = df_long_chains_and_collider_with_MI(size=1000, proba_noise=0.6)

    node_names = list(set(df.columns).union({'MI_b'}))
    skeleton = {
        frozenset({'b', 'a'}),
        frozenset({'d', 'e'}),
        frozenset({'d', 'c'}),
        frozenset({'b', 'c'}),
        frozenset({'d', 'b'}),
    }
    graph = MarkedPatternGraph(
        nodes=node_names,
        undirected_edges=skeleton,
        marked_arrows=[('c', 'MI_b')],
    )

    cond_sets = ConditioningSets()
    candidate_edges = {
        frozenset({'d', 'b'}),
        frozenset({'d', 'c'}),
        frozenset({'b', 'c'}),
    }

    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=candidate_edges,
        data_correction=DensityRatioWeightedCorrection,
    )
    removables = finder.find()

    # The finder must have recorded something under the key for (b, d).
    assert cond_sets[key_for_pair(('b', 'd'))] != set()

    assert set(removables) == {frozenset({'b', 'd'})}
def test_3_multinom_RVs_MAR(df_Z_causes_X_Y_and_X_Z_causes_MI_Y):
    """Both z and x are reported as direct causes of MI_y."""
    size = 1000
    df = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=size)

    graph = MarkedPatternGraph(nodes=['x', 'y', 'z'])
    finder = DirectCausesOfMissingnessFinder(data=df, graph=graph)

    marked_arrows = finder.find()

    # Compared as sets, so the order of the returned arrows is irrelevant.
    expected_arrows = {('z', 'MI_y'), ('x', 'MI_y')}
    assert set(marked_arrows) == expected_arrows
def test_when_marked_path_exists():
    """With marked arrows a -*> b -*> c in place, a - c is oriented a -> c."""
    #   a -*> b -*> c
    #   |___________|      (a - c starts out undirected)
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c'])

    for edge in (('a', 'b'), ('b', 'c'), ('a', 'c')):
        graph.add_undirected_edge(edge)

    for arrow in (('a', 'b'), ('b', 'c')):
        graph.add_marked_arrowhead(arrow)

    orienter = RecursiveEdgeOrienter(marked_pattern_graph=graph)
    orienter.orient()

    assert graph.get_unmarked_arrows() == {('a', 'c')}
def test_simple():
    """After a -> b <- c is set up, b - d is oriented as a marked arrow."""
    #  a     c
    #   \   /
    #    v v
    #     b
    #     |
    #     d
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c', 'd'])

    for edge in (('a', 'b'), ('c', 'b'), ('b', 'd')):
        graph.add_undirected_edge(edge)

    # Arrowheads into b from both a and c.
    for arrow in (('a', 'b'), ('c', 'b')):
        graph.add_arrowhead(arrow)

    orienter = RecursiveEdgeOrienter(marked_pattern_graph=graph)
    orienter.orient()

    assert graph.get_marked_arrows() == {('b', 'd')}
def test_mcar():
    """A graph with no edges has no potentially extraneous edges."""
    graph_without_edges = MarkedPatternGraph(nodes=['X', 'Y', 'MI_x'])

    finder = PotentiallyExtraneousEdgesFinder(
        marked_pattern_graph=graph_without_edges)
    found_edges = finder.find()

    assert found_edges == set()
示例#18
0
def test_add_marked_arrows():
    """A marked arrow between nodes not listed at construction is kept."""
    graph = MarkedPatternGraph(
        nodes=['a', 'b'],
        undirected_edges=[('a', 'b')],
    )

    # Neither 'c' nor 'd' was passed in `nodes` above.
    graph.add_marked_arrow(('c', 'd'))

    assert graph.get_marked_arrows() == {('c', 'd')}

    expected_edges = {frozenset({'a', 'b'}), frozenset({'c', 'd'})}
    assert set(graph.get_edges()) == expected_edges
示例#19
0
def test_simple():
    """get_edges() reports each edge once, regardless of its arrowheads."""
    #  a     c
    #   \   /
    #    v v
    #     b
    #     |
    #     d
    graph = MarkedPatternGraph(nodes=['a', 'b', 'c', 'd'])

    for edge in (('a', 'b'), ('c', 'b'), ('b', 'd')):
        graph.add_undirected_edge(edge)

    for arrow in (('a', 'b'), ('c', 'b')):
        graph.add_arrowhead(arrow)

    expected_edges = {
        frozenset({'a', 'b'}),
        frozenset({'b', 'c'}),
        frozenset({'b', 'd'}),
    }
    assert graph.get_edges() == expected_edges
示例#20
0
    def find(self):
        """
            Test every unordered pair of columns in self.orig_cols for
            conditional independence and build the skeleton from the result.

            For each pair, the candidate conditioning variables are all the
            other columns in self.orig_cols. A pair for which no conditioning
            set is found becomes an undirected edge; otherwise the sets found
            are recorded for that pair.

            Returns:
                marked_pattern: MarkedPatternGraph
                    Its nodes are the columns of self.data and its undirected
                    edges are the pairs with no conditioning set found. It can
                    be used for later steps, such as finding immoralities.

                cond_sets_satisfying_cond_indep: dict

                    key:
                        The value of key_for_pair([var_1, var_2]) for a pair
                        that has at least one conditioning set.

                    value:
                        Whatever
                        conditioning_sets_satisfying_conditional_independence
                        returned for that pair.
        """
        skeleton_edges = []
        sets_by_pair = {}

        for pair in combinations(self.orig_cols, 2):
            first, second = pair

            # Everything except the two variables under test.
            candidates = set(self.orig_cols).difference(pair)

            found_sets = conditioning_sets_satisfying_conditional_independence(
                data=self.data,
                var_name_1=first,
                var_name_2=second,
                cond_indep_test=self.cond_indep_test,
                possible_conditioning_set_vars=candidates,
                only_find_one=self.only_find_one,
                max_depth=self.max_depth)

            if len(found_sets) != 0:
                sets_by_pair[key_for_pair([first, second])] = found_sets
                continue

            # Nothing separates the pair, so the edge stays in the skeleton.
            skeleton_edges.append(frozenset(pair))

        skeleton = MarkedPatternGraph(
            nodes=list(self.data.columns),
            undirected_edges=skeleton_edges,
        )

        return skeleton, sets_by_pair
def test_2_multinom_RVs_MAR(df_X_Y_indep_Y_causes_MI_X):
    """y is reported as the only direct cause of MI_x."""
    df = df_X_Y_indep_Y_causes_MI_X(size=2000)

    finder = DirectCausesOfMissingnessFinder(
        data=df,
        graph=MarkedPatternGraph(nodes=['x', 'y']),
    )

    found_arrows = finder.find()

    # Compared against a list, so the return value must equal this list.
    assert found_arrows == [('y', 'MI_x')]
def test_2_multinom_RVs_MCAR(df_2_multinomial_indep_RVs):
    """No cause of missingness is reported when x is blanked out at random.

    Rows of ``x`` are set to NaN according to an independent Bernoulli(0.3)
    draw, so the finder is expected to return no marked arrows.
    """
    size = 2000
    df = df_2_multinomial_indep_RVs(size=size)

    missingness_of_x = np.random.binomial(n=1, p=0.3, size=size)
    missingness_indices = np.where(missingness_of_x == 1)

    # Use .loc rather than .at: .at is the single-cell accessor, whereas this
    # assigns to many rows at once.
    # NOTE(review): assumes the fixture's frame has a default 0..size-1 index
    # so these positions are valid row labels - confirm against the fixture.
    df.loc[missingness_indices[0], 'x'] = np.nan

    graph = MarkedPatternGraph(nodes=['x', 'y'])

    direct_causes_of_missingness_finder = DirectCausesOfMissingnessFinder(
        data=df, graph=graph)

    marked_arrows = direct_causes_of_missingness_finder.find()

    assert marked_arrows == []
示例#23
0
def test_chain_and_collider_with_MI(
    df_chain_and_collider_with_MI
):
    """The a-c edge is reported as removable after data correction.

    The graph is given an a-c edge marked as extraneous together with a
    marked arrow d -> MI_a; the finder is asked about a-c only.
    """
    size = 10000

    df = df_chain_and_collider_with_MI(size=size)

    cond_sets = ConditioningSets()

    # NOTE(review): the extra node added here is 'MI_y', while the marked
    # arrow below targets 'MI_a' - looks like a copy-paste slip; confirm
    # which name is intended before changing it.
    graph = MarkedPatternGraph(
        nodes=list(set(df.columns).union(set({'MI_y'}))),
        undirected_edges=set({
            frozenset({'a', 'c'}), # extraneous edge
            frozenset({'a', 'b'}),
            frozenset({'b', 'c'}),
            frozenset({'a', 'd'}),
            frozenset({'c', 'd'}),
        }),
        marked_arrows=[
            ('d', 'MI_a')
        ]
    )

    # a-c is present at this intermediate stage; it is spurious, due to
    # collider bias.
    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=set({
            frozenset({'a', 'c'}),
        }),
        data_correction=DensityRatioWeightedCorrection,
    )

    removables = finder.find()

    # The finder must have recorded something under the key for (a, c).
    assert cond_sets[key_for_pair(('a', 'c'))] != set({})

    assert set(removables) == set({ frozenset({'a', 'c'}) })
def test_two_causes_MI_collider():
    """With y and z both pointing into MI_x, z-y is potentially extraneous."""
    undirected_edges = [frozenset(('z', 'y'))]

    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        marked_arrows=[('y', 'MI_x'), ('z', 'MI_x')],
        undirected_edges=undirected_edges,
    )

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    found_edges = finder.find()

    # Every undirected edge of the graph is expected back.
    assert found_edges == set(undirected_edges)
def test_marked_arrow_exists_with_no_MI():
    """A marked arrow x -> y that targets no MI_ node flags no edge."""
    undirected_edges = [frozenset(('z', 'y'))]

    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        marked_arrows=[('x', 'y')],
        undirected_edges=undirected_edges,
    )

    finder = PotentiallyExtraneousEdgesFinder(marked_pattern_graph=graph)
    found_edges = finder.find()

    assert found_edges == set()
示例#26
0
def test_missing_data_because_of_ses():
    """Reweighting recovers the mean of b when missingness depends on ses."""
    size = 10000

    ses = np.random.binomial(n=1, p=0.3, size=size)

    # Value of b, drawn separately for each ses level.
    b_when_low = np.random.binomial(n=1, p=0.4, size=size)
    b_when_high = np.random.binomial(n=1, p=0.9, size=size)

    # Missingness flag of b, again drawn per ses level.
    missing_when_low = np.random.binomial(n=1, p=0.5, size=size)
    missing_when_high = np.random.binomial(n=1, p=0.1, size=size)

    # Pick the high-ses draw where ses == 1, the low-ses draw elsewhere.
    b = ses * b_when_high + (ses == 0) * b_when_low
    missing = ses * missing_when_high + (ses == 0) * missing_when_low

    # true mean
    assert b.mean() == approx(0.55, abs=0.015)

    # Those with lower SES are more likely to be missing.
    missing_index = np.where(missing == 1)[0]

    df_with_missing_data = pd.DataFrame({'ses': ses, 'b': b})
    df_with_missing_data.loc[missing_index, 'b'] = np.nan

    # A naive analysis leads to an overestimate.
    naive_mean = df_with_missing_data['b'].mean()
    assert naive_mean == approx(0.62, abs=0.015)

    graph = MarkedPatternGraph(
        nodes=['ses', 'b', 'MI_b'],
        marked_arrows=[('ses', 'MI_b')],
        undirected_edges=[('ses', 'b')],
    )

    corrector = DensityRatioWeightedCorrection(
        data=df_with_missing_data,
        var_names=['ses', 'b', 'MI_b'],
        graph=graph,
    )

    # reweight data before running statistics on it
    reweighted_df = corrector.correct()

    # we're able to recover the true mean
    reweighted_mean = reweighted_df['b'].mean()
    assert reweighted_mean == approx(0.55, abs=0.015)
def test_chain_and_collider_with_MI(df_chain_and_collider_with_MI):
    """d is reported as the only direct cause of MI_a."""
    df = df_chain_and_collider_with_MI(size=10000)

    skeleton = [
        {'a', 'b'},
        {'b', 'c'},
        {'c', 'd'},
        {'a', 'd'},
    ]
    graph = MarkedPatternGraph(nodes=df.columns, undirected_edges=skeleton)

    finder = DirectCausesOfMissingnessFinder(data=df, graph=graph)
    found_arrows = finder.find()

    # Compared as frozensets, so the order of the returned arrows is ignored.
    assert frozenset(found_arrows) == frozenset({('d', 'MI_a')})
示例#28
0
def test_3_multinom_RVs_MAR(
    df_Z_causes_X_Y_and_X_Z_causes_MI_Y
):
    """The x-y edge is reported as removable after data correction."""
    df = df_Z_causes_X_Y_and_X_Z_causes_MI_Y(size=1000)

    node_names = list(set(df.columns).union({'MI_y'}))
    skeleton = {
        frozenset({'x', 'y'}),  # extraneous edge
        frozenset({'x', 'z'}),
        frozenset({'z', 'y'}),
    }
    graph = MarkedPatternGraph(
        nodes=node_names,
        undirected_edges=skeleton,
        marked_arrows=[('x', 'MI_y'), ('z', 'MI_y')],
    )

    cond_sets = ConditioningSets()

    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges={frozenset({'x', 'y'})},
        data_correction=DensityRatioWeightedCorrection,
    )
    removables = finder.find()

    assert set(removables) == {frozenset({'x', 'y'})}

    # The finder must have recorded something under the key for (x, y).
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
示例#29
0
def test_cond_on_collider(df_X_and_Y_cause_Z_and_Z_cause_MI_X):
    """The extraneous x-y edge is the single removable edge reported."""
    df = df_X_and_Y_cause_Z_and_Z_cause_MI_X(size=2000)

    cond_sets = ConditioningSets()

    # extraneous edge x-y
    skeleton = [{'x', 'y'}, {'x', 'z'}, {'y', 'z'}]
    graph = MarkedPatternGraph(
        nodes=['x', 'y', 'z', 'MI_x'],
        undirected_edges=skeleton,
        marked_arrows=[('z', 'MI_x')],
    )

    finder = RemovableEdgesFinder(
        data=df,
        cond_sets=cond_sets,
        graph=graph,
        potentially_extraneous_edges=[{'x', 'y'}],
        data_correction=DensityRatioWeightedCorrection,
    )
    removables = finder.find()

    # Compared against a list, so the return value must equal this list.
    assert removables == [{'x', 'y'}]
    assert cond_sets[key_for_pair(('x', 'y'))] != set()
def test_firing_squad():
    """Orienting marks 'prisoner shot' -> 'prisoner death'.

    Both rifle persons get an arrowhead into 'prisoner shot'; after
    RecursiveEdgeOrienter runs, the one remaining edge out of that node is
    expected to be the only marked arrow.
    """
    graph = MarkedPatternGraph(nodes=[
        'captain', 'rifle_person_1', 'rifle_person_2', 'prisoner shot',
        'prisoner death'
    ])

    graph.add_undirected_edge(('captain', 'rifle_person_1'))
    graph.add_undirected_edge(('captain', 'rifle_person_2'))
    graph.add_undirected_edge(('rifle_person_1', 'prisoner shot'))
    graph.add_undirected_edge(('rifle_person_2', 'prisoner shot'))
    graph.add_undirected_edge(('prisoner shot', 'prisoner death'))

    # rifle_person_1 --> prisoner shot <-- rifle_person_2
    graph.add_arrowhead(('rifle_person_1', 'prisoner shot'))
    graph.add_arrowhead(('rifle_person_2', 'prisoner shot'))

    RecursiveEdgeOrienter(marked_pattern_graph=graph).orient()

    assert graph.get_marked_arrows() == set({('prisoner shot',
                                              'prisoner death')})