Пример #1
0
def test_df_extra_ev_value_error():
    """to_matrix should raise NotImplementedError if given a DataFrame and
    extra evidence (for now).

    The test passes only when that exception is raised; completing the call
    without an error is reported as a failure.
    """
    lr = LogisticRegression()
    source_list = ['reach', 'sparser', 'signor']
    cs = CountsScorer(lr, source_list)
    try:
        cs.to_matrix(test_df, extra_evidence=[[5]])
    except NotImplementedError:
        # Expected outcome: nothing more to check
        return
    raise AssertionError('to_matrix should raise NotImplementedError when '
                         'given a DataFrame together with extra_evidence.')
Пример #2
0
def test_fit_df():
    """Fit a CountsScorer on the test DataFrame and check the model is fit."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'medscan', 'trips', 'rlimsp']
    scorer = CountsScorer(model, sources)
    scorer.fit(test_df, y_arr_df)
    # After fitting, coef_ is expected among the model's instance attributes
    fitted_attrs = vars(scorer.model)
    assert 'coef_' in fitted_attrs
Пример #3
0
def test_fit_stmts():
    """Fit a CountsScorer on the test statements and check the model is fit."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources)
    scorer.fit(test_stmts, y_arr_stmts)
    # After fitting, coef_ is expected among the model's instance attributes
    fitted_attrs = vars(scorer.model)
    assert 'coef_' in fitted_attrs
Пример #4
0
def test_extra_evidence_length():
    """Should raise ValueError because the extra_evidence list is not the
    same length as the list of statements.

    The test passes only when ValueError is raised; completing the call
    without an error is reported as a failure.
    """
    lr = LogisticRegression()
    source_list = ['reach', 'sparser', 'signor']
    cs = CountsScorer(lr, source_list)
    # A single entry, regardless of how many statements are passed
    extra_ev = [[5]]
    try:
        cs.stmts_to_matrix(test_stmts, extra_evidence=extra_ev)
    except ValueError:
        # Expected outcome: nothing more to check
        return
    raise AssertionError('stmts_to_matrix should raise ValueError when '
                         'extra_evidence and the statement list differ in '
                         'length.')
Пример #5
0
def test_use_members_with_stmts():
    """use_num_members=True is accepted when converting statements.

    The resulting matrix is expected to have one column per source plus one
    additional column.
    """
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources, use_num_members=True)
    matrix = scorer.to_matrix(test_stmts)
    expected_shape = (len(test_stmts), len(sources)+1)
    assert matrix.shape == expected_shape, \
        'stmt matrix dimensions should match test stmts plus num_members'
Пример #6
0
def test_check_missing_source_counts():
    """Fit on a DataFrame whose source_counts column has been removed.

    NOTE(review): the fit call is expected to error, but nothing in this
    function asserts that or names the exception type -- confirm how the
    expected error is checked.
    """
    model = LogisticRegression()
    sources = ['reach', 'sparser']
    scorer = CountsScorer(model, sources)
    # Remove the source_counts column from the input data
    stripped_df = test_df.drop('source_counts', axis=1)
    # Expected to error
    scorer.fit(stripped_df, y_arr_df)
Пример #7
0
def test_missing_source():
    """Check that all source_apis in training data are in source list.

    NOTE(review): the conversion is expected to error, but nothing in this
    function asserts that or names the exception type -- confirm how the
    expected error is checked.
    """
    model = LogisticRegression()
    sources = ['reach', 'sparser']
    scorer = CountsScorer(model, sources)
    # The test stmts come from signor, which is absent from the source list,
    # so this call is expected to error
    scorer.stmts_to_matrix(test_stmts)
Пример #8
0
def test_extra_evidence_content():
    """Should raise ValueError if extra_evidence list entries are not
    Evidence objects or empty lists.

    The test passes only when ValueError is raised; completing the call
    without an error is reported as a failure.
    """
    lr = LogisticRegression()
    source_list = ['reach', 'sparser', 'signor']
    cs = CountsScorer(lr, source_list)
    # Same length as test_stmts: every entry is [5] except the last, which
    # is an empty list
    extra_ev = ([[5]] * (len(test_stmts) - 1)) + [[]]
    try:
        cs.stmts_to_matrix(test_stmts, extra_evidence=extra_ev)
    except ValueError:
        # Expected outcome: nothing more to check
        return
    raise AssertionError('stmts_to_matrix should raise ValueError when '
                         'extra_evidence entries are not Evidence objects '
                         'or empty lists.')
Пример #9
0
def test_use_members_with_df():
    """Check that we can't set use_num_members when passing a DataFrame.

    NOTE(review): the conversion is expected to error, but nothing in this
    function asserts that or names the exception type -- confirm how the
    expected error is checked.
    """
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources, use_num_members=True)
    # Expected to error: the stmt DataFrame carries no num_members info
    matrix = scorer.to_matrix(test_df)
Пример #10
0
def test_matrix_to_matrix():
    """to_matrix should hand back the very same object when given a matrix."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources)
    # First obtain a matrix by converting the test DataFrame
    matrix = scorer.to_matrix(test_df)
    # Feeding that matrix back in must return the identical object
    round_tripped = scorer.to_matrix(matrix)
    assert round_tripped is matrix, \
        'If passed a numpy array to_matrix should return it back.'
Пример #11
0
def test_check_source_columns():
    """Fit on a DataFrame that has per-source columns but no source_counts.

    The fit call is expected to complete without raising.
    """
    model = LogisticRegression()
    sources = ['reach', 'sparser']
    scorer = CountsScorer(model, sources)
    # Start from the test data without its source_counts column
    per_source_df = test_df.drop('source_counts', axis=1)
    # Give every source in the list its own column, filled with zeros
    for source in sources:
        per_source_df[source] = 0
    # Should not error
    scorer.fit(per_source_df, y_arr_df)
Пример #12
0
def setup_belief(include_more_specific=False):
    """Train a CountsScorer on the curated statements and wrap it in a
    BeliefEngine.

    Parameters
    ----------
    include_more_specific : bool
        Passed through to CountsScorer. When True, the model is also fit
        with extra evidence gathered from each statement's `supports`.

    Returns
    -------
    tuple
        (belief engine, shallow copy of the curated statements, second
        column of predict_proba for those statements computed without
        extra evidence).
    """
    model = LogisticRegression()
    # Use every source found in the curated statements
    all_sources = CountsScorer.get_all_sources(test_stmts_cur)
    scorer = CountsScorer(model, all_sources,
                          include_more_specific=include_more_specific)
    # Collect the evidence of supporting statements only when requested
    extra_evidence = None
    if include_more_specific:
        extra_evidence = []
        for stmt in test_stmts_cur:
            supp_evidence = [ev for supp in stmt.supports
                             for ev in supp.evidence]
            extra_evidence.append(supp_evidence)
    # Fit on the curated data, with extra evidence if there is any
    scorer.fit(test_stmts_cur, y_arr_stmts_cur, extra_evidence)
    # Predict without extra evidence to obtain the prior probabilities
    prior_probs = scorer.predict_proba(test_stmts_cur)[:, 1]
    # Belief engine backed by the trained scorer, so callers can check that
    # it sets these same beliefs on the statements
    engine = BeliefEngine(scorer=scorer)
    # Shallow-copy the statement list so the global list object is not the
    # one handed to callers
    stmts_copy = copy(test_stmts_cur)
    return engine, stmts_copy, prior_probs
Пример #13
0
def test_fit_stmts_predict_stmts():
    """Fit on statements, then check the shapes of all prediction outputs."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources)
    scorer.fit(test_stmts, y_arr_stmts)

    # Class probabilities: one row per statement, one column per class
    proba = scorer.predict_proba(test_stmts)
    two_class_shape = (len(test_stmts), 2)
    assert proba.shape == two_class_shape, \
        'prediction results should have dimension (# stmts, # classes)'

    # Log-probabilities have the same layout
    log_proba = scorer.predict_log_proba(test_stmts)
    two_class_shape = (len(test_stmts), 2)
    assert log_proba.shape == two_class_shape, \
        'prediction results should have dimension (# stmts, # classes)'

    # Hard predictions: a single value per statement
    labels = scorer.predict(test_stmts)
    flat_shape = (len(test_stmts),)
    assert labels.shape == flat_shape, \
        'prediction results should have dimension (# stmts)'
Пример #14
0
def test_hybrid_scorer():
    """Check HybridScorer beliefs against a manual combination of the
    CountsScorer prediction and the simple scorer's HPRD prior probabilities.
    """
    # --- Trained CountsScorer covering the reader sources ---
    model = LogisticRegression()
    # NOTE: the returned list is not used below; the call is kept so that
    # the sequence of calls is unchanged
    source_list = CountsScorer.get_all_sources(test_stmts_cur)
    # Per the original author, the curated sample only has evidence from
    # trips, sparser, medscan, hprd and reach. hprd is left to the simple
    # scorer; the remaining sources go to the CountsScorer.
    skl_sources = ['trips', 'sparser', 'medscan', 'reach']
    counts_scorer = CountsScorer(model, skl_sources)
    counts_scorer.fit(test_stmts_cur, y_arr_stmts_cur)
    # Keep the skl-predicted beliefs for the comparison below
    cs_beliefs = counts_scorer.predict_proba(test_stmts_cur)[:, 1]

    # --- Default simple scorer and its prior probabilities for HPRD ---
    simple_scorer = default_scorer
    hprd_rand = simple_scorer.prior_probs['rand']['hprd']
    hprd_syst = simple_scorer.prior_probs['syst']['hprd']

    # --- Hybrid scorer combining the two ---
    hybrid = HybridScorer(counts_scorer, simple_scorer)
    # Make sure all sources are accounted for
    hybrid.check_prior_probs(test_stmts_cur)
    hybrid_beliefs = hybrid.score_statements(test_stmts_cur)

    # --- Expected belief for every statement, computed by hand ---
    expected_beliefs = []
    for ix, stmt in enumerate(test_stmts_cur):
        stmt_sources = Counter(ev.source_api for ev in stmt.evidence)
        if 'hprd' in stmt_sources:
            # Fold the HPRD evidences into the skl-predicted belief
            hprd_count = stmt_sources['hprd']
            print("hprd_count", hprd_count)
            hprd_belief = 1 - (hprd_syst + hprd_rand**hprd_count)
            expected_beliefs.append(1 - (1 - cs_beliefs[ix]) *
                                    (1 - hprd_belief))
            print(expected_beliefs[ix], hybrid_beliefs[ix])
        else:
            # No HPRD evidence: the skl-predicted belief is used unchanged
            expected_beliefs.append(cs_beliefs[ix])

    assert np.allclose(hybrid_beliefs, expected_beliefs)
Пример #15
0
def test_df_to_matrix():
    """Check the type and shape of the matrix built from the test DataFrame,
    with and without statement-type columns."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']

    # Sources only: one column per source
    scorer = CountsScorer(model, sources)
    matrix = scorer.df_to_matrix(test_df)
    assert isinstance(matrix, np.ndarray), 'x_arr should be a numpy array'
    sources_only_shape = (len(test_df), len(sources))
    assert matrix.shape == sources_only_shape, \
        'stmt matrix dimensions should match test stmts'

    # With statement type: extra columns, one per entry in stmt_type_map
    typed_scorer = CountsScorer(model, sources, use_stmt_type=True)
    num_types = len(typed_scorer.stmt_type_map)
    typed_matrix = typed_scorer.df_to_matrix(test_df)
    typed_shape = (len(test_df), len(sources) + num_types)
    assert typed_matrix.shape == typed_shape, \
        'matrix should have a col for sources and other cols for every ' \
        'statement type.'
Пример #16
0
def test_stmts_to_matrix():
    """Check the type, shape and column sums of the matrix built from the
    test statements, with and without statement-type columns."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']

    # Sources only: one column per source
    scorer = CountsScorer(model, sources)
    matrix = scorer.stmts_to_matrix(test_stmts)
    assert isinstance(matrix, np.ndarray), 'x_arr should be a numpy array'
    sources_only_shape = (len(test_stmts), len(sources))
    assert matrix.shape == sources_only_shape, \
        'stmt matrix dimensions should match test stmts'
    # The distinct column totals must be exactly 0 and the statement count
    column_totals = set(matrix.sum(axis=0))
    assert column_totals == {0, len(test_stmts)}, \
        'Signor col should be 1 in every row, other cols 0.'

    # With statement type: extra columns, one per entry in stmt_type_map
    typed_scorer = CountsScorer(model, sources, use_stmt_type=True)
    num_types = len(typed_scorer.stmt_type_map)
    typed_matrix = typed_scorer.stmts_to_matrix(test_stmts)
    typed_shape = (len(test_stmts), len(sources) + num_types)
    assert typed_matrix.shape == typed_shape, \
        'matrix should have a col for sources and other cols for every ' \
        'statement type.'
Пример #17
0
def setup_belief():
    """Train a CountsScorer on the curated statements and wrap it in a
    BeliefEngine.

    Returns
    -------
    tuple
        (belief engine, shallow copy of the curated statements, second
        column of predict_proba for those statements).
    """
    model = LogisticRegression()
    # Use every source found in the curated statements
    all_sources = CountsScorer.get_all_sources(test_stmts_cur)
    scorer = CountsScorer(model, all_sources)
    # Fit on the curated statement data
    scorer.fit(test_stmts_cur, y_arr_stmts_cur)
    # Predicted probabilities for the same statements
    predicted = scorer.predict_proba(test_stmts_cur)[:, 1]
    # Belief engine backed by the trained scorer, so callers can check that
    # it sets these same beliefs on the statements
    engine = BeliefEngine(scorer=scorer)
    # Shallow-copy the statement list so the global list object is not the
    # one handed to callers
    stmts_copy = copy(test_stmts_cur)
    return engine, stmts_copy, predicted
Пример #18
0
def test_counts_wrapper():
    """Instantiating a CountsScorer with a model and a source list works."""
    model = LogisticRegression()
    sources = ['reach', 'sparser']
    # Construction alone is the check; the instance is not used further
    scorer = CountsScorer(model, sources)
Пример #19
0
def test_check_df_cols_err():
    """Drop a required column and make sure we get a ValueError.

    The test passes only when ValueError is raised; completing the call
    without an error is reported as a failure.
    """
    lr = LogisticRegression()
    source_list = ['reach', 'sparser', 'signor']
    cw = CountsScorer(lr, source_list)
    # Input data without its stmt_type column
    df_no_type = test_df.drop('stmt_type', axis=1)
    try:
        cw.df_to_matrix(df_no_type)
    except ValueError:
        # Expected outcome: nothing more to check
        return
    raise AssertionError('df_to_matrix should raise ValueError when the '
                         'stmt_type column is missing.')
Пример #20
0
def test_check_df_cols_noerr():
    """Converting the unmodified test DataFrame should not raise ValueError."""
    model = LogisticRegression()
    sources = ['reach', 'sparser', 'signor']
    scorer = CountsScorer(model, sources)
    # Completing without an exception is the whole check
    scorer.df_to_matrix(test_df)