Пример #1
0
def test_init_and_fit(full_graph, mocker):
    """Exercise ``_init`` followed by ``_fit_after_init`` on the full graph.

    After ``_init`` the test checks that ``t.extract_deprecated`` was called
    once with the graph, that ``handlers.title_case_handler`` was called as
    many times as there are entries in ``c.test_labels`` (including one call
    per label text), and that the pipeline contains the expected classifier
    and the first two named feature transformers. Finally it checks the
    arguments forwarded to the pipeline's ``fit``.
    """
    predictor = p.StwfsapyPredictor(
        full_graph, c.test_type_concept, c.test_type_thesaurus, SKOS.broader)
    # The spies are installed after construction and before ``_init``.
    deprecated_spy = mocker.spy(t, "extract_deprecated")
    title_case_spy = mocker.spy(handlers, 'title_case_handler')

    predictor._init()

    deprecated_spy.assert_called_once_with(full_graph)
    handler_calls = title_case_spy.mock_calls
    assert len(handler_calls) == len(c.test_labels)
    for _, label in c.test_labels:
        assert call(label.toPython()) in handler_calls

    steps = predictor.pipeline_.named_steps
    assert isinstance(steps['Classifier'], DecisionTreeClassifier)
    feature_step = steps['Combined Features']
    assert isinstance(feature_step, ColumnTransformer)
    thesaurus_entry = feature_step.transformers[0]
    text_entry = feature_step.transformers[1]
    assert thesaurus_entry[0] == 'Thesaurus Features'
    assert text_entry[0] == 'Text Features'
    assert thesaurus_entry[1].thesaurus_relation == SKOS.broader

    fit_spy = mocker.spy(predictor.pipeline_, "fit")
    predictor._fit_after_init(train_texts, y=train_labels)
    expected_samples = [
        (c.test_concept_uri_0_0, "concept-0_0"),
        (c.test_concept_uri_100_00, "Concept-100_00"),
        (c.test_concept_uri_10_0, "concept-10_0"),
        (c.test_concept_uri_01_00, "concept-01_00"),
    ]
    expected_targets = [1, 0, 1, 1]
    fit_spy.assert_called_once_with(expected_samples, expected_targets)
Пример #2
0
def mocked_predictor(mocker):
    """Return a predictor whose matching step and pipeline are mocks.

    ``match_and_extend`` returns ``(_concepts_with_text, _doc_counts)`` and
    the pipeline's ``predict_proba`` / ``predict`` return ``_predictions`` /
    ``_classifications``.

    NOTE(review): presumably used as a pytest fixture; no decorator is
    visible in this excerpt -- confirm.
    """
    fake_pipeline = mocker.MagicMock()
    fake_pipeline.predict_proba = mocker.Mock(return_value=_predictions)
    fake_pipeline.predict = mocker.Mock(return_value=_classifications)

    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor.concept_map_ = _concept_map
    predictor.match_and_extend = mocker.Mock(
        return_value=(_concepts_with_text, _doc_counts))
    predictor.pipeline_ = fake_pipeline
    return predictor
Пример #3
0
def test_fit(mocker):
    """Check that ``fit`` delegates to ``_init`` and ``_fit_after_init``.

    Both helpers are replaced by mocks; ``_init`` must be called exactly once
    and ``_fit_after_init`` exactly once with the inputs and ``y`` keyword.
    """
    n_docs = 13
    docs = [list(range(size)) for size in range(n_docs)]
    labels = [idx % 2 for idx in range(n_docs)]

    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor._init = mocker.Mock()
    predictor._fit_after_init = mocker.Mock()

    predictor.fit(docs, labels)

    predictor._init.assert_called_once()
    predictor._fit_after_init.assert_called_once_with(docs, y=labels)
Пример #4
0
def test_set_title_case(case_graph):
    """Check that both casings of the label yield exactly one DFA match.

    The predictor is built with ``handle_title_case=True`` and initialised on
    ``case_graph``; each query must produce a single search result.
    """
    predictor = p.StwfsapyPredictor(
        case_graph,
        c.test_type_concept,
        c.test_type_thesaurus,
        SKOS.broader,
        handle_title_case=True,
    )
    predictor._init()
    for query in ("three word label", "Three Word label"):
        hits = list(predictor.dfa_.search(query))
        assert len(hits) == 1
Пример #5
0
def test_match_and_extend_without_truth(patched_dfa):
    """Check ``match_and_extend`` when no ground-truth labels are given.

    With only texts as input, the second return value is expected to be the
    list ``[1, 3, 2]`` (one entry per input text), alongside the flat list of
    ``(concept, text)`` pairs.
    """
    texts = ["a", "bbb", "xx"]
    expected_pairs = [
        ("9", "a"),
        ("9", "bbb"),
        ("11", "bbb"),
        ("13", "bbb"),
        ("9", "xx"),
        ("11", "xx"),
    ]
    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor.dfa_ = patched_dfa

    concepts, counts = predictor.match_and_extend(texts)

    assert concepts == expected_pairs
    assert counts == [1, 3, 2]
Пример #6
0
def test_english_plural(case_graph):
    """Check that the plural form of the label yields exactly one DFA match.

    The predictor is built with ``simple_english_plural_rules=True`` and
    initialised on ``case_graph``.
    """
    predictor = p.StwfsapyPredictor(
        case_graph,
        c.test_type_concept,
        c.test_type_thesaurus,
        SKOS.broader,
        simple_english_plural_rules=True,
    )
    predictor._init()

    query = 'three word labels'
    first_hits = list(predictor.dfa_.search(query))
    assert len(first_hits) == 1
    # NOTE(review): the identical query is searched a second time; possibly a
    # different variant of the label was intended here -- confirm.
    second_hits = list(predictor.dfa_.search(query))
    assert len(second_hits) == 1
Пример #7
0
def test_match_and_extend_with_truth(patched_dfa):
    """Check ``match_and_extend`` when ground-truth labels are supplied.

    With a list of true concept ids per text, the second return value is
    expected to be a flat 0/1 list with one entry per returned
    ``(concept, text)`` pair.
    """
    texts = ["a", "bbb", "xx"]
    truth = [[], [11, 14], [9]]
    expected_pairs = [
        ("9", "a"),
        ("9", "bbb"),
        ("11", "bbb"),
        ("13", "bbb"),
        ("9", "xx"),
        ("11", "xx"),
    ]
    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor.dfa_ = patched_dfa

    concepts, ys = predictor.match_and_extend(texts, truth)

    assert concepts == expected_pairs
    assert ys == [0, 0, 1, 0, 1, 0]
Пример #8
0
def test_match_and_extend_without_truth_without_vec(patched_dfa):
    """Check ``match_and_extend`` without labels and with ``use_txt_vec=False``.

    Every returned match is compared to the corresponding entry of
    ``expected_matches`` through ``check_fit_arg`` (with ``None`` in place of
    a vectorizer), and the per-text counts must equal ``[1, 3, 2]``.
    """
    predictor = p.StwfsapyPredictor(None, None, None, None, use_txt_vec=False)
    predictor.dfa_ = patched_dfa
    predictor.text_features_ = mk_text_features().fit([])
    in_txts = ["a", "bbb", "xx"]

    matches, counts = predictor.match_and_extend(in_txts)

    # Each text index is repeated len(text) times, i.e. [0, 1, 1, 1, 2, 2],
    # which lines up with the per-text counts asserted below.
    txt_indices = []
    for idx, txt in enumerate(in_txts):
        txt_indices.extend([idx] * len(txt))

    triples = zip(txt_indices, matches, expected_matches)
    for txt_idx, match, expected in triples:
        source_text = in_txts[txt_idx]
        check_fit_arg(
            None, predictor.text_features_, source_text, match, expected)
    assert counts == [1, 3, 2]
Пример #9
0
def test_match_and_extend_with_truth_with_vec(patched_dfa, mock_vectorizer):
    """Check ``match_and_extend`` with labels and with ``use_txt_vec=True``.

    Every returned match is compared to the corresponding entry of
    ``expected_matches`` through ``check_fit_arg`` using the mocked
    vectorizer, and the flat label list must equal ``[0, 0, 1, 0, 1, 0]``.
    """
    predictor = p.StwfsapyPredictor(None, None, None, None, use_txt_vec=True)
    predictor.dfa_ = patched_dfa
    predictor.text_features_ = mk_text_features().fit([])
    predictor.text_vectorizer_ = mock_vectorizer
    in_txts = ["a", "bbb", "xx"]
    truth = [[], [11, 14], [9]]

    matches, ys = predictor.match_and_extend(in_txts, truth)

    # Each text index is repeated len(text) times, i.e. [0, 1, 1, 1, 2, 2].
    txt_indices = []
    for idx, txt in enumerate(in_txts):
        txt_indices.extend([idx] * len(txt))

    triples = zip(txt_indices, matches, expected_matches)
    for txt_idx, match, expected in triples:
        source_text = in_txts[txt_idx]
        check_fit_arg(
            predictor.text_vectorizer_,
            predictor.text_features_,
            source_text,
            match,
            expected,
        )
    assert ys == [0, 0, 1, 0, 1, 0]
Пример #10
0
def test_serialization_inversion_with_vec(tmpdir, full_graph):
    """Check that ``store`` followed by ``load`` round-trips a fitted predictor.

    A predictor with ``use_txt_vec=True`` is fitted, stored to a zip file in
    a temporary directory and loaded again. The loaded instance must agree
    with the original on its configuration attributes, concept map, DFA,
    graph contents, thesaurus feature mapping, text vectorizer vocabulary and
    text feature names.
    """
    predictor = p.StwfsapyPredictor(full_graph,
                                    c.test_type_concept,
                                    c.test_type_thesaurus,
                                    SKOS.broader,
                                    use_txt_vec=True)
    predictor.fit(train_texts, train_labels)
    pth = tmpdir.mkdir("tmp").join("model.zip")
    predictor.store(pth.strpath)
    loaded = p.StwfsapyPredictor.load(pth.strpath)

    assert loaded.use_txt_vec
    # Attributes that must compare equal after the round trip. The attribute
    # name is used as assertion message to pinpoint a mismatch.
    round_trip_attrs = (
        'input',
        'extract_any_case_from_braces',
        'extract_upper_case_from_braces',
        'expand_ampersand_with_spaces',
        'expand_abbreviation_with_punctuation',
        'simple_english_plural_rules',
        'concept_type_uri',
        'sub_thesaurus_type_uri',
        'thesaurus_relation_type_uri',
        'thesaurus_relation_is_specialisation',
        'concept_map_',
        'dfa_',
    )
    for attr in round_trip_attrs:
        assert getattr(loaded, attr) == getattr(predictor, attr), attr

    # Graphs are compared by size plus containment of every loaded triple.
    assert len(loaded.graph) == len(predictor.graph)
    for triple in loaded.graph:
        assert triple in predictor.graph

    assert loaded.pipeline_[0].transformers[0][1].mapping_ == (
        predictor.pipeline_[0].transformers[0][1].mapping_)
    assert loaded.text_vectorizer_ is not None
    assert loaded.text_vectorizer_.vocabulary_ == (
        predictor.text_vectorizer_.vocabulary_)
    loaded_txt_feat_names = [
        name for name, _ in loaded.text_features_.transformer_list
    ]
    pred_txt_feat_names = [
        name for name, _ in predictor.text_features_.transformer_list
    ]
    assert loaded_txt_feat_names == pred_txt_feat_names
Пример #11
0
def test_init_and_fit_with_vec(full_graph, mocker):
    """Exercise ``_init`` and ``_fit_after_init`` with ``use_txt_vec=True``.

    After ``_init`` the test checks that a text vectorizer exists, that
    ``t.extract_deprecated`` and ``handlers.title_case_handler`` were called
    as expected, and that the pipeline holds the five named feature
    transformers plus the classifier. After fitting it inspects the
    arguments recorded for the pipeline's ``fit`` via ``check_fit_arg`` and
    checks that the vectorizer was fitted once on the training texts.
    """
    predictor = p.StwfsapyPredictor(
        full_graph,
        c.test_type_concept,
        c.test_type_thesaurus,
        SKOS.broader,
        use_txt_vec=True,
    )
    # The spies are installed after construction and before ``_init``.
    deprecated_spy = mocker.spy(t, "extract_deprecated")
    title_case_spy = mocker.spy(handlers, 'title_case_handler')

    predictor._init()

    assert predictor.text_vectorizer_ is not None
    deprecated_spy.assert_called_once_with(full_graph)
    handler_calls = title_case_spy.mock_calls
    assert len(handler_calls) == len(c.test_labels)
    for _, label in c.test_labels:
        assert call(label.toPython()) in handler_calls

    steps = predictor.pipeline_.named_steps
    assert isinstance(steps['Classifier'], DecisionTreeClassifier)
    feature_step = steps['Combined Features']
    assert isinstance(feature_step, ColumnTransformer)
    assert len(feature_step.transformers) == 5
    expected_names = [
        'Thesaurus Features',
        'Text Features',
        'Position Features',
        'Frequency Features',
        'Text Vector',
    ]
    for position, expected_name in enumerate(expected_names):
        assert feature_step.transformers[position][0] == expected_name
    assert feature_step.transformers[0][1].thesaurus_relation == SKOS.broader

    pipeline_fit_spy = mocker.spy(predictor.pipeline_, "fit")
    vectorizer_fit_spy = mocker.spy(predictor.text_vectorizer_, "fit")
    predictor._fit_after_init(train_texts, y=train_labels)
    pipeline_fit_spy.assert_called_once()

    # A recorded call is an (args, kwargs) pair.
    recorded_call = pipeline_fit_spy.call_args_list[0]
    assert len(recorded_call) == 2
    positional = recorded_call[0]
    keyword = recorded_call[1]
    assert keyword.get('y') == [1, 0, 1, 1]

    expected_features = [
        (c.test_concept_uri_0_0, [0], 1),
        (c.test_concept_uri_100_00, [4], 1),
        (c.test_concept_uri_10_0, [0, 35], 0),
        (c.test_concept_uri_01_00, [17], 1),
    ]
    fitted_samples = positional[0]
    assert len(expected_features) == len(fitted_samples)
    # Index into ``train_texts`` of the text each fitted sample is checked
    # against.
    txt_indices = [0, 3, 4, 4]
    triples = zip(txt_indices, fitted_samples, expected_features)
    for txt_idx, actual, expected in triples:
        check_fit_arg(
            predictor.text_vectorizer_,
            predictor.text_features_,
            train_texts[txt_idx],
            actual,
            expected,
        )

    vectorizer_fit_spy.assert_called_once_with(train_texts)
Пример #12
0
def test_sparse_matrix_creation():
    """Check shape and per-row content of ``_create_sparse_matrix`` output.

    The result must have one row per document and 23 columns. For each row,
    the number of stored values must equal the document's match count, and
    the column indices and values must equal the document's slice of
    ``_concepts`` (mapped as ``22 - id``) and of the second prediction column,
    both in reversed order.
    """
    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor.concept_map_ = _concept_map
    concept_ids = [entry[0] for entry in _concepts_with_text]
    res = predictor._create_sparse_matrix(
        _predictions[:, 1], concept_ids, _doc_counts)

    assert res.shape[0] == len(_doc_counts)
    assert res.shape[1] == 23

    offset = 0  # start of the current document's slice in the flat lists
    for row_idx, count in enumerate(_doc_counts):
        row = res.getrow(row_idx)
        stop = offset + count
        assert row.getnnz() == count
        # reverse slices because of mapping.
        expected_columns = [22 - cid for cid in _concepts[offset:stop]][::-1]
        assert list(row.nonzero()[1]) == expected_columns
        expected_values = list(_predictions[offset:stop, 1])[::-1]
        assert list(row.data) == expected_values
        offset = stop
Пример #13
0
def test_expansion(full_graph, mocker):
    """Check that the collected expansion function is applied to every label.

    ``stwfsapy.expansion.collect_expansion_functions`` is patched to return a
    single stub regardless of its keyword arguments; after ``_init`` the stub
    must have been called with the text of each label in ``c.test_labels``.
    """
    expansion_stub = mocker.stub(name='expansion_stub')
    mocker.patch(
        'stwfsapy.expansion.collect_expansion_functions',
        lambda **kwargs: [expansion_stub],
    )
    predictor = p.StwfsapyPredictor(
        full_graph,
        c.test_type_concept,
        c.test_type_thesaurus,
        SKOS.broader,
        True,
        extract_upper_case_from_braces=False,
        extract_any_case_from_braces=True,
        expand_ampersand_with_spaces=False,
        expand_abbreviation_with_punctuation=False,
        simple_english_plural_rules=True,
    )

    predictor._init()

    stub_calls = expansion_stub.mock_calls
    for _, label in c.test_labels:
        assert call(label.toPython()) in stub_calls
Пример #14
0
def no_match_predictor(mocker):
    """Return a predictor whose ``match_and_extend`` finds nothing.

    The mocked method returns an empty match list together with the counts
    ``[0, 0, 0]``.

    NOTE(review): presumably used as a pytest fixture; no decorator is
    visible in this excerpt -- confirm.
    """
    empty_result = ([], [0, 0, 0])
    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor.concept_map_ = _concept_map
    predictor.match_and_extend = mocker.Mock(return_value=empty_result)
    return predictor
Пример #15
0
def test_mark_doc_end_empty():
    """Check that ``_mark_last_concept_in_doc`` leaves an empty list empty."""
    no_matches = []
    p.StwfsapyPredictor(
        None, None, None, None)._mark_last_concept_in_doc(no_matches)
    assert no_matches == []
Пример #16
0
def test_mark_doc_end():
    """Check that ``_mark_last_concept_in_doc`` flags only the final match.

    The list is modified in place: the last element of the last tuple changes
    from 0 to 1, while the first tuple stays as it was.
    """
    first = ('a', None, None, [0], 5, 0)
    matches = [first, ('b', None, None, [2, 5], 7, 0)]
    expected = [first, ('b', None, None, [2, 5], 7, 1)]

    predictor = p.StwfsapyPredictor(None, None, None, None)
    predictor._mark_last_concept_in_doc(matches)

    assert matches == expected