예제 #1
0
def test_extract_phrases():
    """Check ``_extract_phrases`` for each out-of-vocabulary argument.

    The sentence contains two words absent from the phrase list ("the",
    "new") followed by words that are present. For every value of the
    third argument the known part must come out as the two-word phrase
    ("machine", "learning") followed by ("algorithm",); only the
    rendering of the two unknown words changes:

    * ``"keep"``: unknown words are returned unchanged;
    * ``OUT_OF_VOC_TOKEN``: each unknown word is replaced by that token;
    * ``"[]"`` / ``"{}"``: each unknown word is wrapped in the brackets.
    """
    vocabulary = [("machine", ), ("machine", "learning"), ("algorithm", )]
    phrase_map = tokenization._build_phrase_map(vocabulary)
    tokens = "the new machine learning algorithm".split()

    # Identical in every scenario: the in-vocabulary end of the sentence.
    known_tail = [("machine", "learning"), ("algorithm", )]
    unknown = tokenization.OUT_OF_VOC_TOKEN

    # (third argument, expected rendering of the two unknown words)
    scenarios = [
        ("keep", [("the", ), ("new", )]),
        (unknown, [(unknown, ), (unknown, )]),
        ("[]", [("[the]", ), ("[new]", )]),
        ("{}", [("{the}", ), ("{new}", )]),
    ]
    for out_of_voc, expected_head in scenarios:
        extracted = tokenization._extract_phrases(
            phrase_map, tokens, out_of_voc)
        assert extracted == expected_head + known_tail
예제 #2
0
def test_build_phrase_map():
    """Check the nested mapping produced by ``_build_phrase_map``.

    The expected value is a dict of dicts keyed by the successive words
    of each phrase. In it, an empty-string key (mapping to an empty
    dict) appears exactly at the nodes where one of the input phrases
    ends, so "machine" carries both the "" entry and a "learning" child,
    while "resting" only carries a "state" child.
    """

    def end_of_phrase():
        # Fresh node holding only the empty-string entry.
        return {"": {}}

    word_tuples = [
        ("machine", "learning"),
        ("default", "mode", "network"),
        ("resting", "state"),
        ("learning", ),
        ("network", ),
        ("brain", ),
        ("machine", ),
        ("speech", "perception"),
        ("speech", "production"),
        ("speech", ),
    ]
    built = tokenization._build_phrase_map(word_tuples)

    expected = {
        "brain": end_of_phrase(),
        "default": {"mode": {"network": end_of_phrase()}},
        "learning": end_of_phrase(),
        "machine": {"": {}, "learning": end_of_phrase()},
        "network": end_of_phrase(),
        "resting": {"state": end_of_phrase()},
        "speech": {
            "": {},
            "perception": end_of_phrase(),
            "production": end_of_phrase(),
        },
    }
    assert built == expected