def test_extract_phrases():
    phrases = [("machine",), ("machine", "learning"), ("algorithm",)]
    phrase_map = tokenization._build_phrase_map(phrases)
    sentence = "the new machine learning algorithm".split()

    # "keep" leaves out-of-vocabulary tokens unchanged
    all_phrases = tokenization._extract_phrases(phrase_map, sentence, "keep")
    assert all_phrases == [
        ("the",),
        ("new",),
        ("machine", "learning"),
        ("algorithm",),
    ]

    # out-of-vocabulary tokens are replaced by the placeholder token
    all_phrases = tokenization._extract_phrases(
        phrase_map, sentence, tokenization.OUT_OF_VOC_TOKEN
    )
    assert all_phrases == [
        (tokenization.OUT_OF_VOC_TOKEN,),
        (tokenization.OUT_OF_VOC_TOKEN,),
        ("machine", "learning"),
        ("algorithm",),
    ]

    # "[]" and "{}" wrap out-of-vocabulary tokens in the given delimiters
    all_phrases = tokenization._extract_phrases(phrase_map, sentence, "[]")
    assert all_phrases == [
        ("[the]",),
        ("[new]",),
        ("machine", "learning"),
        ("algorithm",),
    ]

    all_phrases = tokenization._extract_phrases(phrase_map, sentence, "{}")
    assert all_phrases == [
        ("{the}",),
        ("{new}",),
        ("machine", "learning"),
        ("algorithm",),
    ]
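
# A minimal sketch of the greedy phrase extraction the test above exercises
# (not the actual implementation in `tokenization`): at each position it
# follows the phrase-map trie as far as possible and keeps the longest match
# whose end is marked by the "" key. Unmatched tokens are handled according
# to `out_of_voc`: "keep" leaves them as-is, a two-character string such as
# "[]" is treated here as a pair of wrapping delimiters (an assumption
# inferred from the assertions), and any other string replaces the token.
def _extract_phrases_sketch(phrase_map, sentence, out_of_voc):
    result, i = [], 0
    while i < len(sentence):
        node, best_end = phrase_map, None
        for j in range(i, len(sentence)):
            if sentence[j] not in node:
                break
            node = node[sentence[j]]
            if "" in node:
                best_end = j + 1  # longest complete phrase seen so far
        if best_end is not None:
            result.append(tuple(sentence[i:best_end]))
            i = best_end
        else:
            token = sentence[i]
            if out_of_voc == "keep":
                result.append((token,))
            elif len(out_of_voc) == 2:
                result.append((out_of_voc[0] + token + out_of_voc[1],))
            else:
                result.append((out_of_voc,))
            i += 1
    return result
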
def test_build_phrase_map():
    phrases = [
        ("machine", "learning"),
        ("default", "mode", "network"),
        ("resting", "state"),
        ("learning",),
        ("network",),
        ("brain",),
        ("machine",),
        ("speech", "perception"),
        ("speech", "production"),
        ("speech",),
    ]
    phrase_map = tokenization._build_phrase_map(phrases)
    assert phrase_map == {
        "brain": {"": {}},
        "default": {"mode": {"network": {"": {}}}},
        "learning": {"": {}},
        "machine": {"": {}, "learning": {"": {}}},
        "network": {"": {}},
        "resting": {"state": {"": {}}},
        "speech": {"": {}, "perception": {"": {}}, "production": {"": {}}},
    }
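
# A minimal sketch of the nested-dict trie the assertion above spells out
# (again, not the actual `tokenization._build_phrase_map`): each phrase is
# inserted word by word, and an empty-string key marks every position where
# a complete phrase ends, which is how ("machine",) and ("machine", "learning")
# can coexist under the same root key.
def _build_phrase_map_sketch(phrases):
    phrase_map = {}
    for phrase in phrases:
        node = phrase_map
        for word in phrase:
            node = node.setdefault(word, {})
        node[""] = {}  # terminal marker for a complete phrase
    return phrase_map
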