def test_tee_filter():
    target = u("Alfa Bravo Charlie")
    f1 = analysis.LowercaseFilter()
    f2 = analysis.ReverseTextFilter()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa aflA bravo ovarB charlie eilrahC")

    class ucfilter(analysis.Filter):
        def __call__(self, tokens):
            for t in tokens:
                t.text = t.text.upper()
                yield t

    f2 = analysis.ReverseTextFilter() | ucfilter()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2)
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa AFLA bravo OVARB charlie EILRAHC")

    f1 = analysis.PassFilter()
    f2 = analysis.BiWordFilter()
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.TeeFilter(f1, f2)
           | analysis.LowercaseFilter())
    result = " ".join([t.text for t in ana(target)])
    assert_equal(result, "alfa alfa-bravo bravo bravo-charlie charlie")
def test_substitution():
    mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "")
    assert_equal([t.text for t in mf(u("one-two th-re-ee four"))],
                 ["onetwo", "threee", "four"])

    mf = (analysis.RegexTokenizer(r"\S+")
          | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1"))
    assert_equal([t.text for t in mf(u("a=b c=d ef"))], ["b=a", "d=c", "ef"])
def test_regextokenizer():
    value = u("AAAaaaBBBbbbCCCcccDDDddd")

    rex = analysis.RegexTokenizer("[A-Z]+")
    assert_equal([t.text for t in rex(value)], ["AAA", "BBB", "CCC", "DDD"])

    rex = analysis.RegexTokenizer("[A-Z]+", gaps=True)
    assert_equal([t.text for t in rex(value)], ["aaa", "bbb", "ccc", "ddd"])
def test_stop_lang():
    stopper = analysis.RegexTokenizer() | analysis.StopFilter()
    ls = [token.text for token in stopper(u("this is a test"))]
    assert ls == [u("test")]

    es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es")
    ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))]
    assert ls == ["lapiz", "mesa"]
def test_double_metaphone():
    from whoosh.lang.dmetaphone import double_metaphone

    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    dmn = name = None
    for name in names.keys():
        dmn = double_metaphone(name)
    assert dmn == names[name]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create the analyzers used for tokenizing and normalizing tokens
    # 000, 001, 010, 011
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()),
                    (analysis.StemmingAnalyzer())]

    # Create one schema per analyzer
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Set up one index per analyzer
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear the indexes
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file) as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                for writer in writers:
                    writer.add_document(url=url, body=body)
                # print("Added", url)
                loaded += 1

    for writer in writers:
        writer.commit()

    print("\n\nLoaded", loaded, "documents")
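# A minimal sketch, not part of the original code, of how the four indexes
# built by populate_whoosh above could be compared against each other. It
# assumes the whoosh_dir + str(i) directory layout created there; the function
# name and query string are arbitrary.
def compare_analyzers(whoosh_dir, query_text="example"):
    from whoosh import index
    from whoosh.qparser import QueryParser

    for i in range(4):
        ix = index.open_dir(whoosh_dir + str(i) + '/')
        with ix.searcher() as searcher:
            query = QueryParser("body", schema=ix.schema).parse(query_text)
            results = searcher.search(query, limit=None)
            # Each analyzer variant normalizes tokens differently, so the
            # same query can match a different number of documents.
            print("analyzer", i, "->", len(results), "hits")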
def test_double_metaphone():
    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert_equal(results, [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)])

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert_equal(results, [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                           ('F', 1.0), ('FF', 0.5)])

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert_equal(texts, [u('spruce'), 'SPRS', u('view'), 'F', 'FF'])
def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzer used for tokenizing and normalizing tokens
    my_analyzer = (analysis.RegexTokenizer()
                   | analysis.LowercaseFilter()
                   | analysis.StopFilter())

    # Create schema
    schema = Schema(url=ID(stored=True),
                    body=TEXT(stored=True, analyzer=my_analyzer))

    # Set up index
    os.makedirs(whoosh_dir, exist_ok=True)
    ix = index.create_in(whoosh_dir, schema)

    # Clear index
    writer = ix.writer()
    writer.commit(mergetype=writing.CLEAR)

    # Index documents
    writer = ix.writer()
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            with open(text_file, encoding="utf8") as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                writer.add_document(url=url, body=body)
                print("Added", url)
                loaded += 1

    writer.commit()
    print("\n\nLoaded", loaded, "documents")
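# A minimal follow-up sketch, assumed rather than taken from the original,
# showing how the single index written by populate_whoosh above could be
# queried; the function name and query string are arbitrary.
def search_whoosh(whoosh_dir, query_text):
    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir(whoosh_dir)
    with ix.searcher() as searcher:
        query = QueryParser("body", schema=ix.schema).parse(query_text)
        for hit in searcher.search(query, limit=10):
            # "url" and "body" are stored in the schema above, so stored
            # values and highlighted snippets are available on each hit.
            print(hit["url"])
            print(hit.highlights("body"))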
def test_word_segments():
    wordset = set(u("alfa bravo charlie delta").split())

    cwf = analysis.CompoundWordFilter(wordset, keep_compound=True)
    ana = analysis.RegexTokenizer(r"\S+") | cwf
    target = u("alfacharlie bravodelta delto bravo subalfa")
    tokens = [t.text for t in ana(target)]
    assert tokens == ["alfacharlie", "alfa", "charlie", "bravodelta", "bravo",
                      "delta", "delto", "bravo", "subalfa"]

    cwf = analysis.CompoundWordFilter(wordset, keep_compound=False)
    ana = analysis.RegexTokenizer(r"\S+") | cwf
    target = u("alfacharlie bravodelta delto bravo subalfa")
    tokens = [t.text for t in ana(target)]
    assert tokens == ["alfa", "charlie", "bravo", "delta", "delto", "bravo",
                      "subalfa"]
def test_fractional_weights():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()

    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])

    # Try again with Frequency format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])
def test_composing_functions():
    def filter(tokens):
        for t in tokens:
            t.text = t.text.upper()
            yield t

    analyzer = analysis.RegexTokenizer() | filter
    assert_equal([t.text for t in analyzer(u("abc def"))], ["ABC", "DEF"])
def test_biword_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.BiWordFilter())

    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-and", "and-then", "then-some"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.BiWordFilter())

    texts = [t.text for t in ana(u("stuff and then some"))]
    assert texts == ["stuff-then", "then-some"]
def test_shared_composition():
    shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()

    ana1 = shared | analysis.NgramFilter(3)
    ana2 = shared | analysis.DoubleMetaphoneFilter()

    assert_equal([t.text for t in ana1(u("hello"))], ["hel", "ell", "llo"])
    assert_equal([t.text for t in ana2(u("hello"))], ["HL"])
def test_multifilter():
    f1 = analysis.LowercaseFilter()
    f2 = analysis.PassFilter()
    mf = analysis.MultiFilter(a=f1, b=f2)
    ana = analysis.RegexTokenizer(r"\S+") | mf
    text = u("ALFA BRAVO CHARLIE")
    assert [t.text for t in ana(text, mode="a")] == ["alfa", "bravo", "charlie"]
    assert [t.text for t in ana(text, mode="b")] == ["ALFA", "BRAVO", "CHARLIE"]
def test_composition2():
    ca = analysis.RegexTokenizer() | analysis.LowercaseFilter()
    sa = ca | analysis.StopFilter()
    assert_equal(len(sa), 3)
    assert_equal(sa.__class__.__name__, "CompositeAnalyzer")
    assert_equal(sa[0].__class__.__name__, "RegexTokenizer")
    assert_equal(sa[1].__class__.__name__, "LowercaseFilter")
    assert_equal(sa[2].__class__.__name__, "StopFilter")
    assert_equal([t.text for t in sa(u("The ABC 123"))], ["abc", "123"])
def test_posboost_postings():
    pbs = PositionBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    assert _roundtrip(content, pbs, "position_boosts", ana) == \
        [("alfa", [(0, 2), (4, 1), (5, 1)]),
         ("bravo", [(1, 0.1), (3, 0.5)]),
         ("charlie", [(2, 2)])]
    assert _roundtrip(content, pbs, "positions", ana) == \
        [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, pbs, "frequency", ana) == \
        [("alfa", 3), ("bravo", 2), ("charlie", 1)]
def test_intraword_possessive():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("O'Malley's-Bar")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14),
                          ("omalleybar", 0, 14)])
def test_composing_functions():
    tokenizer = analysis.RegexTokenizer()

    def filter(tokens):
        for t in tokens:
            t.text = t.text.upper()
            yield t

    with pytest.raises(TypeError):
        tokenizer | filter
def test_frowny_face():
    # See https://bitbucket.org/mchaput/whoosh/issue/166/
    ana = analysis.RegexTokenizer(r"\S+") | analysis.IntraWordFilter()

    # text is all delimiters
    tokens = [t.text for t in ana(u(":-("))]
    assert_equal(tokens, [])

    # text has consecutive delimiters
    tokens = [t.text for t in ana(u("LOL:)"))]
    assert_equal(tokens, ["LOL"])
def test_shingles():
    ana = analysis.RegexTokenizer(r"\w+") | analysis.ShingleFilter(3, " ")
    source = u("better a witty fool than a foolish wit")
    results = [t.copy() for t in ana(source, positions=True, chars=True)]

    assert [t.text for t in results] == [u('better a witty'), u('a witty fool'),
                                         u('witty fool than'), u('fool than a'),
                                         u('than a foolish'), u('a foolish wit')]
    assert [t.pos for t in results] == list(range(len(results)))
    for t in results:
        assert t.text == source[t.startchar:t.endchar]
def test_start_pos():
    from whoosh import formats

    ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    kw = {"positions": True}
    tks = formats.tokens(u("alfa bravo charlie delta"), ana, kw)
    assert_equal([t.pos for t in tks], [0, 1, 2, 3])

    kw["start_pos"] = 3
    ts = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)]
    assert_equal(" ".join([t.text for t in ts]), "A B C D")
    assert_equal([t.pos for t in ts], [3, 4, 5, 6])
def test_biword():
    ana = analysis.RegexTokenizer(r"\w+") | analysis.BiWordFilter()

    result = [t.copy() for t
              in ana(u("the sign of four"), chars=True, positions=True)]
    assert ["the-sign", "sign-of", "of-four"] == [t.text for t in result]
    assert [(0, 8), (4, 11), (9, 16)] == [(t.startchar, t.endchar)
                                          for t in result]
    assert [0, 1, 2] == [t.pos for t in result]

    result = [t.copy() for t in ana(u("single"))]
    assert len(result) == 1
    assert result[0].text == "single"
def test_shingle_stopwords():
    # Note that the stop list is None here
    ana = (analysis.RegexTokenizer()
           | analysis.StopFilter(stoplist=None, minsize=3)
           | analysis.ShingleFilter(size=3))

    texts = [t.text for t
             in ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then",
                     "and-then-some", "then-some-things", "some-things-Check"]

    # Use a stop list here
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter()
           | analysis.ShingleFilter(size=3))

    texts = [t.text for t
             in ana(u("some other stuff and then some things To Check "))]
    assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some",
                     "then-some-things", "some-things-check"]
def test_intraword():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf

    def check(text, ls):
        assert_equal([(t.pos, t.text) for t in ana(text)], ls)

    check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")])
    check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")])
    check(u("Super-Duper-XL500-42-AutoCoder!"),
          [(0, "Super"), (1, "Duper"), (2, "XL"), (2, "SuperDuperXL"),
           (3, "500"), (4, "42"), (4, "50042"), (5, "Auto"), (6, "Coder"),
           (6, "AutoCoder")])
def __init__(self, toolbox, index_help=True):
    """
    Create a searcher for `toolbox`.
    """
    self.schema = Schema(id=STORED,
                         stub=KEYWORD,
                         name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                         description=TEXT,
                         section=TEXT,
                         help=TEXT,
                         labels=KEYWORD)
    self.rex = analysis.RegexTokenizer()
    self.toolbox = toolbox
    self.build_index(index_help)
def __init__(self, toolbox, panel_view_id: str, index_dir: str, index_help: bool = True):
    self.schema = Schema(id=ID(stored=True, unique=True),
                         old_id=ID,
                         stub=KEYWORD,
                         name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                         description=TEXT,
                         section=TEXT,
                         help=TEXT,
                         labels=KEYWORD)
    self.rex = analysis.RegexTokenizer()
    self.index_dir = index_dir
    self.toolbox = toolbox
    self.panel_view_id = panel_view_id
    self.index = self._index_setup()
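# A minimal sketch of what the _index_setup helper referenced above might do;
# this is an assumption, not the actual implementation. It would live on the
# same class and reuse self.index_dir and self.schema: open the on-disk Whoosh
# index if one exists, otherwise create a fresh one.
def _index_setup(self):
    import os
    from whoosh import index

    os.makedirs(self.index_dir, exist_ok=True)
    if index.exists_in(self.index_dir):
        # Reuse the existing index so the toolbox is not reindexed on startup.
        return index.open_dir(self.index_dir)
    return index.create_in(self.index_dir, self.schema)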
def test_multifilter():
    iwf_for_index = analysis.IntraWordFilter(mergewords=True, mergenums=False)
    iwf_for_query = analysis.IntraWordFilter(mergewords=False, mergenums=False)
    mf = analysis.MultiFilter(index=iwf_for_index, query=iwf_for_query)
    ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("Our BabbleTron5000 is great"))
    w.commit()

    with ix.searcher() as s:
        hit = s.search(query.Term("text", "5000"))[0]
        assert_equal(hit.highlights("text"),
                     'Our BabbleTron<b class="match term0">5000</b> is great')
def test_charboost_postings():
    cbs = CharacterBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    assert _roundtrip(content, cbs, "character_boosts", ana) == \
        [("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
         ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
         ("charlie", [(2, 17, 24, 2)])]
    assert _roundtrip(content, cbs, "position_boosts", ana) == \
        [("alfa", [(0, 2), (4, 1), (5, 1)]),
         ("bravo", [(1, 0.1), (3, 0.5)]),
         ("charlie", [(2, 2)])]
    assert _roundtrip(content, cbs, "characters", ana) == \
        [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
         ("bravo", [(1, 7, 12), (3, 27, 32)]),
         ("charlie", [(2, 17, 24)])]
    assert _roundtrip(content, cbs, "positions", ana) == \
        [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, cbs, "frequency", ana) == \
        [("alfa", 3), ("bravo", 2), ("charlie", 1)]
def test_intraword_chars():
    iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True)
    ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter()

    target = u("WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6),
                          ("rd", 7, 9), ("wikiword", 0, 9)])

    target = u("Zo WiKiWo-rd")
    tokens = [(t.text, t.startchar, t.endchar)
              for t in ana(target, chars=True)]
    assert_equal(tokens, [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7),
                          ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)])
def __init__(self, toolbox, index_help=True):
    self.schema = Schema(id=STORED,
                         stub=KEYWORD,
                         name=TEXT(analyzer=analysis.SimpleAnalyzer()),
                         description=TEXT,
                         section=TEXT,
                         help=TEXT,
                         labels=KEYWORD)
    self.rex = analysis.RegexTokenizer()
    self.toolbox = toolbox
    self.storage, self.index = self._index_setup()
    # We keep track of how many times the tool index has been rebuilt.
    # We start at -1, so that after the first index the count is at 0,
    # which is the same as the toolbox reload count. This way we can skip
    # reindexing if the index count is equal to the toolbox reload count.
    self.index_count = -1