def test_double_metaphone():
    """Check ``double_metaphone`` codes and the ``DoubleMetaphoneFilter``.

    The first part compares the function's return value for a table of
    names against the expected 2-tuples.  The second part runs the filter
    over "Spruce View", both on its own and with ``combine=True``, and
    finally through a TEXT field in query mode.
    """
    from whoosh.lang.dmetaphone import double_metaphone

    # Input name -> expected return value of double_metaphone(name).
    names = {
        'maurice': ('MRS', None),
        'aubrey': ('APR', None),
        'cambrillo': ('KMPRL', 'KMPR'),
        'heidi': ('HT', None),
        'katherine': ('K0RN', 'KTRN'),
        'Thumbail': ('0MPL', 'TMPL'),
        'catherine': ('K0RN', 'KTRN'),
        'richard': ('RXRT', 'RKRT'),
        'bob': ('PP', None),
        'eric': ('ARK', None),
        'geoff': ('JF', 'KF'),
        'Through': ('0R', 'TR'),
        'Schwein': ('XN', 'XFN'),
        'dave': ('TF', None),
        'ray': ('R', None),
        'steven': ('STFN', None),
        'bryce': ('PRS', None),
        'randy': ('RNT', None),
        'bryan': ('PRN', None),
        'Rapelje': ('RPL', None),
        'brian': ('PRN', None),
        'otto': ('AT', None),
        'auto': ('AT', None),
        'Dallas': ('TLS', None),
        'maisey': ('MS', None),
        'zhang': ('JNK', None),
        'Chile': ('XL', None),
        'Jose': ('HS', None),
        'Arnow': ('ARN', 'ARNF'),
        'solilijs': ('SLLS', None),
        'Parachute': ('PRKT', None),
        'Nowhere': ('NR', None),
        'Tux': ('TKS', None),
    }

    # Assert inside the loop so every entry is verified, not just the last
    # one visited; the name is the failure message to pinpoint the culprit.
    for name, expected in names.items():
        assert double_metaphone(name) == expected, name

    # Filter alone: only the phonetic codes are expected, and the second
    # code for "view" is expected with a boost of 0.5.
    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    # combine=True: each lowercased word is expected ahead of its codes.
    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    # The same analyzer attached to a TEXT field, processed in query mode.
    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
def test_double_metaphone():
    """Run ``DoubleMetaphoneFilter`` over "Spruce View" in three setups.

    Covers the filter on its own, the filter with ``combine=True``, and
    the combined analyzer attached to a TEXT field in query mode.
    """
    tokenizer = analysis.RegexTokenizer()
    lowercase = analysis.LowercaseFilter()

    # Phonetic codes only.
    codes_only = tokenizer | lowercase | analysis.DoubleMetaphoneFilter()
    pairs = [(tok.text, tok.boost) for tok in codes_only(u("Spruce View"))]
    assert_equal(pairs, [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)])

    # Original words interleaved with their codes.
    tokenizer = analysis.RegexTokenizer()
    lowercase = analysis.LowercaseFilter()
    combined = (tokenizer | lowercase
                | analysis.DoubleMetaphoneFilter(combine=True))
    pairs = [(tok.text, tok.boost) for tok in combined(u("Spruce View"))]
    expected_pairs = [
        ('spruce', 1.0),
        ('SPRS', 1.0),
        ('view', 1.0),
        ('F', 1.0),
        ('FF', 0.5),
    ]
    assert_equal(pairs, expected_pairs)

    # Same analyzer, driven through a field's query-mode text processing.
    field = fields.TEXT(analyzer=combined)
    processed = list(field.process_text(u("Spruce View"), mode="query"))
    assert_equal(processed, [u('spruce'), 'SPRS', u('view'), 'F', 'FF'])
def test_copyfield():
    """Check the query text produced when ``CopyFieldPlugin`` is installed.

    Each case builds a fresh parser, installs one plugin configuration,
    parses a query string and compares the stringified result.
    """
    # (plugin, query string, expected stringified parse) for a schema-less
    # parser whose default field is "a".
    cases = [
        (plugins.CopyFieldPlugin({"b": "c"}, None),
         "hello b:matt",
         "(a:hello AND b:matt AND c:matt)"),
        (plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup),
         "hello b:matt",
         "(a:hello AND (b:matt ANDMAYBE c:matt))"),
        (plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup),
         "hello (there OR b:matt)",
         "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))"),
        (plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup),
         "hello there",
         "((a:hello OR c:hello) AND (a:there OR c:there))"),
        (plugins.CopyFieldPlugin({"b": "c"}, mirror=True),
         "hello c:matt",
         "(a:hello AND (c:matt OR b:matt))"),
        (plugins.CopyFieldPlugin({"c": "a"}, mirror=True),
         "hello c:matt",
         "((a:hello OR c:hello) AND (c:matt OR a:matt))"),
    ]
    for plugin, query_string, expected in cases:
        parser = qparser.QueryParser("a", None)
        parser.add_plugin(plugin)
        assert_equal(text_type(parser.parse(query_string)), expected)

    # Schema-backed case: terms in "name" are copied into a field whose
    # analyzer emits double-metaphone codes.
    phonetic = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
    freq_format = formats.Frequency()
    phone_field = fields.FieldType(freq_format, phonetic,
                                   multitoken_query="or")
    schema = fields.Schema(name=fields.KEYWORD, name_phone=phone_field)

    parser = qparser.QueryParser("name", schema)
    parser.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
    parsed = parser.parse(u("spruce view"))
    assert_equal(
        text_type(parsed),
        "((name:spruce OR name_phone:SPRS) AND (name:view OR name_phone:F OR name_phone:FF))")
def test_shared_composition():
    """Extend one shared analyzer prefix in two different ways.

    Both derived chains are run over "hello" and their token texts are
    compared against the expected n-grams and phonetic code.
    """
    def token_texts(analyzer):
        # Collect just the text of every token the analyzer yields.
        return [token.text for token in analyzer(u("hello"))]

    common = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()
    ngram_chain = common | analysis.NgramFilter(3)
    phonetic_chain = common | analysis.DoubleMetaphoneFilter()

    assert_equal(token_texts(ngram_chain), ["hel", "ell", "llo"])
    assert_equal(token_texts(phonetic_chain), ["HL"])
def test_name_field():
    """Search a double-metaphone name field with a misspelled query.

    Indexes one document named "Leif Ericson" in an in-memory index whose
    ``name`` field uses ``DoubleMetaphoneFilter(combine=True)``, then
    expects the query "leaf eriksen" to return exactly one hit.
    """
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.LowercaseFilter()
           | analysis.DoubleMetaphoneFilter(combine=True))
    namefield = fields.TEXT(analyzer=ana, multitoken_query="or")
    schema = fields.Schema(id=fields.STORED, name=namefield)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("one"), name=u("Leif Ericson"))
    w.commit()

    qp = qparser.QueryParser("name", schema)
    q = qp.parse(u("leaf eriksen"), normalize=False)

    # Use the searcher as a context manager so it is closed even when the
    # assertion fails, instead of being left open.
    with ix.searcher() as s:
        r = s.search(q)
        assert_equal(len(r), 1)