Пример #1
0
def test_double_metaphone():
    """Check ``double_metaphone`` and the analysis filter built on top of it.

    The test has three parts:

    1. Every name in the table is encoded with ``double_metaphone`` and
       compared with its expected two-item result.
    2. ``DoubleMetaphoneFilter`` is run at the end of an analyzer chain, both
       with the default settings and with ``combine=True``, and the emitted
       ``(text, boost)`` pairs are compared with the expected lists.
    3. The ``combine=True`` analyzer is attached to a ``TEXT`` field and the
       terms produced by ``process_text`` in query mode are checked.
    """
    from whoosh.lang.dmetaphone import double_metaphone

    # name -> expected result of double_metaphone(name); the second item is
    # None for some names.
    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    # The check must run for EVERY entry. Asserting once after the loop would
    # only verify whichever name happened to be iterated last. The name is
    # used as the assertion message so a failure identifies the entry.
    for name, expected in names.items():
        assert double_metaphone(name) == expected, name

    # Default filter: only the encoded forms are expected in the output.
    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    # combine=True: each original token is expected ahead of its encodings.
    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                       ('F', 1.0), ('FF', 0.5)]

    # The same (combine=True) analyzer used through a TEXT field in query mode.
    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
Пример #2
0
def test_double_metaphone():
    """Exercise ``DoubleMetaphoneFilter`` in an analyzer and in a TEXT field.

    The phrase "Spruce View" is analyzed with the default filter and with
    ``combine=True``; the resulting ``(text, boost)`` pairs are compared with
    the expected lists. The ``combine=True`` analyzer is then used as the
    analyzer of a ``TEXT`` field whose query-mode terms are checked.
    """
    # Default filter settings.
    encoded_only = [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]
    chain = (analysis.RegexTokenizer()
             | analysis.LowercaseFilter()
             | analysis.DoubleMetaphoneFilter())
    pairs = []
    for tok in chain(u("Spruce View")):
        pairs.append((tok.text, tok.boost))
    assert_equal(pairs, encoded_only)

    # combine=True: original tokens are expected alongside the encodings.
    with_originals = [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0),
                      ('F', 1.0), ('FF', 0.5)]
    chain = (analysis.RegexTokenizer()
             | analysis.LowercaseFilter()
             | analysis.DoubleMetaphoneFilter(combine=True))
    pairs = []
    for tok in chain(u("Spruce View")):
        pairs.append((tok.text, tok.boost))
    assert_equal(pairs, with_originals)

    # Query-mode term extraction through a field using the combining chain.
    field = fields.TEXT(analyzer=chain)
    terms = list(field.process_text(u("Spruce View"), mode="query"))
    assert_equal(terms, [u('spruce'), 'SPRS', u('view'), 'F', 'FF'])
Пример #3
0
def test_copyfield():
    """Check how ``CopyFieldPlugin`` rewrites parsed queries.

    Each case builds a fresh ``QueryParser`` whose default field is "a" (with
    no schema), installs one ``CopyFieldPlugin`` configuration, parses a query
    string and compares the textual form of the result with the expected
    string. The final case uses a real schema in which the copy target is
    analyzed with ``DoubleMetaphoneFilter``.
    """
    def parsed(plugin, querystring):
        # Fresh schema-less parser on field "a" with only `plugin` added.
        parser = qparser.QueryParser("a", None)
        parser.add_plugin(plugin)
        return text_type(parser.parse(querystring))

    # Group passed as None.
    assert_equal(
        parsed(plugins.CopyFieldPlugin({"b": "c"}, None), "hello b:matt"),
        "(a:hello AND b:matt AND c:matt)")

    # Explicit group classes.
    assert_equal(
        parsed(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup),
               "hello b:matt"),
        "(a:hello AND (b:matt ANDMAYBE c:matt))")

    assert_equal(
        parsed(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup),
               "hello (there OR b:matt)"),
        "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))")

    # Copying from the parser's default field.
    assert_equal(
        parsed(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup),
               "hello there"),
        "((a:hello OR c:hello) AND (a:there OR c:there))")

    # mirror=True cases.
    assert_equal(
        parsed(plugins.CopyFieldPlugin({"b": "c"}, mirror=True),
               "hello c:matt"),
        "(a:hello AND (c:matt OR b:matt))")

    assert_equal(
        parsed(plugins.CopyFieldPlugin({"c": "a"}, mirror=True),
               "hello c:matt"),
        "((a:hello OR c:hello) AND (c:matt OR a:matt))")

    # Schema-backed case: "name" is copied into "name_phone", whose analyzer
    # ends with DoubleMetaphoneFilter and whose multitoken_query is "or".
    phonetic = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
    freq_format = formats.Frequency()
    phone_field = fields.FieldType(freq_format, phonetic,
                                   multitoken_query="or")
    schema = fields.Schema(name=fields.KEYWORD, name_phone=phone_field)
    parser = qparser.QueryParser("name", schema)
    parser.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
    assert_equal(
        text_type(parser.parse(u("spruce view"))),
        "((name:spruce OR name_phone:SPRS) AND (name:view OR name_phone:F OR name_phone:FF))")
Пример #4
0
def test_shared_composition():
    """Check that one analyzer prefix can be extended into two analyzers.

    A tokenizer-plus-lowercase chain is built once and then piped into two
    different filters; each resulting analyzer is run on "hello" and its
    token texts are compared with the expected list.
    """
    base = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter()

    # Two analyzers derived from the same `base` object.
    ngram_ana = base | analysis.NgramFilter(3)
    phonetic_ana = base | analysis.DoubleMetaphoneFilter()

    ngram_texts = [tok.text for tok in ngram_ana(u("hello"))]
    assert_equal(ngram_texts, ["hel", "ell", "llo"])

    phonetic_texts = [tok.text for tok in phonetic_ana(u("hello"))]
    assert_equal(phonetic_texts, ["HL"])
Пример #5
0
def test_name_field():
    """Check that a double-metaphone name field matches a misspelled query.

    A ``TEXT`` field is built with an analyzer ending in
    ``DoubleMetaphoneFilter(combine=True)`` and ``multitoken_query="or"``.
    One document named "Leif Ericson" is indexed in a ``RamStorage`` index,
    then the differently spelled query "leaf eriksen" is parsed (with
    ``normalize=False``) and is expected to return exactly one hit.
    """
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.LowercaseFilter()
           | analysis.DoubleMetaphoneFilter(combine=True))
    namefield = fields.TEXT(analyzer=ana, multitoken_query="or")
    schema = fields.Schema(id=fields.STORED, name=namefield)

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("one"), name=u("Leif Ericson"))
    w.commit()

    # Use the searcher as a context manager so it is closed even when the
    # assertion fails, instead of being left open.
    with ix.searcher() as s:
        qp = qparser.QueryParser("name", schema)
        q = qp.parse(u("leaf eriksen"), normalize=False)
        r = s.search(q)
        assert_equal(len(r), 1)