Example #1
# Assumed imports (these snippets come from Whoosh's test suite):
from whoosh import analysis, fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage
from nose.tools import assert_equal


def test_fractional_weights():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
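    # DelimitedAttributeFilter parses a trailing "^<number>" on each token and
    # stores it as the token's boost; since every term below occurs once, the
    # weight read back from the postings should equal that term's boost.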

    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])

    # Try again with Frequency format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
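    # phrase=False makes the TEXT field store a Frequency format (no positions).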
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert_equal(wts, [0.5, 1.5, 2.0, 1.5])
Example #2
# Assumed imports (these tests come from Whoosh's postings test module;
# _roundtrip is a helper defined in that module, not part of the public API):
from whoosh import analysis
from whoosh.compat import u
from whoosh.formats import PositionBoosts


def test_posboost_postings():
    pbs = PositionBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")
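    # _roundtrip indexes `content` with the given format and analyzer, then reads
    # the postings back decoded as the named value. With PositionBoosts,
    # "position_boosts" yields (position, boost) pairs per occurrence, while
    # "positions" and "frequency" are coarser views of the same data.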
    assert _roundtrip(content, pbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]),
                                                                ("bravo", [(1, 0.1), (3, 0.5)]),
                                                                ("charlie", [(2, 2)])]
    assert _roundtrip(content, pbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, pbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)]
Example #3
# Assumed imports (same test module as above; _roundtrip is its local helper):
from whoosh import analysis
from whoosh.compat import u
from whoosh.formats import CharacterBoosts


def test_charboost_postings():
    cbs = CharacterBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")
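    # With CharacterBoosts, "character_boosts" decodes to
    # (position, startchar, endchar, boost) tuples per occurrence; the other
    # decodings keep progressively less: (position, boost), then
    # (position, startchar, endchar), then bare positions, then a frequency count.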
    assert _roundtrip(content, cbs, "character_boosts", ana) == [("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
                                                                 ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
                                                                 ("charlie", [(2, 17, 24, 2)])]
    assert _roundtrip(content, cbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]),
                                                                ("bravo", [(1, 0.1), (3, 0.5)]),
                                                                ("charlie", [(2, 2)])]
    assert _roundtrip(content, cbs, "characters", ana) == [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
                                                           ("bravo", [(1, 7, 12), (3, 27, 32)]),
                                                           ("charlie", [(2, 17, 24)])]
    assert _roundtrip(content, cbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]
    assert _roundtrip(content, cbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)]
Example #4
from whoosh import analysis  # assumed imports, as in Example #1
from whoosh.compat import u
from nose.tools import assert_equal

def test_delimited_attribute():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
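    # Tokens without a "^<number>" suffix keep the filter's default boost of 1.0.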
    results = [(t.text, t.boost) for t in ana(u("image render^2 file^0.5"))]
    assert_equal(results, [("image", 1.0), ("render", 2.0), ("file", 0.5)])