示例#1
0
def test_character_postings():
    """Round-trip a small document through the Characters posting format and
    verify the decoded character offsets, positions, and term frequencies."""
    content = u("alfa bravo charlie bravo alfa alfa")

    # Expected (term, [(pos, startchar, endchar), ...]) pairs, ordered by term.
    expected_chars = [
        ("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]),
        ("bravo", [(1, 5, 10), (3, 19, 24)]),
        ("charlie", [(2, 11, 18)]),
    ]
    assert _roundtrip(content, Characters(), "characters") == expected_chars

    # Decoding as "positions" keeps only the first element of each tuple.
    expected_positions = [
        ("alfa", [0, 4, 5]),
        ("bravo", [1, 3]),
        ("charlie", [2]),
    ]
    assert _roundtrip(content, Characters(), "positions") == expected_positions

    # Decoding as "frequency" reduces each term to its occurrence count.
    expected_freqs = [("alfa", 3), ("bravo", 2), ("charlie", 1)]
    assert _roundtrip(content, Characters(), "frequency") == expected_freqs
示例#2
0
def test_character_postings():
    """Build random (docnum, [(pos, startchar, endchar), ...]) postings and
    check they survive a round-trip through the Characters format, including
    the reduced "positions" and "frequency" decodings."""

    def random_triples():
        # Produce an increasing sequence of (pos, startchar, endchar) tuples
        # with non-overlapping character ranges.
        triples = []
        position = 0
        prev_end = 0
        for __ in xrange(0, randint(1, 10)):
            position += randint(1, 10)
            start = prev_end + randint(3, 10)
            prev_end = start + randint(3, 10)
            triples.append((position, start, prev_end))
        return triples

    postings = []
    docnum = 0
    for _ in xrange(0, 20):
        # Strictly increasing document numbers, as required by posting lists.
        docnum += randint(1, 10)
        postings.append((docnum, random_triples()))

    assert_equal(postings, roundtrip(postings, Characters(), "characters"))

    # Dropping the character offsets should yield plain position postings.
    as_posns = [(docnum, [pos for pos, sc, ec in posns])
                for docnum, posns in postings]
    assert_equal(as_posns, roundtrip(postings, Characters(), "positions"))

    # Counting the positions per document gives the frequency postings.
    as_freq = [(docnum, len(posns)) for docnum, posns in as_posns]
    assert_equal(as_freq, roundtrip(postings, Characters(), "frequency"))
示例#3
0
from whoosh.fields import (TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema,
                           FieldType, DATETIME)

# CUSTOM ANALYZER: tokenize on word characters (\w+), then lowercase every
# token so indexing and searching are case-insensitive.
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# FILE INDEX SCHEMA DEFINITION
FILE_INDEX_NAME = 'FILE_INDEX'
# Schema for the per-file index. stored=True fields are returned with search
# hits; unique fields serve as document identifiers for updates/deletes.
FILE_SCHEMA = Schema(
    fileid=ID(unique=True),  # Path
    repository=ID(stored=True),
    repository_id=NUMERIC(unique=True, stored=True),  # Numeric id of repo
    repo_name=TEXT(stored=True),
    owner=TEXT(),
    path=TEXT(stored=True),
    # File contents: the Characters format records term positions plus
    # character offsets, so matched terms can later be located in the text
    # (e.g. for highlighting excerpts).
    content=FieldType(format=Characters(),
                      analyzer=ANALYZER,
                      scorable=True,
                      stored=True),
    modtime=STORED(),  # stored-only metadata, not searchable text
    md5=STORED(),
    extension=ID(stored=True),
    commit_id=TEXT(stored=True),
    size=NUMERIC(stored=True),
    mimetype=TEXT(stored=True),
    lines=NUMERIC(stored=True),
)

# COMMIT INDEX SCHEMA
COMMIT_INDEX_NAME = 'COMMIT_INDEX'
COMMIT_SCHEMA = Schema(
示例#4
0
from whoosh.formats import Characters
from whoosh.highlight import highlight as whoosh_highlight, HtmlFormatter, ContextFragmenter
from kallithea.lib.utils2 import LazyProperty

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# CUSTOM ANALYZER: tokenize on word characters (\w+), then lowercase every
# token so indexing and searching are case-insensitive.
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

#INDEX SCHEMA DEFINITION
# Schema for the file-content index; stored=True fields come back with hits.
SCHEMA = Schema(
    fileid=ID(unique=True),  # unique document key
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=TEXT(stored=True),
    # The Characters format records term positions plus character offsets,
    # which enables the highlighting helpers imported above to locate and
    # excerpt matched terms in the stored content.
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),  # stored-only metadata, not searchable text
    extension=TEXT(stored=True)
)

IDX_NAME = 'HG_INDEX'
# Highlight rendering: wrap matches in <span> elements and join fragments
# with a styled "..." break marker.
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
# Fragmenter for match excerpts; 200 is presumably the context/fragment size
# in characters — confirm against whoosh.highlight.ContextFragmenter docs.
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),