Example #1
    def open_index(self, index_name):
        assert index_name in ['tweet_index', 'tag_index']

        path = os.path.join(settings.STORAGE_DIR, index_name)

        if os.path.exists(path):
            # open index
            self.indexer = engine.Indexer(path)

        elif index_name == 'tag_index':
            # create index
            self.indexer = engine.Indexer(path)
            self.indexer.set('docid', stored=True)
            self.indexer.set('rank', dimensions=1, stored=True)
            self.indexer.set('hashtag', engine.Field.Text)
            self.indexer.set('city', engine.Field.Text)
            self.indexer.set('state', engine.Field.Text)
            self.indexer.set('date', engine.DateTimeField)
        else:
            # create index
            self.indexer = engine.Indexer(path)

            # fields to index from the tweets
            self.indexer.set('docid', stored=True)
            self.indexer.set('rank', dimensions=1, stored=True)
            self.indexer.set('tweet', engine.Field.Text)
            self.indexer.set('descrpt', engine.Field.Text)
            # longitude and latitude tuple
            self.indexer.set('coord', engine.SpatialField)
            self.indexer.set('screen_name', engine.Field.Text)
            self.indexer.set('city', engine.Field.Text)
            self.indexer.set('state', engine.Field.Text)
            self.indexer.set('date', engine.DateTimeField)
Example #2
def test_analyzers():
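    # Porter-stemmed analyzer: inspect token attributes, parse a query against
    # multiple fields, and read positions back with type payloads.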
    stemmer = engine.Analyzer.standard(analysis.en.PorterStemFilter,
                                       typeAsPayload)
    for token in stemmer.tokens('Search'):
        assert token.positionIncrement == 1
        assert engine.TokenFilter(
            analysis.miscellaneous.EmptyTokenStream()).payload is None
        assert token.charTerm == 'search'
        assert token.type == token.payload == '<ALPHANUM>'
        assert token.offset == (0, 6)
        token.charTerm = token.type = ''
        token.offset, token.positionIncrement = (0, 0), 0
    assert str(stemmer.parse('searches',
                             field=['body',
                                    'title'])) == 'body:search title:search'
    assert str(stemmer.parse('searches', field={
        'body': 1.0,
        'title': 2.0
    })) == '(body:search)^1.0 (title:search)^2.0'
    indexer = engine.Indexer(analyzer=stemmer)
    indexer.set('text', engine.Field.Text)
    indexer.add(text='searches')
    indexer.commit()
    (item, ) = indexer.positions('text', 'search', payloads=True)
    assert item == (0, [(0, '<ALPHANUM>')])
    analyzer = engine.Analyzer.whitespace(engine.TokenFilter)
    assert [token.charTerm for token in analyzer.tokens('Search Engine')
            ] == ['Search', 'Engine']
Example #3
def sort_sentences(claim):
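    # Index the candidate sentences in a temporary in-memory index and return
    # the (title, sen_id) pairs of the top 5 hits for the claim query.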
    docs = extract_sentences_as_docs(related_sentences)

    sentence_indexer = engine.Indexer()
    sentence_indexer.deleteAll()
    sentence_indexer.set('title', engine.Field.Text, stored=True)
    sentence_indexer.set('sen_id', engine.Field.Text, stored=True)
    sentence_indexer.set('text', engine.Field.Text, stored=True)

    for key, text in docs.items():
        title = key[0]
        sen_id = key[1]
        sentence_indexer.add(title=title,
                             sen_id=sen_id,
                             text=title + " " + text)
    sentence_indexer.commit()

    query = process_query(claim)
    hits = sentence_indexer.search(query, field='text', count=5)
    sentences = []
    for hit in hits:
        sentences.append((hit['title'], hit['sen_id']))
    sentence_indexer.deleteAll()
    sentence_indexer.commit()
    sentence_indexer.close()
    return sentences
Example #4
def test_multi(tempdir):
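    # MultiSearcher over a file-based index and an in-memory index: verify
    # reader reference counts before and after reopening.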
    indexers = engine.Indexer(tempdir), engine.Indexer()
    searcher = engine.MultiSearcher(
        [indexers[0].indexReader, indexers[1].directory])
    pytest.raises(TypeError, getattr, searcher, 'timestamp')
    assert engine.MultiSearcher([indexers[0].directory]).timestamp
    assert [reader.refCount for reader in searcher.indexReaders] == [2, 1]
    assert searcher.reopen() is searcher
    indexers[0].add()
    indexers[0].commit()
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 1]
    searcher, previous = searcher.reopen(), searcher
    assert searcher.version > previous.version
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 2]
    del previous
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 1]
Example #5
def prepare_index(docs_dir, stemmer, min_len=5):
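    # Walk docs_dir, stem each sufficiently long line in parallel, and index
    # the result together with its source path.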
    indexer = engine.Indexer()
    indexer.set('contents', stored=True)
    indexer.set('path', stored=True)

    idx = 0
    p = Pool(4)
    index_start, indexed_documents_count = default_timer(), 0
    for root, dirs, files in os.walk(docs_dir):
        for f in files:
            idx += 1
            if idx % 10 == 0:
                print('indexed {} files'.format(idx))

            with codecs.open(os.path.join(root, f), 'r', 'utf-8') as doc:
                articles = filter(lambda x: len(x.split()) >= min_len,
                                  doc.readlines())
                stemmed_articles = p.map(stem_sentence_and_remove_stopwords,
                                         articles)
                for article in stemmed_articles:
                    indexer.add(path=os.path.join(root, f), contents=article)
                    indexed_documents_count += 1

    indexer.commit()
    print('Added', indexed_documents_count, 'documents to index, spent',
          default_timer() - index_start, 'sec')
    return indexer
Example #6
def test_nrt():
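    # Near-real-time mode: added documents are invisible until refresh(), and
    # invisible to searchers on the directory until commit().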
    indexer = engine.Indexer(nrt=True)
    indexer.add()
    assert indexer.count() == 0 and not indexer.current
    indexer.refresh()
    assert indexer.count() == 1 and indexer.current
    searcher = engine.IndexSearcher(indexer.directory)
    assert searcher.count() == 0 and searcher.current
    indexer.add()
    indexer.commit()
    assert indexer.count() == engine.IndexSearcher(
        indexer.directory).count() == 2
Example #7
def test_indexes(tempdir):
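    # Index lifecycle: open/read errors, merging other indexes in, commits
    # with merging, snapshot copying, and integrity checks.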
    with pytest.raises(TypeError):
        engine.IndexSearcher()
    with pytest.raises(lucene.JavaError):
        engine.Indexer(tempdir, 'r')
    indexer = engine.Indexer()
    indexer.set('name', engine.Field.String, stored=True)
    indexer.set('text', engine.Field.Text)
    with engine.Indexer(tempdir) as temp:
        temp.add()
    with pytest.raises(KeyError), engine.Indexer(tempdir) as temp:
        temp.add()
        temp.add(missing='')
    for other in (temp, temp.directory, tempdir):
        indexer += other
    assert len(indexer) == 3
    analyzer = engine.Analyzer.whitespace()
    indexer.add(text=analyzer.tokens('?'), name=util.BytesRef('{}'))
    indexer.commit()
    assert indexer[next(indexer.docs('text', '?'))]['name'] == '{}'
    indexer.delete('text', '?')
    indexer.commit(merge=True)
    assert not indexer.hasDeletions()
    indexer.commit(merge=1)
    assert len(list(indexer.readers)) == 1
    reader = engine.indexers.IndexReader(indexer.indexReader)
    del reader.indexReader
    with pytest.raises(AttributeError):
        reader.maxDoc
    del indexer.indexSearcher
    with pytest.raises(AttributeError):
        indexer.search

    indexer = engine.Indexer(tempdir)
    indexer.add()
    indexer.commit()
    files = set(os.listdir(tempdir))
    path = os.path.join(tempdir, 'temp')
    with indexer.snapshot() as commit:
        indexer.commit(merge=1)
        assert indexer.indexCommit.generation > commit.generation
        engine.indexers.copy(commit, path)
        assert set(os.listdir(path)) == set(commit.fileNames) < files < set(
            os.listdir(tempdir))
        filepath = os.path.join(path, commit.segmentsFileName)
        os.remove(filepath)
        open(filepath, 'w').close()
        with pytest.raises(OSError):
            engine.indexers.copy(commit, path)
    with pytest.raises(lucene.JavaError):
        indexer.check(tempdir)
    del indexer
    assert engine.Indexer(tempdir)
    assert not os.path.exists(os.path.join(tempdir, commit.segmentsFileName))
    assert engine.IndexWriter.check(tempdir).clean
    assert not engine.IndexWriter.check(tempdir, fix=True).numBadSegments
Example #8
def test_highlighting(constitution):
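    # Term vectors with positions and offsets enable highlighting of matches.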
    indexer = engine.Indexer()
    indexer.set('text',
                engine.Field.Text,
                stored=True,
                storeTermVectors=True,
                storeTermVectorPositions=True,
                storeTermVectorOffsets=True)
    for doc in constitution:
        if 'amendment' in doc:
            indexer.add(text=doc['text'])
    indexer.commit()
    query = Q.term('text', 'right')
    assert engine.Analyzer.highlight(
        indexer.analyzer, query, 'text',
        "word right word") == "word <b>right</b> word"
    hits = indexer.search(query)
    highlights = list(hits.highlights(query, text=1))
    assert len(hits) == len(highlights)
    for highlight in highlights:
        assert '<b>right</b>' in highlight.pop('text') and not highlight
Example #9
def test_spellcheck(fields, constitution):
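    # Prefix completion, edit-distance suggestions, and spellcheck-corrected
    # query parsing.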
    indexer = engine.Indexer()
    indexer.fields = {field.name: field for field in fields}
    for doc in constitution:
        indexer.add(doc)
    indexer.commit()
    assert indexer.complete('missing', '') == []
    assert {'shall', 'states'} <= set(indexer.complete('text', '')[:8])
    assert indexer.complete('text', 'con')[:2] == ['congress', 'constitution']
    assert indexer.complete('text', 'congress') == indexer.complete(
        'text', 'con', count=1) == ['congress']
    assert indexer.complete('text', 'congresses') == []
    assert indexer.suggest('text', 'write') == ['writs']
    assert indexer.suggest('text', 'write', 3) == ['writs', 'writ', 'written']
    assert indexer.suggest('text', 'write', 3, maxEdits=1) == ['writs', 'writ']
    query = indexer.parse('text:write', spellcheck=True)
    assert search.TermQuery.instance_(query) and str(query) == 'text:writs'
    query = indexer.parse('"hello world"', field='text', spellcheck=True)
    assert search.PhraseQuery.instance_(query) and str(
        query) == 'text:"held would"'
    assert str(indexer.parse('vwxyz', field='text',
                             spellcheck=True)) == 'text:vwxyz'
Example #10
def sales_sort(command):
    query = QueryParser(Version.LUCENE_CURRENT, "name",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 8).scoreDocs
    # sorter = search.Sort(search.SortField('price', search.SortField.Type.STRING))
    # topdocs = searcher.search(query, None, 10, sorter)
    # print "%s total matching documents." % len(topdocs.scoreDocs)
    # for scoredoc in topdocs.scoreDocs:
    #     doc = searcher.doc(scoredoc.doc)
    print "%s total matching documents." % len(scoreDocs)
    indexer = engine.Indexer()
    l = 'name', 'price', 'comments', 'sales', 'url', 'img', 'place', 'shop'
    for i in l:
        indexer.set(i, stored=True, tokenized=False)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        indexer.add(name=doc.get('name'),
                    price=doc.get('price'),
                    comments=doc.get('comments'),
                    sales=doc.get('sales'),
                    url=doc.get('url'),
                    img=doc.get('img'),
                    shop=doc.get('shop'),
                    place=doc.get('place'))
        # print doc.get('sales')
    indexer.commit()
    hits = list(indexer.search(sort='sales'))[::-1]
    for hit in hits:
        print '------------------------------------------------------------------------------------------------------------'
        print 'Perfume:', hit['name']
        print 'Price:', hit['price']
        print 'Sales:', hit['sales']
        print 'img:', hit['img']
        print 'comments:', hit['comments']
        print 'Shop:', hit['shop']
        print 'Place:', hit['place']
Example #11
import lucene
import nltk
from lupyne import engine

import time
time_start = time.time()

lucene.initVM()

# Store the index on disk under 'doc_sentence':
indexer = engine.Indexer(
    directory='doc_sentence'
)  # Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults
indexer.set('content', engine.Field.Text, stored=True)
indexer.set('sent', engine.Field.Text, stored=True)
indexer.set('doc', engine.Field.Text, stored=True)
indexer.set('id', engine.Field.Text, stored=True)

doc_indexer = engine.Indexer(directory='stemmer_doc')
doc_indexer.set('content', engine.Field.Text, stored=True)
doc_indexer.set('doc', engine.Field.Text, stored=True)
doc_indexer.set('id', engine.Field.Text, stored=True)

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()


def lemmatize(word):
    if not word.isalpha():
        return word
Example #12
import bs4
from general_utility import ACIndex
import datetime
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from collections import OrderedDict
import general_utility as gu
from lupyne import engine  # needed below for engine.Indexer

stopwords = nltk.corpus.stopwords.words('russian')
stemmer = SnowballStemmer('russian')
tokenizer = nltk.RegexpTokenizer(
    ur'[^A-Ba-b\s\.\",:;\(\)\!\?]+')  #WTF van Rossum?
corpus_index = ACIndex()
paths = gu.get_filepaths('/root/PycharmProjects/CA3/new_meta')
indexer = engine.Indexer('./lucene_index_05')
indexer.set('title', stored=True)
indexer.set('text', stored=True)
indexer.set('stemmed_text', stored=True)
indexer.set('rubrics', stored=True)
indexer.set('objects', stored=True)
indexer.set('metadata', stored=True)
indexer.set('keywords', stored=True)
indexer.set('id', stored=True)
indexer.set('mentions', stored=True)
indexer.set('path', stored=True, tokenized=False)


def normalize_date(date):
    if re.search('-', date):
        beginning, end = date.split('-')
Example #13
def test_docvalues():
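    # DocValues fields: derive sort fields from each docValuesType, read the
    # values back, and apply in-place docvalues updates.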
    indexer = engine.Indexer()
    indexer.set('id', engine.Field.String)
    indexer.set('title', docValuesType='binary')
    indexer.set('size', docValuesType='numeric')
    indexer.set('point', docValuesType='numeric')
    indexer.set('priority', docValuesType='sorted')
    indexer.set('tags', docValuesType='sorted_set')
    indexer.set('sizes', docValuesType='sorted_numeric')
    indexer.set('points', docValuesType='sorted_numeric')
    indexer.add(id='0',
                title='zero',
                size=0,
                point=0.5,
                priority='low',
                tags=['red'],
                sizes=[0],
                points=[0.5])
    indexer.commit()

    with pytest.raises(AttributeError):
        indexer.sortfield('id')
    sortfield = indexer.sortfield('id', type='string', reverse=True)
    assert sortfield.field == 'id' and sortfield.reverse and sortfield.type == search.SortField.Type.STRING
    sortfield = indexer.sortfield('title')
    assert sortfield.field == 'title' and not sortfield.reverse and sortfield.type == search.SortField.Type.STRING
    assert indexer.sortfield('size',
                             type=int).type == search.SortField.Type.LONG
    assert indexer.sortfield('point',
                             type=float).type == search.SortField.Type.DOUBLE
    assert indexer.sortfield('priority').type == search.SortField.Type.STRING
    assert indexer.sortfield('tags').type == search.SortField.Type.STRING
    assert indexer.sortfield('sizes').type == search.SortField.Type.LONG
    assert indexer.sortfield('points',
                             type=float).type == search.SortField.Type.DOUBLE

    segments = indexer.segments
    indexer.update('id',
                   id='0',
                   title='one',
                   size=1,
                   point=1.5,
                   priority='high',
                   tags=['blue'],
                   sizes=[1],
                   points=[1.5])
    indexer.commit()
    assert indexer.segments != segments
    segments = indexer.segments
    assert list(indexer.docvalues('title')) == ['one']
    assert list(indexer.docvalues('size', type=int)) == [1]
    assert list(indexer.docvalues('point', type=float)) == [1.5]
    assert list(indexer.docvalues('priority')) == ['high']
    assert list(indexer.docvalues('tags')) == [('blue', )]
    assert list(indexer.docvalues('sizes', type=int)) == [(1, )]
    assert list(indexer.docvalues('points', type=float)) == [(1.5, )]
    indexer.update('id', '0', title='two', size=2, point=2.5)
    indexer.update('id', '0')
    indexer.commit()
    assert indexer.segments == segments
    assert list(indexer.docvalues('title')) == ['two']
    assert list(indexer.docvalues('size', type=int)) == [2]
    assert list(indexer.docvalues('point', type=float)) == [2.5]
    with pytest.raises(AttributeError):
        indexer.docvalues('id')
    assert indexer.search().docvalues('title') == {0: 'two'}

    indexer.add()
    indexer.commit()
    assert None in indexer.docvalues('title')
    assert None in indexer.docvalues('size', type=int)
    assert None in indexer.docvalues('tags')
    assert None in indexer.docvalues('sizes', type=int)
Example #14
"""
Use to query docs by lupyne index, then re-sort sentences with a new in-memory index
"""

import time
import json
import lucene
import nltk
from org.apache.lucene import queryparser, analysis
from org.apache.lucene.search.similarities import BM25Similarity
from lupyne import engine

start_time = time.time()

lucene.initVM()
dest = "lucene_wikis.index"
indexer = engine.Indexer(dest)
indexer.setSimilarity(BM25Similarity())
analyzer = analysis.standard.StandardAnalyzer()
parser = queryparser.classic.QueryParser("text", analyzer)
prediction = {}
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
nltk.download('averaged_perceptron_tagger')
related_sentences = []


def re_process_string(str):
    str = str.replace(" ", "_")
    str = str.replace("(", "-LRB-")
    str = str.replace(")", "-RRB-")
    return str
Example #15
    def __init__(self, *args, **kwargs):
        self.indexer = engine.Indexer(*args, **kwargs)
        self.updated = time.time()
        self.query_map = {}
Example #16
def test_writer(tempdir):
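    # Basic write/search round trip: field definitions, document access,
    # term and position queries, deletes, and updates.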
    indexer = engine.Indexer(useCompoundFile=False)
    assert not indexer.config.useCompoundFile
    with pytest.raises(lucene.JavaError):
        engine.Indexer(indexer.directory)
    indexer.set('text', engine.Field.Text)
    indexer.set('name', stored=True)
    indexer.set('tag', engine.Field.Text, stored=True)
    searcher = indexer.indexSearcher
    indexer.commit()
    assert searcher is indexer.indexSearcher
    assert not searcher.search(count=1)
    indexer.add(text='hello world', name='sample', tag=['python', 'search'])
    assert len(indexer) == 1 and list(indexer) == []
    indexer.commit()
    assert searcher is not indexer.indexSearcher
    assert list(indexer) == [0]
    assert indexer.current
    assert 0 in indexer and 1 not in indexer
    doc = indexer[0]
    assert doc == {'tag': ['python', 'search'], 'name': ['sample']}
    assert doc['name'] == 'sample' and doc['tag'] == 'python'
    assert doc.dict('tag') == {'name': 'sample', 'tag': ['python', 'search']}
    assert doc.dict(name=None, missing=True) == {
        'name': 'sample',
        'missing': True
    }
    with pytest.raises(KeyError):
        doc['key']
    assert doc.getlist('name') == ['sample'] and doc.getlist('key') == []
    assert indexer.get(0, 'name').dict() == {'name': 'sample'}
    assert not list(indexer.termvector(0, 'tag'))
    assert indexer.count('text', 'hello') == indexer.count('text:hello') == 1
    assert list(indexer.docs('text', 'hello')) == [0]
    assert list(indexer.docs('text', 'hi')) == []
    assert list(indexer.docs('text', 'world', counts=True)) == [(0, 1)]
    assert list(indexer.positions('text', 'world')) == [(0, [1])]
    assert list(indexer.positions('text', 'world',
                                  offsets=True)) == [(0, [(-1, -1)])]
    hits = indexer.search('text:hello')
    assert len(hits) == hits.count == 1
    assert hits.scoredocs == hits[:1].scoredocs and not hits[1:]
    assert list(hits.ids) == [0]
    (score, ) = hits.scores
    assert 0 < score < 1
    assert dict(hits.items()) == {0: score}
    data = hits[0].dict()
    assert data['__id__'] == 0 and '__score__' in data
    assert not indexer.search('hello') and indexer.search('hello',
                                                          field='text')
    assert indexer.search('text:hello hi') and not indexer.search(
        'text:hello hi', op='and')
    assert indexer.search('text:*hello', allowLeadingWildcard=True)
    indexer.delete('name:sample')
    indexer.delete('tag', 'python')
    assert 0 in indexer and len(indexer) == 1 and indexer.segments == {'_0': 1}
    indexer.commit()
    assert 0 not in indexer and len(indexer) == 0 and sum(
        indexer.segments.values()) == 0
    indexer.add(tag='test', name='old')
    indexer.update('tag', tag='test')
    indexer.commit()
    assert [indexer[id].dict() for id in indexer] == [{'tag': 'test'}]
    indexer.update('tag', 'test', {'name': 'new'})
    indexer.commit()
    assert [indexer[id].dict() for id in indexer] == [{'name': 'new'}]
Example #17
def test_searcher(tempdir, fields, constitution):
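    # Search the constitution corpus: term enumeration with fuzzy matching,
    # phrase and span queries, term vectors, and more-like-this queries.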
    indexer = engine.Indexer(tempdir)
    indexer.fields = {field.name: field for field in fields}
    for doc in constitution:
        indexer.add(doc)
    indexer.commit()
    searcher = engine.IndexSearcher.load(tempdir)
    assert len(indexer) == len(searcher) and store.RAMDirectory.instance_(
        searcher.directory)
    assert indexer.spellcheckers == {}
    assert indexer.complete('amendment', '')
    assert list(indexer.spellcheckers) == ['amendment']
    indexer.delete('amendment', doc['amendment'])
    indexer.add(doc)
    reader = indexer.indexReader
    indexer.commit(spellcheckers=True)
    assert reader.refCount == 0
    assert list(indexer.spellcheckers) == ['amendment']
    analyzer = engine.Analyzer.standard()
    doc = {'text': doc['text'], 'amendment': analyzer.tokens(doc['amendment'])}
    scores = list(
        searcher.match(doc, 'text:congress', 'text:law', 'amendment:27'))
    assert 0.0 == scores[0] < scores[1] <= scores[2] < 1.0
    assert len(indexer) == len(indexer.search()) == 35
    articles = list(indexer.terms('article'))
    articles.remove('Preamble')
    assert sorted(map(int, articles)) == list(range(1, 8))
    assert sorted(map(int, indexer.terms('amendment'))) == list(range(1, 28))
    assert list(indexer.terms('text', 'right')) == ['right', 'rights']
    assert dict(indexer.terms('text', 'right', counts=True)) == {
        'right': 13,
        'rights': 1
    }
    assert list(indexer.terms('text', 'right', 'right_')) == ['right']
    assert dict(indexer.terms('text', 'right', 'right_', counts=True)) == {
        'right': 13
    }
    assert list(indexer.terms('text', 'right',
                              distance=1)) == ['eight', 'right', 'rights']
    assert dict(indexer.terms('text', 'right', distance=1, counts=True)) == {
        'eight': 3,
        'right': 13,
        'rights': 1
    }
    assert list(indexer.terms('text', 'senite',
                              distance=2)) == ['senate', 'sent']
    word, count = next(indexer.terms('text', 'people', counts=True))
    assert word == 'people' and count == 8
    docs = dict(indexer.docs('text', 'people', counts=True))
    counts = list(docs.values())
    assert len(docs) == count and all(counts) and sum(counts) > count
    positions = dict(indexer.positions('text', 'people'))
    assert list(map(len, positions.values())) == counts
    (hit, ) = indexer.search('"We the People"', field='text')
    assert hit['article'] == 'Preamble'
    assert sorted(hit.dict()) == ['__id__', '__score__', 'article']
    hits = indexer.search('people', field='text')
    assert 'Preamble' in (hit.get('article') for hit in hits)
    assert len(hits) == hits.count == 8
    assert set(map(type, hits.ids)) == {int} and set(map(
        type, hits.scores)) == {float}
    assert hits.maxscore == next(hits.scores)
    ids = list(hits.ids)
    hits = indexer.search('people', count=5, mincount=5, field='text')
    assert list(hits.ids) == ids[:len(hits)]
    assert len(hits) == 5 and hits.count == 8
    assert not any(map(math.isnan, hits.scores))
    assert hits.maxscore == next(hits.scores)
    hits = indexer.search('text:people',
                          count=5,
                          sort=search.Sort.INDEXORDER,
                          scores=True)
    assert sorted(hits.ids) == list(hits.ids)
    assert all(score > 0 for score in hits.scores)
    (hit, ) = indexer.search('freedom', field='text')
    assert hit['amendment'] == '1'
    assert sorted(hit.dict()) == ['__id__', '__score__', 'amendment', 'date']
    hits = indexer.search('date:[1919 TO 1921]')
    amendments = ['18', '19']
    assert sorted(hit['amendment'] for hit in hits) == amendments
    query = Q.range('date', '1919', '1921')
    span = Q.span('text', 'persons')
    count = indexer.count(span)
    spans = dict(indexer.spans(span))
    assert len(spans) == count and spans == dict(
        indexer.docs('text', 'persons', counts=True))
    near = Q.near('text', 'persons', 'papers', slop=2)
    ((id, positions), ) = indexer.spans(near, positions=True)
    assert indexer[id]['amendment'] == '4' and positions in ([(3, 6)
                                                              ], [(10, 13)])
    assert 'persons' in indexer.termvector(id, 'text')
    assert dict(indexer.termvector(id, 'text', counts=True))['persons'] == 2
    assert dict(indexer.positionvector(id,
                                       'text'))['persons'] in ([3,
                                                                26], [10, 48])
    assert dict(indexer.positionvector(id, 'text',
                                       offsets=True))['persons'] == [(46, 53),
                                                                     (301, 308)
                                                                     ]
    analyzer = analysis.core.WhitespaceAnalyzer()
    query = indexer.morelikethis(0, analyzer=analyzer)
    assert {'text:united', 'text:states'} <= set(str(query).split())
    assert str(indexer.morelikethis(0, 'article', analyzer=analyzer)) == ''
    query = indexer.morelikethis(0, minDocFreq=3, analyzer=analyzer)
    assert {'text:establish', 'text:united', 'text:states'} <= set(
        str(query).split())
    assert str(
        indexer.morelikethis('jury',
                             'text',
                             minDocFreq=4,
                             minTermFreq=1,
                             analyzer=analyzer)) == 'text:jury'
    assert str(indexer.morelikethis('jury', 'article',
                                    analyzer=analyzer)) == ''
Example #18
def indexer(tempdir):
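    # Yields an on-disk indexer with stored location fields (likely a pytest fixture).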
    with engine.Indexer(tempdir) as indexer:
        for name in ('city', 'county', 'state', 'latitude', 'longitude'):
            indexer.set(name, stored=True)
        indexer.set('zipcode', engine.Field.String, stored=True)
        yield indexer