def open_index(self, index_name):
    assert index_name in ['tweet_index', 'tag_index']
    path = os.path.join(settings.STORAGE_DIR, index_name)
    if os.path.exists(path):
        # open the existing index
        self.indexer = engine.Indexer(path)
    elif index_name == 'tag_index':
        # create the hashtag index
        self.indexer = engine.Indexer(path)
        self.indexer.set('docid', stored=True)
        self.indexer.set('rank', dimensions=1, stored=True)
        self.indexer.set('hashtag', engine.Field.Text)
        self.indexer.set('city', engine.Field.Text)
        self.indexer.set('state', engine.Field.Text)
        self.indexer.set('date', engine.DateTimeField)
    else:
        # create the tweet index and declare the fields to index
        self.indexer = engine.Indexer(path)
        self.indexer.set('docid', stored=True)
        self.indexer.set('rank', dimensions=1, stored=True)
        self.indexer.set('tweet', engine.Field.Text)
        self.indexer.set('descrpt', engine.Field.Text)
        # (longitude, latitude) tuple
        self.indexer.set('coord', engine.SpatialField)
        self.indexer.set('screen_name', engine.Field.Text)
        self.indexer.set('city', engine.Field.Text)
        self.indexer.set('state', engine.Field.Text)
        self.indexer.set('date', engine.DateTimeField)
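# Standalone sketch of the create-or-open pattern used by open_index above.
# The path and values here are illustrative, not from the source; note that
# Lucene stores no field schema, so set() must be called in every session
# that writes to the index.
import lucene
from lupyne import engine

lucene.initVM()
indexer = engine.Indexer('tag_index_demo')  # creates the index if absent, else opens it
indexer.set('docid', stored=True)
indexer.set('hashtag', engine.Field.Text)
indexer.add(docid='42', hashtag='python lucene')
indexer.commit()
assert [hit['docid'] for hit in indexer.search('hashtag:python')] == ['42']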
def test_analyzers():
    stemmer = engine.Analyzer.standard(analysis.en.PorterStemFilter, typeAsPayload)
    for token in stemmer.tokens('Search'):
        assert token.positionIncrement == 1
        assert engine.TokenFilter(analysis.miscellaneous.EmptyTokenStream()).payload is None
        assert token.charTerm == 'search'
        assert token.type == token.payload == '<ALPHANUM>'
        assert token.offset == (0, 6)
        token.charTerm = token.type = ''
        token.offset, token.positionIncrement = (0, 0), 0
    assert str(stemmer.parse('searches', field=['body', 'title'])) == 'body:search title:search'
    assert str(stemmer.parse('searches', field={'body': 1.0, 'title': 2.0})) == '(body:search)^1.0 (title:search)^2.0'
    indexer = engine.Indexer(analyzer=stemmer)
    indexer.set('text', engine.Field.Text)
    indexer.add(text='searches')
    indexer.commit()
    (item,) = indexer.positions('text', 'search', payloads=True)
    assert item == (0, [(0, '<ALPHANUM>')])
    analyzer = engine.Analyzer.whitespace(engine.TokenFilter)
    assert [token.charTerm for token in analyzer.tokens('Search Engine')] == ['Search', 'Engine']
def sort_sentences(claim):
    docs = extract_sentences_as_docs(related_sentences)
    sentence_indexer = engine.Indexer()
    sentence_indexer.deleteAll()
    sentence_indexer.set('title', engine.Field.Text, stored=True)
    sentence_indexer.set('sen_id', engine.Field.Text, stored=True)
    sentence_indexer.set('text', engine.Field.Text, stored=True)
    for key, text in docs.items():
        title, sen_id = key
        sentence_indexer.add(title=title, sen_id=sen_id, text=title + " " + text)
    sentence_indexer.commit()
    query = process_query(claim)
    hits = sentence_indexer.search(query, field='text', count=5)
    sentences = [(hit['title'], hit['sen_id']) for hit in hits]
    sentence_indexer.deleteAll()
    sentence_indexer.commit()
    sentence_indexer.close()
    return sentences
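# Note (an assumption, for context): sort_sentences expects extract_sentences_as_docs
# to return a mapping of (title, sentence id) keys to sentence text, e.g.
#
#     {('Lucene', '0'): 'Lucene is a search library.',
#      ('Lucene', '1'): 'It is written in Java.'}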
def test_multi(tempdir):
    indexers = engine.Indexer(tempdir), engine.Indexer()
    searcher = engine.MultiSearcher([indexers[0].indexReader, indexers[1].directory])
    pytest.raises(TypeError, getattr, searcher, 'timestamp')
    assert engine.MultiSearcher([indexers[0].directory]).timestamp
    assert [reader.refCount for reader in searcher.indexReaders] == [2, 1]
    assert searcher.reopen() is searcher
    indexers[0].add()
    indexers[0].commit()
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 1]
    searcher, previous = searcher.reopen(), searcher
    assert searcher.version > previous.version
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 2]
    del previous
    assert [reader.refCount for reader in searcher.indexReaders] == [1, 1]
def prepare_index(docs_dir, stemmer, min_len=5):
    indexer = engine.Indexer()
    indexer.set('contents', stored=True)
    indexer.set('path', stored=True)
    idx = 0
    p = Pool(4)
    index_start, indexed_documents_count = default_timer(), 0
    for root, dirs, files in os.walk(docs_dir):
        for f in files:
            idx += 1
            if idx % 10 == 0:
                print('indexed {} files'.format(idx))
            with codecs.open(os.path.join(root, f), 'r', 'utf-8') as doc:
                # keep only lines with at least min_len tokens
                articles = filter(lambda x: len(x.split()) >= min_len, doc.readlines())
                stemmed_articles = p.map(stem_sentence_and_remove_stopwords, articles)
                for article in stemmed_articles:
                    indexer.add(path=os.path.join(root, f), contents=article)
                    indexed_documents_count += 1
    indexer.commit()
    print('Added', indexed_documents_count, 'documents to index, spent',
          default_timer() - index_start, 'sec')
    return indexer
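# Hypothetical driver for prepare_index (the corpus path is made up; the helper
# mapped by Pool must be importable at module top level for multiprocessing):
if __name__ == '__main__':
    import lucene
    lucene.initVM()
    indexer = prepare_index('corpus/', stemmer=None)
    print(len(indexer), 'articles indexed')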
def test_nrt():
    indexer = engine.Indexer(nrt=True)
    indexer.add()
    assert indexer.count() == 0 and not indexer.current
    indexer.refresh()
    assert indexer.count() == 1 and indexer.current
    searcher = engine.IndexSearcher(indexer.directory)
    assert searcher.count() == 0 and searcher.current
    indexer.add()
    indexer.commit()
    assert indexer.count() == engine.IndexSearcher(indexer.directory).count() == 2
def test_indexes(tempdir):
    with pytest.raises(TypeError):
        engine.IndexSearcher()
    with pytest.raises(lucene.JavaError):
        engine.Indexer(tempdir, 'r')
    indexer = engine.Indexer()
    indexer.set('name', engine.Field.String, stored=True)
    indexer.set('text', engine.Field.Text)
    with engine.Indexer(tempdir) as temp:
        temp.add()
    with pytest.raises(KeyError), engine.Indexer(tempdir) as temp:
        temp.add()
        temp.add(missing='')
    for other in (temp, temp.directory, tempdir):
        indexer += other
    assert len(indexer) == 3
    analyzer = engine.Analyzer.whitespace()
    indexer.add(text=analyzer.tokens('?'), name=util.BytesRef('{}'))
    indexer.commit()
    assert indexer[next(indexer.docs('text', '?'))]['name'] == '{}'
    indexer.delete('text', '?')
    indexer.commit(merge=True)
    assert not indexer.hasDeletions()
    indexer.commit(merge=1)
    assert len(list(indexer.readers)) == 1
    reader = engine.indexers.IndexReader(indexer.indexReader)
    del reader.indexReader
    with pytest.raises(AttributeError):
        reader.maxDoc
    del indexer.indexSearcher
    with pytest.raises(AttributeError):
        indexer.search

    indexer = engine.Indexer(tempdir)
    indexer.add()
    indexer.commit()
    files = set(os.listdir(tempdir))
    path = os.path.join(tempdir, 'temp')
    with indexer.snapshot() as commit:
        indexer.commit(merge=1)
        assert indexer.indexCommit.generation > commit.generation
        engine.indexers.copy(commit, path)
        assert set(os.listdir(path)) == set(commit.fileNames) < files < set(os.listdir(tempdir))
        filepath = os.path.join(path, commit.segmentsFileName)
        os.remove(filepath)
        open(filepath, 'w').close()
        with pytest.raises(OSError):
            engine.indexers.copy(commit, path)
    with pytest.raises(lucene.JavaError):
        indexer.check(tempdir)
    del indexer
    assert engine.Indexer(tempdir)
    assert not os.path.exists(os.path.join(tempdir, commit.segmentsFileName))
    assert engine.IndexWriter.check(tempdir).clean
    assert not engine.IndexWriter.check(tempdir, fix=True).numBadSegments
def test_highlighting(constitution):
    indexer = engine.Indexer()
    indexer.set('text', engine.Field.Text, stored=True, storeTermVectors=True,
                storeTermVectorPositions=True, storeTermVectorOffsets=True)
    for doc in constitution:
        if 'amendment' in doc:
            indexer.add(text=doc['text'])
    indexer.commit()
    query = Q.term('text', 'right')
    assert engine.Analyzer.highlight(indexer.analyzer, query, 'text', "word right word") == "word <b>right</b> word"
    hits = indexer.search(query)
    highlights = list(hits.highlights(query, text=1))
    assert len(hits) == len(highlights)
    for highlight in highlights:
        assert '<b>right</b>' in highlight.pop('text') and not highlight
def test_spellcheck(fields, constitution):
    indexer = engine.Indexer()
    indexer.fields = {field.name: field for field in fields}
    for doc in constitution:
        indexer.add(doc)
    indexer.commit()
    assert indexer.complete('missing', '') == []
    assert {'shall', 'states'} <= set(indexer.complete('text', '')[:8])
    assert indexer.complete('text', 'con')[:2] == ['congress', 'constitution']
    assert indexer.complete('text', 'congress') == indexer.complete('text', 'con', count=1) == ['congress']
    assert indexer.complete('text', 'congresses') == []
    assert indexer.suggest('text', 'write') == ['writs']
    assert indexer.suggest('text', 'write', 3) == ['writs', 'writ', 'written']
    assert indexer.suggest('text', 'write', 3, maxEdits=1) == ['writs', 'writ']
    query = indexer.parse('text:write', spellcheck=True)
    assert search.TermQuery.instance_(query) and str(query) == 'text:writs'
    query = indexer.parse('"hello world"', field='text', spellcheck=True)
    assert search.PhraseQuery.instance_(query) and str(query) == 'text:"held would"'
    assert str(indexer.parse('vwxyz', field='text', spellcheck=True)) == 'text:vwxyz'
def sales_sort(command):
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(command)
    scoreDocs = searcher.search(query, 8).scoreDocs
    # sorter = search.Sort(search.SortField('price', search.SortField.Type.STRING))
    # topdocs = searcher.search(query, None, 10, sorter)
    # print("%s total matching documents." % len(topdocs.scoreDocs))
    # for scoredoc in topdocs.scoreDocs:
    #     doc = searcher.doc(scoredoc.doc)
    print("%s total matching documents." % len(scoreDocs))
    indexer = engine.Indexer()
    fields = 'name', 'price', 'comments', 'sales', 'url', 'img', 'place', 'shop'
    for name in fields:
        indexer.set(name, stored=True, tokenized=False)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        indexer.add(name=doc.get('name'), price=doc.get('price'),
                    comments=doc.get('comments'), sales=doc.get('sales'),
                    url=doc.get('url'), img=doc.get('img'),
                    shop=doc.get('shop'), place=doc.get('place'))
    indexer.commit()
    # ascending sort by 'sales', then reverse for descending order
    hits = list(indexer.search(sort='sales'))[::-1]
    for hit in hits:
        print('------------------------------------------------------------------------------------------------------------')
        print('Perfume:', hit['name'])
        print('Price:', hit['price'])
        print('Sales:', hit['sales'])
        print('img:', hit['img'])
        print('comments:', hit['comments'])
        print('Shop:', hit['shop'])
        print('Place:', hit['place'])
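# A possible alternative to the sort-ascending-then-reverse trick above, using
# the sortfield helper seen in test_docvalues below (a sketch, untested; 'sales'
# is assumed to be sortable as indexed):
#
#     sorter = search.Sort(indexer.sortfield('sales', reverse=True))
#     hits = indexer.search(sort=sorter)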
import time

import lucene
import nltk
from lupyne import engine

time_start = time.time()
lucene.initVM()

# Index stored on disk in 'doc_sentence'; Indexer combines a Writer and Searcher
# (RAMDirectory and StandardAnalyzer are the defaults when none are given).
indexer = engine.Indexer(directory='doc_sentence')
indexer.set('content', engine.Field.Text, stored=True)
indexer.set('sent', engine.Field.Text, stored=True)
indexer.set('doc', engine.Field.Text, stored=True)
indexer.set('id', engine.Field.Text, stored=True)

doc_indexer = engine.Indexer(directory='stemmer_doc')
doc_indexer.set('content', engine.Field.Text, stored=True)
doc_indexer.set('doc', engine.Field.Text, stored=True)
doc_indexer.set('id', engine.Field.Text, stored=True)

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()


def lemmatize(word):
    if not word.isalpha():
        return word
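# Illustrative indexing step for the two indexes above (ids and text are made
# up; the real pipeline presumably adds lemmatized sentences and stemmed docs):
#
#     indexer.add(content='court rules on appeal', sent='3', doc='doc-001', id='doc-001-3')
#     doc_indexer.add(content='court rule appeal', doc='doc-001', id='doc-001')
#     indexer.commit()
#     doc_indexer.commit()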
import datetime
import re
from collections import OrderedDict

import bs4
import nltk
from nltk.stem.snowball import SnowballStemmer

import general_utility as gu
from general_utility import ACIndex
from lupyne import engine

stopwords = nltk.corpus.stopwords.words('russian')
stemmer = SnowballStemmer('russian')
tokenizer = nltk.RegexpTokenizer(ur'[^A-Ba-b\s\.\",:;\(\)\!\?]+')  # WTF van Rossum?
corpus_index = ACIndex()
paths = gu.get_filepaths('/root/PycharmProjects/CA3/new_meta')

indexer = engine.Indexer('./lucene_index_05')
indexer.set('title', stored=True)
indexer.set('text', stored=True)
indexer.set('stemmed_text', stored=True)
indexer.set('rubrics', stored=True)
indexer.set('objects', stored=True)
indexer.set('metadata', stored=True)
indexer.set('keywords', stored=True)
indexer.set('id', stored=True)
indexer.set('mentions', stored=True)
indexer.set('path', stored=True, tokenized=False)


def normalize_date(date):
    if re.search('-', date):
        beginning, end = date.split('-')
def test_docvalues():
    indexer = engine.Indexer()
    indexer.set('id', engine.Field.String)
    indexer.set('title', docValuesType='binary')
    indexer.set('size', docValuesType='numeric')
    indexer.set('point', docValuesType='numeric')
    indexer.set('priority', docValuesType='sorted')
    indexer.set('tags', docValuesType='sorted_set')
    indexer.set('sizes', docValuesType='sorted_numeric')
    indexer.set('points', docValuesType='sorted_numeric')
    indexer.add(id='0', title='zero', size=0, point=0.5, priority='low', tags=['red'], sizes=[0], points=[0.5])
    indexer.commit()
    with pytest.raises(AttributeError):
        indexer.sortfield('id')
    sortfield = indexer.sortfield('id', type='string', reverse=True)
    assert sortfield.field == 'id' and sortfield.reverse and sortfield.type == search.SortField.Type.STRING
    sortfield = indexer.sortfield('title')
    assert sortfield.field == 'title' and not sortfield.reverse and sortfield.type == search.SortField.Type.STRING
    assert indexer.sortfield('size', type=int).type == search.SortField.Type.LONG
    assert indexer.sortfield('point', type=float).type == search.SortField.Type.DOUBLE
    assert indexer.sortfield('priority').type == search.SortField.Type.STRING
    assert indexer.sortfield('tags').type == search.SortField.Type.STRING
    assert indexer.sortfield('sizes').type == search.SortField.Type.LONG
    assert indexer.sortfield('points', type=float).type == search.SortField.Type.DOUBLE
    segments = indexer.segments
    indexer.update('id', id='0', title='one', size=1, point=1.5, priority='high', tags=['blue'], sizes=[1], points=[1.5])
    indexer.commit()
    assert indexer.segments != segments
    segments = indexer.segments
    assert list(indexer.docvalues('title')) == ['one']
    assert list(indexer.docvalues('size', type=int)) == [1]
    assert list(indexer.docvalues('point', type=float)) == [1.5]
    assert list(indexer.docvalues('priority')) == ['high']
    assert list(indexer.docvalues('tags')) == [('blue',)]
    assert list(indexer.docvalues('sizes', type=int)) == [(1,)]
    assert list(indexer.docvalues('points', type=float)) == [(1.5,)]
    indexer.update('id', '0', title='two', size=2, point=2.5)
    indexer.update('id', '0')
    indexer.commit()
    assert indexer.segments == segments
    assert list(indexer.docvalues('title')) == ['two']
    assert list(indexer.docvalues('size', type=int)) == [2]
    assert list(indexer.docvalues('point', type=float)) == [2.5]
    with pytest.raises(AttributeError):
        indexer.docvalues('id')
    assert indexer.search().docvalues('title') == {0: 'two'}
    indexer.add()
    indexer.commit()
    assert None in indexer.docvalues('title')
    assert None in indexer.docvalues('size', type=int)
    assert None in indexer.docvalues('tags')
    assert None in indexer.docvalues('sizes', type=int)
"""Query documents from a lupyne index, then re-sort sentences with a new in-memory index."""
import json
import time

import lucene
import nltk
from org.apache.lucene import analysis, queryparser
from org.apache.lucene.search.similarities import BM25Similarity
from lupyne import engine

start_time = time.time()
lucene.initVM()

dest = "lucene_wikis.index"
indexer = engine.Indexer(dest)
indexer.setSimilarity(BM25Similarity())
analyzer = analysis.standard.StandardAnalyzer()
parser = queryparser.classic.QueryParser("text", analyzer)
prediction = {}
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
nltk.download('averaged_perceptron_tagger')
related_sentences = []


def re_process_string(s):
    s = s.replace(" ", "_")
    s = s.replace("(", "-LRB-")
    s = s.replace(")", "-RRB-")
    return s
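# Illustrative query flow for the setup above (the claim text is made up, and
# 'text' is assumed to be a stored field in lucene_wikis.index):
#
#     query = parser.parse(queryparser.classic.QueryParser.escape('a claim to verify'))
#     for hit in indexer.search(query, count=10):
#         related_sentences.append(hit['text'])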
def __init__(self, *args, **kwargs):
    self.indexer = engine.Indexer(*args, **kwargs)
    self.updated = time.time()
    self.query_map = {}
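# Sketch of how the enclosing wrapper might use query_map as a result cache
# (the caching behavior is an assumption; only __init__ comes from the source):
class CachedIndexer:
    def __init__(self, *args, **kwargs):
        self.indexer = engine.Indexer(*args, **kwargs)
        self.updated = time.time()
        self.query_map = {}

    def search(self, query, **options):
        # memoize materialized hits per (query, options) pair
        key = (query, tuple(sorted(options.items())))
        if key not in self.query_map:
            self.query_map[key] = list(self.indexer.search(query, **options))
        return self.query_map[key]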
def test_writer(tempdir):
    indexer = engine.Indexer(useCompoundFile=False)
    assert not indexer.config.useCompoundFile
    with pytest.raises(lucene.JavaError):
        engine.Indexer(indexer.directory)
    indexer.set('text', engine.Field.Text)
    indexer.set('name', stored=True)
    indexer.set('tag', engine.Field.Text, stored=True)
    searcher = indexer.indexSearcher
    indexer.commit()
    assert searcher is indexer.indexSearcher
    assert not searcher.search(count=1)
    indexer.add(text='hello world', name='sample', tag=['python', 'search'])
    assert len(indexer) == 1 and list(indexer) == []
    indexer.commit()
    assert searcher is not indexer.indexSearcher
    assert list(indexer) == [0]
    assert indexer.current
    assert 0 in indexer and 1 not in indexer
    doc = indexer[0]
    assert doc == {'tag': ['python', 'search'], 'name': ['sample']}
    assert doc['name'] == 'sample' and doc['tag'] == 'python'
    assert doc.dict('tag') == {'name': 'sample', 'tag': ['python', 'search']}
    assert doc.dict(name=None, missing=True) == {'name': 'sample', 'missing': True}
    with pytest.raises(KeyError):
        doc['key']
    assert doc.getlist('name') == ['sample'] and doc.getlist('key') == []
    assert indexer.get(0, 'name').dict() == {'name': 'sample'}
    assert not list(indexer.termvector(0, 'tag'))
    assert indexer.count('text', 'hello') == indexer.count('text:hello') == 1
    assert list(indexer.docs('text', 'hello')) == [0]
    assert list(indexer.docs('text', 'hi')) == []
    assert list(indexer.docs('text', 'world', counts=True)) == [(0, 1)]
    assert list(indexer.positions('text', 'world')) == [(0, [1])]
    assert list(indexer.positions('text', 'world', offsets=True)) == [(0, [(-1, -1)])]
    hits = indexer.search('text:hello')
    assert len(hits) == hits.count == 1
    assert hits.scoredocs == hits[:1].scoredocs and not hits[1:]
    assert list(hits.ids) == [0]
    (score,) = hits.scores
    assert 0 < score < 1
    assert dict(hits.items()) == {0: score}
    data = hits[0].dict()
    assert data['__id__'] == 0 and '__score__' in data
    assert not indexer.search('hello') and indexer.search('hello', field='text')
    assert indexer.search('text:hello hi') and not indexer.search('text:hello hi', op='and')
    assert indexer.search('text:*hello', allowLeadingWildcard=True)
    indexer.delete('name:sample')
    indexer.delete('tag', 'python')
    assert 0 in indexer and len(indexer) == 1 and indexer.segments == {'_0': 1}
    indexer.commit()
    assert 0 not in indexer and len(indexer) == 0 and sum(indexer.segments.values()) == 0
    indexer.add(tag='test', name='old')
    indexer.update('tag', tag='test')
    indexer.commit()
    assert [indexer[id].dict() for id in indexer] == [{'tag': 'test'}]
    indexer.update('tag', 'test', {'name': 'new'})
    indexer.commit()
    assert [indexer[id].dict() for id in indexer] == [{'name': 'new'}]
def test_searcher(tempdir, fields, constitution):
    indexer = engine.Indexer(tempdir)
    indexer.fields = {field.name: field for field in fields}
    for doc in constitution:
        indexer.add(doc)
    indexer.commit()
    searcher = engine.IndexSearcher.load(tempdir)
    assert len(indexer) == len(searcher) and store.RAMDirectory.instance_(searcher.directory)
    assert indexer.spellcheckers == {}
    assert indexer.complete('amendment', '')
    assert list(indexer.spellcheckers) == ['amendment']
    indexer.delete('amendment', doc['amendment'])
    indexer.add(doc)
    reader = indexer.indexReader
    indexer.commit(spellcheckers=True)
    assert reader.refCount == 0
    assert list(indexer.spellcheckers) == ['amendment']
    analyzer = engine.Analyzer.standard()
    doc = {'text': doc['text'], 'amendment': analyzer.tokens(doc['amendment'])}
    scores = list(searcher.match(doc, 'text:congress', 'text:law', 'amendment:27'))
    assert 0.0 == scores[0] < scores[1] <= scores[2] < 1.0
    assert len(indexer) == len(indexer.search()) == 35
    articles = list(indexer.terms('article'))
    articles.remove('Preamble')
    assert sorted(map(int, articles)) == list(range(1, 8))
    assert sorted(map(int, indexer.terms('amendment'))) == list(range(1, 28))
    assert list(indexer.terms('text', 'right')) == ['right', 'rights']
    assert dict(indexer.terms('text', 'right', counts=True)) == {'right': 13, 'rights': 1}
    assert list(indexer.terms('text', 'right', 'right_')) == ['right']
    assert dict(indexer.terms('text', 'right', 'right_', counts=True)) == {'right': 13}
    assert list(indexer.terms('text', 'right', distance=1)) == ['eight', 'right', 'rights']
    assert dict(indexer.terms('text', 'right', distance=1, counts=True)) == {'eight': 3, 'right': 13, 'rights': 1}
    assert list(indexer.terms('text', 'senite', distance=2)) == ['senate', 'sent']
    word, count = next(indexer.terms('text', 'people', counts=True))
    assert word == 'people' and count == 8
    docs = dict(indexer.docs('text', 'people', counts=True))
    counts = list(docs.values())
    assert len(docs) == count and all(counts) and sum(counts) > count
    positions = dict(indexer.positions('text', 'people'))
    assert list(map(len, positions.values())) == counts
    (hit,) = indexer.search('"We the People"', field='text')
    assert hit['article'] == 'Preamble'
    assert sorted(hit.dict()) == ['__id__', '__score__', 'article']
    hits = indexer.search('people', field='text')
    assert 'Preamble' in (hit.get('article') for hit in hits)
    assert len(hits) == hits.count == 8
    assert set(map(type, hits.ids)) == {int} and set(map(type, hits.scores)) == {float}
    assert hits.maxscore == next(hits.scores)
    ids = list(hits.ids)
    hits = indexer.search('people', count=5, mincount=5, field='text')
    assert list(hits.ids) == ids[:len(hits)]
    assert len(hits) == 5 and hits.count == 8
    assert not any(map(math.isnan, hits.scores))
    assert hits.maxscore == next(hits.scores)
    hits = indexer.search('text:people', count=5, sort=search.Sort.INDEXORDER, scores=True)
    assert sorted(hits.ids) == list(hits.ids)
    assert all(score > 0 for score in hits.scores)
    (hit,) = indexer.search('freedom', field='text')
    assert hit['amendment'] == '1'
    assert sorted(hit.dict()) == ['__id__', '__score__', 'amendment', 'date']
    hits = indexer.search('date:[1919 TO 1921]')
    amendments = ['18', '19']
    assert sorted(hit['amendment'] for hit in hits) == amendments
    query = Q.range('date', '1919', '1921')
    span = Q.span('text', 'persons')
    count = indexer.count(span)
    spans = dict(indexer.spans(span))
    assert len(spans) == count and spans == dict(indexer.docs('text', 'persons', counts=True))
    near = Q.near('text', 'persons', 'papers', slop=2)
    ((id, positions),) = indexer.spans(near, positions=True)
    assert indexer[id]['amendment'] == '4' and positions in ([(3, 6)], [(10, 13)])
    assert 'persons' in indexer.termvector(id, 'text')
    assert dict(indexer.termvector(id, 'text', counts=True))['persons'] == 2
    assert dict(indexer.positionvector(id, 'text'))['persons'] in ([3, 26], [10, 48])
    assert dict(indexer.positionvector(id, 'text', offsets=True))['persons'] == [(46, 53), (301, 308)]
    analyzer = analysis.core.WhitespaceAnalyzer()
    query = indexer.morelikethis(0, analyzer=analyzer)
    assert {'text:united', 'text:states'} <= set(str(query).split())
    assert str(indexer.morelikethis(0, 'article', analyzer=analyzer)) == ''
    query = indexer.morelikethis(0, minDocFreq=3, analyzer=analyzer)
    assert {'text:establish', 'text:united', 'text:states'} <= set(str(query).split())
    assert str(indexer.morelikethis('jury', 'text', minDocFreq=4, minTermFreq=1, analyzer=analyzer)) == 'text:jury'
    assert str(indexer.morelikethis('jury', 'article', analyzer=analyzer)) == ''
def indexer(tempdir):
    with engine.Indexer(tempdir) as indexer:
        for name in ('city', 'county', 'state', 'latitude', 'longitude'):
            indexer.set(name, stored=True)
        indexer.set('zipcode', engine.Field.String, stored=True)
        yield indexer
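# Example test consuming the generator above (assuming it is registered with
# @pytest.fixture; the sample document is illustrative):
def test_zipcode(indexer):
    indexer.add(zipcode='90210', city='Beverly Hills', state='CA')
    indexer.commit()
    (hit,) = indexer.search('zipcode:90210')
    assert hit['city'] == 'Beverly Hills'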