def prepare_chained_filter(self, dt1, dt2):
    """Return a chained filter."""
    return ChainedFilter(
        [
            self.dup_filter,
            TermRangeFilter(
                'date_published',
                BytesRef(dt1.strftime(self.date_format)),
                BytesRef(dt2.strftime(self.date_format)),
                True,
                True)
        ],
        [ChainedFilter.AND, ChainedFilter.AND])
def query_between_dates(self, dt1, dt2, original_query=None):
    '''Update the given query to only allow records between dt1 and dt2.'''
    return TermRangeQuery(
        'date_published',                           # Field
        BytesRef(dt1.strftime(self.date_format)),   # Lower bound
        BytesRef(dt2.strftime(self.date_format)),   # Upper bound
        True,                                       # Include lower bound
        True)                                       # Include upper bound
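A minimal usage sketch (an assumption, not part of the original source): if the two methods above live on an indexer object whose date_format matches the indexed date_published values, the returned TermRangeQuery can be run directly with an IndexSearcher. The names indexer and searcher, and the hit limit of 10, are illustrative.

from datetime import datetime

query = indexer.query_between_dates(datetime(2020, 1, 1), datetime(2020, 12, 31))
top_docs = searcher.search(query, 10)       # searcher: an IndexSearcher on the same index
for score_doc in top_docs.scoreDocs:
    hit = searcher.doc(score_doc.doc)
    print(hit.get('canonical_url'), hit.get('date_published'))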
def index_one(self, article):
    """Create index for one url object in the database."""
    try:
        date_published_str = article['date_published'].strftime(
            self.date_format)
    except Exception as e:
        logger.warning('Error when formatting date_published %r: %s',
                       article['canonical_url'], e)
        return
    doc = Document()
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    doc.add(
        StringField('date_published', date_published_str, Field.Store.YES))
    doc.add(
        SortedDocValuesField('date_published', BytesRef(date_published_str)))
    doc.add(StoredField('date_published', date_published_str))
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    doc.add(
        TextField('canonical_url', article['canonical_url'], Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)
def __init__(self, input):
    super(PayloadSetter, self).__init__(input)
    self.input = input
    self.payloadAtt = self.addAttribute(PayloadAttribute.class_)
    self.data = JArray('byte')(1)
    self.p = BytesRef(self.data, 0, 1)
def incrementToken(self):
    if self.input.incrementToken():
        bytes = JArray('byte')("pos: %d" % (self.pos))
        self.payloadAttr.setPayload(BytesRef(bytes))
        if self.pos == 0 or self.i % 2 == 1:
            posIncr = 1
        else:
            posIncr = 0
        self.posIncrAttr.setPositionIncrement(posIncr)
        self.pos += posIncr
        self.i += 1
        return True
    return False
def index_document(writer, log):
    doc = Document()
    doc.add(SortedDocValuesField('host', BytesRef(log['host'])))
    doc.add(Field('host', log['host'], TextField.TYPE_STORED))
    doc.add(Field('client_user_name_if_available',
                  log['client_user_name_if_available'],
                  TextField.TYPE_STORED))
    date = handleDate(log['date_time'])
    doc.add(SortedDocValuesField('date_time', BytesRef(date)))
    doc.add(Field('date_time', date, StringField.TYPE_STORED))
    doc.add(SortedDocValuesField('method', BytesRef(log['method'])))
    # doc.add(FacetField('method', log['method']))
    doc.add(Field('method', log['method'], TextField.TYPE_STORED))
    doc.add(SortedDocValuesField('request_path', BytesRef(log['request_path'])))
    doc.add(Field('request_path', log['request_path'], TextField.TYPE_STORED))
    doc.add(SortedDocValuesField('protocol', BytesRef(log['protocol'])))
    doc.add(Field('protocol', log['protocol'], StringField.TYPE_STORED))
    doc.add(SortedDocValuesField('response_code',
                                 BytesRef(str(log['response_code']))))
    response_code = str(log['response_code']) if log['response_code'] else 'None'
    doc.add(Field('response_code_string', response_code, StringField.TYPE_STORED))
    doc.add(IntPoint('response_code', log['response_code']))
    doc.add(SortedDocValuesField('content_size', BytesRef(log['content_size'])))
    doc.add(IntPoint('content_size', log['content_size']))
    doc.add(Field('request_referrer', log['request_referrer'],
                  TextField.TYPE_NOT_STORED))
    doc.add(Field('request_user_agent', log['request_user_agent'],
                  TextField.TYPE_NOT_STORED))
    doc.add(Field('router_name', log['router_name'], TextField.TYPE_NOT_STORED))
    doc.add(Field('server_url', log['server_url'], TextField.TYPE_STORED))
    doc.add(SortedDocValuesField('request_duration',
                                 BytesRef(log['request_duration'])))
    doc.add(IntPoint('request_duration', log['request_duration']))
    location = str(log['location']) if log['location'] else 'None'
    location_ascii_free = unicodedata.normalize(
        'NFKD', location).encode('ascii', 'ignore').decode('ascii')
    doc.add(SortedDocValuesField('location', BytesRef(location_ascii_free)))
    # doc.add(Field('location_raw', location, StringField.TYPE_STORED))
    doc.add(Field('location', location, TextField.TYPE_STORED))
    writer.addDocument(doc)
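index_document() assumes an already-open IndexWriter. A rough setup sketch under standard PyLucene conventions follows; the index path 'access_logs_index' and the logs iterable are illustrative placeholders, not from the original source.

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import FSDirectory

lucene.initVM()
directory = FSDirectory.open(Paths.get('access_logs_index'))
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
for log in logs:  # `logs`: parsed access-log dicts with the keys used above
    index_document(writer, log)
writer.commit()
writer.close()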
def termsForField(self, field, prefix=None, limit=10, **kwargs):
    convert = lambda term: term.utf8ToString()
    terms = []
    termsEnum = MultiFields.getTerms(
        self._indexAndTaxonomy.searcher.getIndexReader(), field)
    if termsEnum is None:
        return terms
    iterator = termsEnum.iterator(None)
    if prefix:
        iterator.seekCeil(BytesRef(prefix))
        terms.append((iterator.docFreq(), convert(iterator.term())))
    bytesIterator = BytesRefIterator.cast_(iterator)
    try:
        while len(terms) < limit:
            term = convert(bytesIterator.next())
            if prefix and not term.startswith(prefix):
                break
            terms.append((iterator.docFreq(), term))
    except StopIteration:
        pass
    return terms
def _updateOaiRecord(self, identifier, setSpecs, metadataPrefixes,
                     delete=False, oldDoc=None, deleteInSets=None,
                     deleteInPrefixes=None, _overrideStamp=None):
    oldDoc = oldDoc or self._getDocument(identifier)
    doc, oldDeletedSets, oldDeletedPrefixes = self._getNewDocument(
        identifier, oldDoc=oldDoc)
    newStamp = _overrideStamp if self._importMode else self._newStamp()
    doc.add(LongPoint(STAMP_FIELD, int(newStamp)))
    doc.add(StoredField(STAMP_FIELD,
                        BytesRef(JArray('byte')(int_to_bytes(newStamp)))))
    doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, int(newStamp)))
    allMetadataPrefixes, allDeletedPrefixes = self._setMetadataPrefixes(
        doc=doc,
        metadataPrefixes=asSet(metadataPrefixes),
        delete=delete,
        deleteInPrefixes=asSet(deleteInPrefixes),
        oldDeletedPrefixes=oldDeletedPrefixes)
    allSets, allDeletedSets = self._setSets(
        doc=doc,
        setSpecs=setSpecs or [],
        delete=delete,
        deleteInSets=deleteInSets,
        oldDeletedSets=oldDeletedSets)
    if delete or (allDeletedSets and allSets == allDeletedSets) \
            or allMetadataPrefixes == allDeletedPrefixes:
        doc.add(StringField(TOMBSTONE_FIELD, TOMBSTONE_VALUE, Field.Store.YES))
    self._writer.updateDocument(Term(IDENTIFIER_FIELD, identifier), doc)
    self._latestModifications.add(str(identifier))
    self.do.signalOaiUpdate(metadataPrefixes=allMetadataPrefixes,
                            sets=allSets,
                            stamp=newStamp)
def testSetPosition(self):

    class _tokenizer(PythonTokenizer):

        def __init__(_self):
            super(_tokenizer, _self).__init__()
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0
            _self.posIncrAtt = _self.addAttribute(
                PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            if _self.i == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[_self.i])
            _self.offsetAtt.setOffset(_self.i, _self.i)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
            _self.i += 1
            return True

        def reset(_self):
            super(_tokenizer, _self).reset()
            _self.i = 0

    class _analyzer(PythonAnalyzer):

        def createComponents(_self, fieldName):
            return Analyzer.TokenStreamComponents(_tokenizer())

        def initReader(_self, fieldName, reader):
            return reader

    writer = self.getWriter(analyzer=_analyzer())
    d = Document()
    d.add(Field("field", "bogus", TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    reader = searcher.getIndexReader()

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
    pos.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, pos.nextPosition())

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
    pos.nextDoc()
    # second token should be at position 2
    self.assertEqual(2, pos.nextPosition())

    b = PhraseQuery.Builder()
    b.add(Term("field", "1"))
    b.add(Term("field", "2"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # same as previous, just specify positions explicitly.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 1)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # specifying correct positions should find the phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 2)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "3"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # phrase query would find it when correct positions are specified.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "4"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # phrase query should fail for a non-existing searched term
    # even if another searched term exists at the same searched position.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "9"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # multi-phrase query should succeed for a non-existing searched term
    # because another searched term exists at the same searched position.
    b = MultiPhraseQuery.Builder()
    b.add([Term("field", "3"), Term("field", "9")], 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "4"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))
def collectLeaf(_self, postings, position, term):
    if postings.getPayload() is not None:
        _self.payloads.append(BytesRef.deepCopyOf(postings.getPayload()))
def testPayloadsPos0(self):

    writer = self.getWriter(analyzer=TestPayloadAnalyzer())

    doc = Document()
    doc.add(Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
    writer.addDocument(doc)
    reader = writer.getReader()
    writer.close()

    tp = MultiFields.getTermPositionsEnum(reader,
                                          MultiFields.getLiveDocs(reader),
                                          "content", BytesRef("a"))
    count = 0
    self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)

    # "a" occurs 4 times
    self.assertEqual(4, tp.freq())

    expected = 0
    self.assertEqual(expected, tp.nextPosition())
    self.assertEqual(1, tp.nextPosition())
    self.assertEqual(3, tp.nextPosition())
    self.assertEqual(6, tp.nextPosition())

    # only one doc has "a"
    self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

    searcher = self.getSearcher(reader=reader)

    stq1 = SpanTermQuery(Term("content", "a"))
    stq2 = SpanTermQuery(Term("content", "k"))
    sqs = [stq1, stq2]
    snq = SpanNearQuery(sqs, 30, False)

    count = 0
    sawZero = False
    pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    while pspans.next():
        payloads = pspans.getPayload()
        sawZero |= pspans.start() == 0
        it = payloads.iterator()
        while it.hasNext():
            count += 1
            it.next()

    self.assertEqual(5, count)
    self.assert_(sawZero)

    spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    count = 0
    sawZero = False
    while spans.next():
        count += 1
        sawZero |= spans.start() == 0

    self.assertEqual(4, count)
    self.assert_(sawZero)

    sawZero = False
    psu = PayloadSpanUtil(searcher.getTopReaderContext())
    pls = psu.getPayloadsForQuery(snq)
    count = pls.size()
    it = pls.iterator()
    while it.hasNext():
        bytes = JArray('byte').cast_(it.next())
        s = bytes.string_
        sawZero |= s == "pos: 0"

    self.assertEqual(5, count)
    self.assert_(sawZero)
def index_docs(self, log_interval=100000):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS)

    t2_tk = FieldType()
    t2_tk.setStored(True)
    t2_tk.setTokenized(True)
    t2_tk.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t3_tkfp = FieldType()
    t3_tkfp.setStored(True)
    t3_tkfp.setTokenized(True)
    t3_tkfp.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    bin_dv_ft = FieldType()
    bin_dv_ft.setDocValuesType(DocValuesType.BINARY)
    bin_dv_ft.setStored(True)
    bin_dv_ft.setIndexOptions(IndexOptions.DOCS)

    num_paragraphs = 0
    num_empty_paragraphs = 0
    no_entity_paragraphs = 0
    num_avg_entities = 0

    # get docs from our sqlite db
    for d_idx, doc_id in enumerate(self.doc_ids):
        doc_p_ents = self.wiki_db.get_doc_p_ents(doc_id)
        assert doc_p_ents
        doc_dict = json.loads(doc_p_ents)

        paragraphs = doc_dict['paragraphs']
        for p_idx, p in enumerate(paragraphs):
            p_text = p['text']
            if len(p_text) == 0:
                num_empty_paragraphs += 1
                continue

            lucene_doc = Document()
            lucene_doc.add(Field("wiki_doc_id", doc_id, t3_tkfp))
            lucene_doc.add(Field("p_idx", str(p_idx), t1))
            lucene_doc.add(Field("content", p_text, t3_tkfp))

            # Named-entities
            ents = p['ents']
            ent_set = set()
            if len(ents) > 0:
                entity_idx_set = set()
                entity_type_id_set = set()
                entity_positions = list()
                for entity in ents:
                    # Filter number types
                    if entity['label_'] in self.spacy_number_types:
                        continue

                    assert 'label' in entity, 'doc_id={}'.format(doc_id)

                    num_avg_entities += 1

                    entity_key = entity['text'] + '\t' + entity['label_']

                    etypeidx = self.entitytype2idx.get(entity['label'])
                    if etypeidx is None:
                        etypeidx = len(self.entitytype2idx)
                        self.entitytype2idx[entity['label']] = etypeidx

                    eidx = self.entity2idx.get(entity_key)
                    if eidx is None:
                        eidx = len(self.entity2idx)
                        self.entity2idx[entity_key] = eidx
                        self.idx2entity[eidx] = entity_key
                        self.entity_dict[eidx] = \
                            (entity['text'], entity['label_'], etypeidx)

                    entity_idx_set.add(eidx)
                    entity_type_id_set.add(etypeidx)
                    entity_positions.append((eidx, etypeidx,
                                             entity['start_char'],
                                             entity['end_char']))
                    ent_set.add((eidx, etypeidx))

                if len(entity_idx_set) > 0:
                    lucene_doc.add(
                        Field("entity_id",
                              '\t'.join([str(eidx)
                                         for eidx in entity_idx_set]),
                              t2_tk))
                    lucene_doc.add(
                        Field("entity_type_id",
                              '\t'.join([str(etid)
                                         for etid in entity_type_id_set]),
                              t2_tk))
                    positions = \
                        '\t'.join(['{},{},{},{}'
                                   .format(eidx, etidx, start_char, end_char)
                                   for eidx, etidx, start_char, end_char
                                   in entity_positions])
                    lucene_doc.add(Field("entity_position", positions, t1))
            else:
                no_entity_paragraphs += 1

            if self.num_entities_max < len(ent_set):
                self.num_entities_max = len(ent_set)

            binary = get_binary4dvs(ent_set, write_type=self.write_type)
            # https://lucene.apache.org/pylucene/jcc/features.html
            br = BytesRef(lucene.JArray('byte')(binary))
            lucene_doc.add(BinaryDocValuesField("eqa_bin", br))

            # # debug
            # lucene_doc.add(StoredField("eqa_bin_store", br))
            # lucene_doc.add(StoredField("bin_raw", binary.hex()))

            self.writer.addDocument(lucene_doc)
            num_paragraphs += 1

            if num_paragraphs % log_interval == 0:
                print(datetime.now(), 'Added #paragraphs', num_paragraphs,
                      '#wikidocs', d_idx + 1,
                      '#entities', len(self.entity_dict))

    print('#paragraphs', num_paragraphs)
    print('#no_entity_paragraphs', no_entity_paragraphs,
          '{:.2f}%'.format(100 * no_entity_paragraphs / num_paragraphs))
    print('avg num of entities {:.2f}'
          .format(num_avg_entities / (num_paragraphs - no_entity_paragraphs)))
    if num_empty_paragraphs > 0:
        print('#skipped_empty_paragraphs', num_empty_paragraphs)

    print('\nAdding entity docs..')
    for e_dict_idx, entity_idx in enumerate(self.entity_dict):
        # skip UNK
        if entity_idx == self.entity2idx['UNK']:
            continue

        ename, etype, etype_idx = self.entity_dict[entity_idx]

        entity_doc = Document()
        entity_doc.add(Field("name", ename, t2_tk))
        entity_doc.add(Field("type", etype, t1))
        entity_doc.add(Field("eid", str(entity_idx), t1))
        entity_doc.add(Field("etid", str(etype_idx), t1))
        self.writer.addDocument(entity_doc)

        if (e_dict_idx + 1) % (10 * log_interval) == 0:
            print(datetime.now(), '#entities', e_dict_idx + 1)

    print('#entities', len(self.entity2idx) - 1)
    print('#entities_max', self.num_entities_max)

    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    self.writer.commit()
    self.writer.close()
    ticker.tick = False
    print('done')
def stats_tooltip(word, doc_id, reader):
    # content statistics
    term = Term('content', tokenize(word))
    term_text = unicode(term).replace('content:', '')
    doc_count = reader.docFreq(term)  # in how many docs the term appears
    total_term_count = reader.totalTermFreq(term)  # how many times the term appears in any doc
    n_docs = reader.getDocCount('content')  # total number of docs

    postings = MultiFields.getTermDocsEnum(reader, 'content', BytesRef(term_text))
    while postings.docID() != doc_id:  # this is bad
        postings.nextDoc()
    term_count = postings.freq()  # how many times the term appears in this doc

    similarity = ClassicSimilarity()
    tf = similarity.tf(float(term_count))  # sqrt(term_freq)
    # whether the term is common or rare among all the docs
    idf = similarity.idf(long(doc_count), long(n_docs))  # log((n_docs+1)/(doc_count+1)) + 1

    # abstract statistics
    abstract_term = Term('abstract', tokenize(word))
    abstract_doc_count = reader.docFreq(abstract_term)
    abstract_total_term_count = reader.totalTermFreq(abstract_term)
    a_idf = similarity.idf(long(abstract_doc_count), long(n_docs))

    abstract_postings = MultiFields.getTermDocsEnum(reader, 'abstract', BytesRef(term_text))
    if not abstract_postings:
        # the term appears in no document's abstract
        abstract_term_count = 0
        a_tf = 1
    else:
        while abstract_postings.docID() != doc_id:  # this is bad
            if abstract_postings.nextDoc() == abstract_postings.NO_MORE_DOCS:
                # it does not appear in this document's abstract
                abstract_term_count = 0
                a_tf = 1
                break
        else:  # no break, it does appear in this document's abstract
            abstract_term_count = abstract_postings.freq()
            a_tf = similarity.tf(float(abstract_term_count))

    content_score = tf * idf ** 2 * CONTENT_BOOST
    abstract_score = a_tf * a_idf ** 2 * ABSTRACT_BOOST

    # mixing concerns like nobody's business
    return '''
    <div class="popup">
      <div class="term">{}</div>
      <table>
        <tr> <th> </th> <th>abstr</th> <th>body</th> <th>total</th> </tr>
        <tr><td>this doc</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
        <tr><td>TF</td> <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
        <tr><td>nr docs</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
        <tr><td>IDF</td> <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
        <tr><td>score</td> <td>{:.2g}</td> <td>{:.2g}</td> <td><b>{:.2g}</b></td> </tr>
        <tr><td>all docs</td> <td>{}</td> <td>{}</td> <td>{}</td> </tr>
      </table>
      <div class="total-docs">{}</div>
    </div>
    '''.format(
        term_text,
        abstract_term_count, term_count - abstract_term_count, term_count,
        a_tf, tf, a_tf * tf,
        abstract_doc_count, doc_count, doc_count,
        a_idf, idf, a_idf * idf,
        abstract_score, content_score, abstract_score * content_score,
        abstract_total_term_count,
        total_term_count - abstract_total_term_count,
        total_term_count,
        n_docs)
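A usage sketch, under the assumption that reader is a DirectoryReader over an index with 'content' and 'abstract' fields and that tokenize, CONTENT_BOOST and ABSTRACT_BOOST are defined in the surrounding module (they are referenced but not shown above); the index path 'papers_index' and the doc_id value are illustrative, not from the original source.

from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory

reader = DirectoryReader.open(FSDirectory.open(Paths.get('papers_index')))
html = stats_tooltip('retrieval', doc_id=42, reader=reader)  # returns the HTML popup snippet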