def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if not lucene_vm_flag:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # self.analyzer = StandardAnalyzer()
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    if similarity == 'BM25':
        self.searcher.setSimilarity(BM25Similarity())

    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used:
        separator = '/' if '/' in self.index_dir else '\\'
        index_name = self.index_dir.split(separator)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
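# Hedged usage sketch: assuming this __init__ belongs to the Lucene_Object
# helper constructed later in this file, and that an index already exists at
# the given path. mongoObj is only required when the bigram cache is enabled.
# helper = Lucene_Object('mmapDirectory/dbpedia_v2_FSDM3', similarity='BM25',
#                        lucene_vm_flag=False)
# cached = Lucene_Object('mmapDirectory/dbpedia_v2_FSDM3', 'BM25',
#                        lucene_vm_flag=True,
#                        is_bigram_cache_used=True, mongoObj=mongo)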
def build(self, index):
    writer = self.getWriter(directory=index.index,
                            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))
    seed(101)
    for d in xrange(self.minId, self.maxId + 1):
        doc = Document()
        doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
        if index.allowNegativeRandomInts:
            r = randint(~self.MAX_INT, self.MAX_INT)
        else:
            r = randint(0, self.MAX_INT)
        if index.maxR < r:
            index.maxR = r
        if r < index.minR:
            index.minR = r
        doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
        doc.add(Field("body", "body", StringField.TYPE_STORED))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if not lucene_vm_flag:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searchers = []
    self.searchers.append(IndexSearcher(self.reader))
    if similarity == 'BM25':
        self.searchers[0].setSimilarity(BM25Similarity())

    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used:
        separator = '/' if '/' in self.index_dir else '\\'
        index_name = self.index_dir.split(separator)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache_with_wikipedia']
        else:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache']
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
        else:
            # fixed: the non-Windows branch used a backslash in the path
            cmd = 'cp -f *.py %s/code_files' % (LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def testNot(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer())

    d1 = Document()
    d1.add(Field("field", "a b", TextField.TYPE_STORED))

    writer.addDocument(d1)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    query = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")
    topDocs = searcher.search(query, 50)
    self.assertEqual(0, topDocs.totalHits)
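# The "a NOT b" string parsed above can also be built programmatically.
# A minimal sketch, assuming a PyLucene version with the BooleanQuery.Builder
# API and a running JVM:
from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery

builder = BooleanQuery.Builder()
builder.add(TermQuery(Term("field", "a")), BooleanClause.Occur.MUST)
builder.add(TermQuery(Term("field", "b")), BooleanClause.Occur.MUST_NOT)
query = builder.build()  # matches documents containing "a" but not "b"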
def running(command):
    command = unicode(command)
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    return run(searcher, analyzer, command)
def __init__(self, store_dir):
    initVM()
    directory = SimpleFSDirectory(File(store_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    print 'loaded index: %s' % store_dir
    self.analyzer = {}
    self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)
def vagueSearch(command, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run vague search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    results, swxc_res = run(searcher, analyzer, command, urlclick)
    del searcher
    return results, swxc_res
def get_search_func():
    jieba.initialize()
    vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(LUCENE_INDEX_DIR))))
    search = search_func_factory(analyzer=analyzer,
                                 searcher=searcher,
                                 vm_env=vm_env)
    return search
def setUp(self):
    super(TestRegexQuery, self).setUp()

    writer = self.getWriter(analyzer=SimpleAnalyzer(self.TEST_VERSION))
    doc = Document()
    doc.add(Field(self.FN, "the quick brown fox jumps over the lazy dog",
                  TextField.TYPE_NOT_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    self.searcher = self.getSearcher()
def run(self):
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        IndexFiles(xmlpath=self.xmlpath,
                   storeDir=self.indexpath,
                   analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                   ItemClass=self.ItemClass)
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
def testSimple(self):
    a = SimpleAnalyzer()

    self._assertAnalyzesTo(a, "foo bar FOO BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "foo bar . FOO <> BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                           ["foo", "bar", "foo", "bar"])
    self._assertAnalyzesTo(a, "U.S.A.", ["u", "s", "a"])
    self._assertAnalyzesTo(a, "C++", ["c"])
    self._assertAnalyzesTo(a, "B2B", ["b", "b"])
    self._assertAnalyzesTo(a, "2B", ["b"])
    self._assertAnalyzesTo(a, "\"QUOTED\" word", ["quoted", "word"])
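# Standalone sketch of what the assertions above exercise: SimpleAnalyzer
# splits on non-letter characters and lowercases everything. Assumes a
# PyLucene version with no-argument analyzer constructors.
import lucene
from java.io import StringReader
from org.apache.lucene.analysis.core import SimpleAnalyzer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

if lucene.getVMEnv() is None:
    lucene.initVM()

def simple_tokens(text):
    # run `text` through SimpleAnalyzer and collect the emitted terms
    stream = SimpleAnalyzer().tokenStream("f", StringReader(text))
    term = stream.getAttribute(CharTermAttribute.class_)
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(term.toString())
    stream.end()
    stream.close()
    return tokens

# simple_tokens("U.S.A.") -> ["u", "s", "a"]
# simple_tokens("B2B")    -> ["b", "b"]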
def _index_files(storeDir, indexFile):
    jieba.initialize()
    store = SimpleFSDirectory(File(storeDir))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    _index_docs(indexFile, writer)

    print('commit index')
    writer.commit()
    writer.close()
    print('done')
def testDocBoost(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

    f1 = Field("field", "word", TextField.TYPE_STORED)
    f2 = Field("field", "word", TextField.TYPE_STORED)
    f2.setBoost(2.0)

    d1 = Document()
    d2 = Document()

    d1.add(f1)  # boost = 1
    d2.add(f2)  # boost = 2

    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.close()

    scores = [0.0] * 2

    class collector(PythonCollector):
        def __init__(_self, scores):
            super(collector, _self).__init__()
            _self.scores = scores
            _self.base = 0

        def collect(_self, doc, score):
            _self.scores[doc + _self.base] = score

        def setNextReader(_self, context):
            _self.base = context.docBase

        def acceptsDocsOutOfOrder(_self):
            return True

    self.getSearcher().search(TermQuery(Term("field", "word")),
                              collector(scores))

    lastScore = 0.0
    for score in scores:
        self.assert_(score > lastScore)
        lastScore = score
def __init__(self, root, storeDir):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = SimpleFSDirectory(File(storeDir))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(root, writer)
    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def testPerField(self):
    perField = HashMap()
    perField.put("special", SimpleAnalyzer())
    analyzer = PerFieldAnalyzerWrapper(WhitespaceAnalyzer(), perField)

    text = "Qwerty"

    tokenStream = analyzer.tokenStream("field", StringReader(text))
    tokenStream.reset()
    termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

    self.assertTrue(tokenStream.incrementToken())
    self.assertEqual("Qwerty", termAtt.toString(),
                     "WhitespaceAnalyzer does not lowercase")

    tokenStream = analyzer.tokenStream("special", StringReader(text))
    tokenStream.reset()
    termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

    self.assertTrue(tokenStream.incrementToken())
    self.assertEqual("qwerty", termAtt.toString(),
                     "SimpleAnalyzer lowercases")
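# A per-field wrapper like the one above drops in wherever a plain Analyzer
# is expected. A minimal sketch of using it at index time, assuming a
# PyLucene version with the single-argument IndexWriterConfig constructor
# and an already-open Directory in `store`:
# config = IndexWriterConfig(analyzer)  # "special" analyzed by SimpleAnalyzer
# writer = IndexWriter(store, config)   # all other fields by WhitespaceAnalyzer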
def __init__(self, root, storeDir, f):
    self.filedir = f
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = SimpleFSDirectory(File(storeDir))
    # analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(root, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def render_result(request, template, result, Search, index):
    '''
    Render 'result' page for website and image search.

    Input:
        `request`: `request` variable received
        `template`: template HTML file
        `result`: relative path of 'result' page
        `Search`: search class used to search the index
        `index`: directory storing the Lucene index
    '''
    if request.method == "POST":
        keyword = request.form['keyword']
        return redirect(url_for(result, keyword=keyword))

    vm_env.attachCurrentThread()
    engine = Search(index, SimpleAnalyzer(), lambda x: ' '.join(jieba.cut(x)))
    keyword = request.args.get('keyword')
    command = {"type": result, "keyword": keyword}
    if command not in search_history:
        search_history.append(command)
    results = engine.search_command(keyword)
    return render_template(template, keyword=keyword, results=results)
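# Hedged wiring sketch: how a Flask view would delegate to render_result.
# The app object, route, template file, and WebsiteSearch class are all
# hypothetical names, not taken from the snippet above.
# from flask import Flask, request
#
# app = Flask(__name__)
#
# @app.route('/result', methods=['GET', 'POST'])
# def result():
#     return render_result(request, 'result.html', 'result',
#                          WebsiteSearch, 'index')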
def search(self, command, num, use_clf):
    print("log1", command, num, use_clf)
    self.vm.attachCurrentThread()
    searcher = self.searcher
    print("command", command)

    if not self.reT.search(command):
        # free-text command: no explicit field:value pattern present
        if use_clf:
            # let the classifier pick which fields to search
            print("sentence feed to classify", command)
            probs = self.classifier.classify(command)
            command = self.text.seg(command)
            command = self.text.remove_stop_word(command)
            key = sorted(range(len(self.keys)), key=lambda i: probs[i],
                         reverse=True)
            key_use = [key[0]]
            for i in key[1:]:
                if probs[i] > 0.3 or probs[i] - probs[key[0]] > -0.1:
                    key_use.append(i)
            command_final = self.keys[key_use[0]] + ":(" + command + ")"
            for i in key_use[1:]:
                command_final = "%s OR %s:(%s)" % (command_final,
                                                   self.keys[i], command)
            command = command_final
            # e.g. command = "WBSB:浙江省 WBSB:苍南县 WBSB:人民法院"
            print(command)
            query = QueryParser("PubDate", WhitespaceAnalyzer()).parse(command)
            scoreDocs = searcher.search(query, num).scoreDocs

            results = []
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)

            probs_tmp = ""
            for key, prob in zip(self.keys, probs):
                probs_tmp += "%s:%2f " % (key, prob)
            probs = probs_tmp
            key_use_tmp = ""
            for i in key_use:
                key_use_tmp += "%s " % (self.keys[i])
            key_use = key_use_tmp
            return results, probs, key_use
        else:
            # no classifier: search every field with equal weight
            command = self.text.seg(command)
            command = self.text.remove_stop_word(command)
            fields = self.keys
            flags = [BooleanClause.Occur.SHOULD] * len(fields)
            query = MultiFieldQueryParser.parse(command, fields, flags,
                                                WhitespaceAnalyzer())
            print(query)
            scoreDocs = searcher.search(query, num).scoreDocs

            results = []
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)
            return results, [None] * len(self.keys), self.keys
    else:
        # the command already contains field:value patterns; extract them
        print('command', command)
        ps = self.reT.findall(command)
        print(ps)
        rem = self.reT.sub(' ', command)  # fixed: re.sub takes (repl, string)
        print(rem)

        q_t = []
        key_use = []
        for i in ps:
            f = i[1]
            data = i[4]
            rela = i[5]
            key_use.append(f)
            q_t.append(f)
            q_t.append(':')
            seg_t = self.text.seg(data)
            seg_t = self.text.remove_stop_word(seg_t)
            dash_t = self.text.replace_white_space_with_dash(seg_t)
            q_t.append(dash_t)
            if rela:
                q_t.append(" %s " % rela)

        print('tract pattern', q_t)
        q_f = "".join(q_t)
        print("final q", q_f)
        query = QueryParser("PubDate", SimpleAnalyzer()).parse(q_f)
        print("query", query)
        scoreDocs = searcher.search(query, num).scoreDocs

        results = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            result = dict()
            for i in self.keys:
                result[i] = doc.get(i)
            result['id'] = doc.get('id')
            results.append(result)
        return results, [None] * len(key_use), key_use
'''
Generate a `Document` according to the parameters.

Input:
    `img`: dict containing a single image info
Output:
    `Document` with the fields initialized
'''
doc = Document()
doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
doc.add(TextField("description", img['description'], Field.Store.YES))
doc.add(StringField("url", img['url'], Field.Store.YES))
doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
return doc


if __name__ == '__main__':
    # html_dir = sys.argv[1]
    store_dir = 'index'  # sys.argv[2]
    lucene.initVM()
    print('lucene {}'.format(lucene.VERSION))
    start = datetime.now()
    try:
        # ExtractImgs(html_dir)
        IndexImgs(store_dir, SimpleAnalyzer())
        end = datetime.now()
        print(end - start)
    except Exception as e:
        print("Failed: {}".format(e))
        raise e
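# Hedged usage sketch: the builder function's own name is cut off above, so
# it is called get_img_document here (hypothetical). Shows the dict shape
# the docstring describes, with assumed example values:
# img = {
#     'img_url': 'http://example.com/a.jpg',
#     'description': 'a sample image',
#     'url': 'http://example.com/page.html',
#     'url_title': 'Sample page',
# }
# doc = get_img_document(img)
# writer.addDocument(doc)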
doc.add(TextField("title", title, Field.Store.YES)) doc.add(TextField("url", url, Field.Store.YES)) if len(contents) > 0: # doc.add(Field("contents", contents, self.content_type)) doc.add(TextField("contents", contents, Field.Store.YES)) else: print("Warning: No content in {}".format(filename)) return doc if __name__ == '__main__': html_dir = sys.argv[1] doc_dir = sys.argv[2] store_dir = sys.argv[3] lucene.initVM() print('lucene {}'.format(lucene.VERSION)) start = datetime.now() try: # IndexFiles('test_folder', 'index', StandardAnalyzer()) ConvertFiles(html_dir, doc_dir) # Use `SimpleAnalyzer` as `Analyzer` IndexFiles(html_dir, doc_dir, store_dir, SimpleAnalyzer()) end = datetime.now() print(end - start) except Exception as e: print("Failed: {}".format(e)) raise e
doc.add(StringField("name", doc_info['name'], Field.Store.YES)) doc.add(StringField("path", doc_info['path'], Field.Store.YES)) doc.add(StringField("title", doc_info['title'], Field.Store.YES)) doc.add(StringField("url", doc_info['url'], Field.Store.YES)) doc.add(TextField("site", doc_info['site'], Field.Store.YES)) if len(contents) > 0: doc.add(TextField("contents", contents, Field.Store.YES)) else: print("Warning: No content in {}".format(doc_info['name'])) return doc if __name__ == '__main__': doc_dir = sys.argv[1] store_dir = sys.argv[2] lucene.initVM() print('lucene {}'.format(lucene.VERSION)) start = datetime.now() try: # fn = 'pg17565.txt' # IndexUpdate('testfolder', 'index', StandardAnalyzer()) IndexUpdate(doc_dir, store_dir, SimpleAnalyzer()) end = datetime.now() print(end - start) except Exception as e: print("Failed: {}".format(e)) raise e
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms taken
    from those contents are boosted by the rating the user gave. A filter
    clause is added to the query so that only candidate items are
    considered.

    Args:
        positive_rated_document_list: List of contents that the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended. You can only
            specify the number, not a specific item for which to compute
            the prediction
        items_directory: Directory where the items are stored
        candidate_list: Content ids eligible for recommendation

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the
            recommendations for the user
    """
    BooleanQuery.setMaxClauseCount(2000000)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(items_directory))))
    if self.__classic_similarity:
        searcher.setSimilarity(ClassicSimilarity())

    field_list = searcher.doc(positive_rated_document_list[0]).getFields()
    user_fields = {}
    field_parsers = {}
    analyzer = SimpleAnalyzer()
    for field in field_list:
        if field.name() == 'content_id':
            continue
        user_fields[field.name()] = field.stringValue()
        field_parsers[field.name()] = QueryParser(field.name(), analyzer)

    positive_rated_document_list.remove(positive_rated_document_list[0])

    for doc_id in positive_rated_document_list:
        # fixed: fetch each liked document's fields, instead of
        # re-accumulating the first document's text on every iteration
        field_list = searcher.doc(doc_id).getFields()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] += field.stringValue()

    logger.info("Building query")

    query_builder = BooleanQuery.Builder()
    for score in scores:
        for field_name in user_fields.keys():
            if field_name == 'content_id':
                continue
            field_parsers[field_name].setDefaultOperator(QueryParser.Operator.OR)
            field_query = field_parsers[field_name].escape(user_fields[field_name])
            field_query = field_parsers[field_name].parse(field_query)
            field_query = BoostQuery(field_query, score)
            query_builder.add(field_query, BooleanClause.Occur.SHOULD)

    if candidate_list is not None:
        id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                      for content_id in candidate_list)
        id_query = QueryParser("testo_libero",
                               KeywordAnalyzer()).parse(id_query_string)
        query_builder.add(id_query, BooleanClause.Occur.MUST)

    query = query_builder.build()
    docs_to_search = len(positive_rated_document_list) + recs_number
    scoreDocs = searcher.search(query, docs_to_search).scoreDocs

    logger.info("Building score frame to return")

    recorded_items = 0
    columns = ['to_id', 'rating']
    score_frame = pd.DataFrame(columns=columns)
    for scoreDoc in scoreDocs:
        if recorded_items >= recs_number:
            break
        if scoreDoc.doc not in positive_rated_document_list:
            doc = searcher.doc(scoreDoc.doc)
            item_id = doc.getField("content_id").stringValue()
            recorded_items += 1
            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                          columns=columns)
            ])

    return score_frame
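# Hedged usage sketch: the enclosing ranker class and its public entry point
# are not shown above, so every name here is hypothetical. Note the private
# method would normally be reached through the class's public API, not via
# name mangling as below.
# ranker = IndexRanker(classic_similarity=True)
# frame = ranker._IndexRanker__recs_query(
#     positive_rated_document_list=[3, 17, 42],  # Lucene doc ids the user liked
#     scores=[5.0, 4.0, 4.5],                    # ratings aligned with the ids
#     recs_number=10,
#     items_directory='index/items',
#     candidate_list=['tt0114709', 'tt0113497'])
# frame.sort_values('rating', ascending=False)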
def testSimilarity(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                            similarity=SimpleSimilarity())

    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))

    d2 = Document()
    d2.add(Field("field", "a c b", TextField.TYPE_STORED))

    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    class collector1(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(TermQuery(b), collector1())

    builder = BooleanQuery.Builder()
    builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)
    bq = builder.build()

    class collector2(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def doSetNextReader(_self, context):
            _self.base = context.docBase
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(bq, collector2())

    pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()])

    class collector3(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector3())

    pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()])

    class collector4(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(0.5, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector4())
        querys.add(query, BooleanClause.Occur.MUST)
    return self.searcher.search(querys.build(), 50).scoreDocs

def output(self, score_docs):
    '''
    Output the search results in terminal.

    Input:
        `score_docs`: search results
    Output:
        None
    '''
    print("{} total matching documents.".format(len(score_docs)))
    for score_doc in score_docs:
        doc = self.searcher.doc(score_doc.doc)
        print('path: {}, title: {}, url: {}, name: {}'.format(
            doc.get('path'), doc.get('title'), doc.get('url'),
            doc.get('name')))
    print()


if __name__ == '__main__':
    index_dir = sys.argv[1]
    lucene.initVM()
    print('lucene', lucene.VERSION)
    # SearchFiles('index', StandardAnalyzer())
    # Pass the Jieba function as a parameter for generalized preprocessing
    SearchFiles(index_dir, SimpleAnalyzer(), lambda x: ' '.join(jieba.cut(x)))
def __init__(self, folder='gushiwen_index'):
    self.searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(folder))))
    self.analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        split_result = line.split()
        if len(split_result) <= 1:
            dic[split_result[0]] = "no name"
        else:
            dic[split_result[1]] = split_result[0]
    myfile.close()
    return dic"""


if __name__ == '__main__':
    """
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        """
        base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(sys.argv[1], os.path.join(base_dir, INDEX_DIR),
                   StandardAnalyzer(Version.LUCENE_CURRENT))
        """
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        IndexFiles("Music", INDEX_DIR, analyzer)
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
def testSimple(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer())

    doc = Document()
    field = Field("foo", "", TextField.TYPE_NOT_STORED)
    doc.add(field)
    dvField = FloatDocValuesField("foo_boost", 0.0)
    doc.add(dvField)
    field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
    doc.add(field2)

    field.setStringValue("quick brown fox")
    field2.setStringValue("quick brown fox")
    dvField.setFloatValue(2.0)  # boost x2
    writer.addDocument(doc)

    field.setStringValue("jumps over lazy brown dog")
    field2.setStringValue("jumps over lazy brown dog")
    dvField.setFloatValue(4.0)  # boost x4
    writer.addDocument(doc)

    reader = writer.getReader()
    writer.close()

    # no boosting
    searcher1 = self.getSearcher(reader=reader)
    base = searcher1.getSimilarity(True)
    # boosting
    searcher2 = self.getSearcher(reader=reader)

    class _similarity(PythonPerFieldSimilarityWrapper):
        def __init__(_self, base):
            super(_similarity, _self).__init__()
            _self.base = base
            _self.fooSim = BoostingSimilarity(base, "foo_boost")

        def get(_self, field):
            return _self.fooSim if "foo" == field else _self.base

    searcher2.setSimilarity(_similarity(base))

    # in this case, we searched on field "foo". first document should have
    # 2x the score.
    tq = TermQuery(Term("foo", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    # fixed: SCORE_EPSILON is a tolerance, so compare approximately rather
    # than passing it as an assertEqual message
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score * 2.0,
                           delta=SCORE_EPSILON)

    # this query matches only the second document, which should have 4x
    # the score.
    tq = TermQuery(Term("foo", "jumps"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score * 4.0,
                           delta=SCORE_EPSILON)

    # search on field bar just for kicks; nothing should happen, since we
    # set up our sim provider to only use foo_boost for field foo.
    tq = TermQuery(Term("bar", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score,
                           delta=SCORE_EPSILON)

    reader.close()
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command: python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)
    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}
    pkl_filename = ('category_dbpedia_corpus_top%d_fsdm3.pkl.gz'
                    % (NUMBER_TOP_CATEGORY))

    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    if os.path.exists(pkl_filename):
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:
        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            if doc_entity is None:
                continue
            for f in ['names', 'attributes', 'categories', 'similar_entities',
                      'related_entities', 'catchall']:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' + f]
            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat + '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)

    client.close()

    # begin writing the data into the index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = ('mmapDirectory\\category_corpus_dbpedia201510_top'
                            + str(NUMBER_TOP_CATEGORY) + '_fsdm3')
    else:
        LUCENE_INDEX_DIR = ('%s/mmapDirectory/category_corpus_dbpedia201510_top' % (cwd)
                            + str(NUMBER_TOP_CATEGORY) + '_fsdm3')

    # backup code files
    if system_flag == 'Windows':
        cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
    else:
        cmd = 'cp *.py %s/code_files' % (LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer, True),
                                 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')
        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]
        for f in ['names', 'attributes', 'categories', 'similar_entities',
                  'related_entities', 'catchall']:
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        # need to calculate corpus average length
        addDoc(w, data)
    w.close()
    print('max article num=%d' % (max_article_num))
def testSimilarity(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                            similarity=SimpleSimilarity())

    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))

    d2 = Document()
    d2.add(Field("field", "a b c", TextField.TYPE_STORED))

    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    class collector1(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(TermQuery(b), collector1())

    bq = BooleanQuery()
    bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

    class collector2(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def setNextReader(_self, context):
            _self.base = context.docBase
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(bq, collector2())

    pq = PhraseQuery()
    pq.add(a)
    pq.add(c)

    class collector3(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector3())

    pq.setSlop(2)

    class collector4(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(2.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector4())