def retrieveDocs(q): lucene.initVM() analyzer = StandardAnalyzer(Version.LUCENE_30) reader = IndexReader.open(SimpleFSDirectory(File("index/"))) searcher = IndexSearcher(reader) query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q) MAX = 1000 hits = searcher.search(query, MAX) nonDiverse = [] docsToScores = {} #create a list of html files with relevant websites rQ = [] print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) for hit in hits.scoreDocs: print hit.score, hit.doc, hit.toString() doc = searcher.doc(hit.doc) print doc.get("text").encode("utf-8") #print(new_urls[str(hit.doc)]) result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString() if (len(nonDiverse) < 10): nonDiverse.append(new_urls[str(hit.doc)]) #find the document that corresponds to the html website and append to a list for min distance website = new_urls[str(hit.doc)] #html_files numbers of the hit websites added to rQ rQ.append(inv_map[website]) docsToScores[int(inv_map[website])] = hit.score print(inv_map[website]) return docsToScores, rQ, nonDiverse
def luceneIndexer(contents): lucene.initVM() INDEXIDR= settings.INDEX_DIR indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in contents: print"Indexing: ", tfile document= Document() content= tfile.getvalue() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print"Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def fn(): env.attachCurrentThread() start = datetime.now() IndexFiles(sys.argv[1], "index", StandardAnalyzer(Version.LUCENE_CURRENT)) end = datetime.now() print end - start
def post(self): q= self.get_argument("query") # self.write(key) # def query(query): # query = self.get_argument("q") lucene.initVM() indexDir = "index" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_30) searcher = IndexSearcher(dir) query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q) MAX = 10 hits = searcher.search(query, MAX) print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) items = [] rQ = [] #for key, value in doc_urls.iteritems() # print (key, value) for hit in hits.scoreDocs: #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]}) print hit.score, hit.doc, hit.toString() print(len(doc_urls)) items.append(doc_urls[str(hit.doc)]) doc = searcher.doc(hit.doc) print(hit.doc) self.render("index.html", title="Results", items=items, query=q)
def createIndex():
    """Build a Lucene index at /Tmp/REMOVEME.index-dir from html_files/*.

    Each file is parsed with parsehtml() and stored as one document with a
    single analyzed "text" field.
    """
    # initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    # get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    try:
        src_dir = 'html_files'
        for name in os.listdir(src_dir):
            path = os.path.join(src_dir, name)
            with open(path, 'r') as myfile:
                data = myfile.read()
            # NOTE(review): parsehtml is defined elsewhere; assumed to return
            # (text, errors) -- errors are currently ignored
            document, errors = parsehtml(data)
            doc = Document()
            doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
    finally:
        # fix: always release the write lock; also dropped the unused
        # counter `i` that was incremented but never read
        writer.close()
def testPrefixQuery(self): parser = QueryParser(Version.LUCENE_CURRENT, "category", StandardAnalyzer(Version.LUCENE_CURRENT)) parser.setLowercaseExpandedTerms(False) print parser.parse("/Computers/technology*").toString("category")
def main(cls, argv):
    """Highlight occurrences of 'ipsum' in cls.text and emit a small HTML page."""
    scorer = QueryScorer(TermQuery(Term("f", "ipsum")))
    markup = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(markup, scorer)
    highlighter.setTextFragmenter(SimpleFragmenter(50))
    stream = StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "f", StringReader(cls.text))
    best = highlighter.getBestFragments(stream, cls.text, 5, "...")
    # Emit the page piece by piece, exactly as before.
    for chunk in ("<html>",
                  "<style>\n",
                  ".highlight {\n",
                  " background: yellow\n",
                  "}\n",
                  "</style>",
                  "<body>",
                  best,
                  "</body></html>\n"):
        stdout.write(chunk)
    stdout.flush()
def __init__(self):
    # Borg pattern: every instance shares the same __dict__, so attributes
    # set on any instance are visible on all of them.
    self.__dict__ = self.__shared_state
    if not self.__shared_state:
        # First instantiation only: start the JVM once and open the index.
        # (Subsequent instances find the shared state populated and skip this.)
        self.jccvm = lucene.initVM()
        self.index = SimpleFSDirectory(
            lucene.File(settings.lucene_index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
def luceneRetriver(query): lucene.initVM() indir = SimpleFSDirectory(File(INDEXDIR)) lucene_analyzer = StandardAnalyzer(Version.LUCENE_30) lucene_searcher = IndexSearcher(indir) my_query= QueryParser(Version.LUCENE_30,"text",\ lucene_analyzer).parse(query) MAX = 1000 total_hits = lucene_searcher.search(my_query, MAX) print "Hits: ", total_hits.totalHits for hit in total_hits.scoreDocs: print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString( ) doc = lucene_searcher.doc(hit.doc) print doc.get("text").encode("utf-8")
def configure_lucene(): f = open('clique.txt', 'r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs( ) print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t', '') line = line.replace('\r', '') line = line.replace('\n', '') line = line.replace('^', '') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % ( writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs( ) writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs( ) print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def names(): lst = [] search = "spax" #request.form['product'] lucene.initVM() dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_30) searcher = IndexSearcher(dir) query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search) MAX = 1000 hits = searcher.search(query, MAX) print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) for hit in hits.scoreDocs: if hit.score >= 1: print hit.score, hit.doc, hit.toString() doc = searcher.doc(hit.doc) print doc.get("text").encode("utf-8") items = doc.get("text").encode("utf-8").split(',') for item in items: if item == search: pass elif item not in lst: lst.append(item) #print lst data = {"products": lst} if request.method == 'POST': return jsonify(data) else: return jsonify(data)
def index_files(files, index_directory):
    """Create a fresh Lucene index at index_directory and feed every entry
    of `files` through parse_file()."""
    lucene.initVM()
    store = SimpleFSDirectory(File(index_directory))
    writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_30), True,
                         IndexWriter.MaxFieldLength(512))
    for path in files:
        parse_file(path, writer)
    writer.optimize()
    writer.close()
def open(self, name, txn, **kwds):
    """Open the container; when asked to create, bootstrap an empty index
    by opening and immediately closing an IndexWriter."""
    super(IndexContainer, self).open(name, txn, **kwds)
    if not kwds.get('create', False):
        return
    directory = self.getDirectory()
    # create=True writes the empty index structure to the directory
    IndexWriter(directory, StandardAnalyzer(), True).close()
    directory.close()
def extractFeatureQueryWords(query): import string from lucene import Document, TermQuery, Term # create analyzer aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) try: file = open('../features.txt', 'r') featurelist = [] for line in file.readlines(): words_in_line = line.split() featurelist += words_in_line querywordlist = query.split() featureQueryList = [] productQueryList = [] for word in querywordlist: if word in featurelist: featureQueryList.append(word) else: # create parser for word aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer) aux_query = aux_parser.parse(word) scoreDocs = searcher.search(aux_query, 50).scoreDocs if scoreDocs: productQueryList.append(word) featureQuery = "" if featureQueryList: featureQuery = "(" for i in range(len(featureQueryList)): if i == len(featureQueryList) - 1: featureQuery += featureQueryList[i] + ")" else: featureQuery += featureQueryList[i] + " AND " print featureQuery productQuery = "" if productQueryList: productQuery = "(" for i in range(len(productQueryList)): if i == len(productQueryList) - 1: productQuery += productQueryList[i] + ")" else: productQuery += productQueryList[i] + " AND " return (featureQuery, productQuery, featureQueryList, productQueryList) except Exception, ex: print "Could not separate feature query words. Reason: ", ex return ("", "(" + query + ")", [], querywordlist)
def SearchFiles(command):
    """Search the lucene/index store for `command` and return the ranked files."""
    getVMEnv().attachCurrentThread()
    directory = SimpleFSDirectory(File("lucene/index"))
    searcher = IndexSearcher(directory, True)  # read-only
    ranked = run(searcher, StandardAnalyzer(Version.LUCENE_CURRENT), command)
    searcher.close()
    return ranked
def search(command): STORE_DIR = "index" vm_env = initVM() print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(directory, True) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) result = run(searcher, analyzer, command) searcher.close() return result
def testPhraseQuery(self):
    """Phrase parsing: stop words are dropped, and a one-term phrase
    collapses to a plain TermQuery."""
    old_analyzer = StandardAnalyzer(Version.LUCENE_24)
    parsed = QueryParser(Version.LUCENE_24, "field",
                         old_analyzer).parse('"This is Some Phrase*"')
    self.assertEqual("\"some phrase\"", parsed.toString("field"), "analyzed")

    parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                         self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
def getResultScoreDocs(query):
    """Parse `query` (AND semantics) against the "title" field and return
    the top-50 scoreDocs from the module-level searcher."""
    parser = QueryParser(Version.LUCENE_CURRENT, "title",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    parser.setDefaultOperator(QueryParser.Operator.AND)
    return searcher.search(parser.parse(query), 50).scoreDocs
def commitIndexWriter(self, writer):
    """Merge the transient index behind `writer` into the container's
    main index, closing everything in dependency order."""
    pending = writer.getDirectory()
    writer.close()  # flush the transient index before merging it
    target = self.getDirectory()
    merger = IndexWriter(target, StandardAnalyzer(), False)
    merger.setUseCompoundFile(False)
    merger.addIndexes([pending])
    pending.close()
    merger.close()
    target.close()
def main(cls, argv): print "SimpleAnalyzer" cls.displayTokensWithFullDetails(SimpleAnalyzer(), "The quick brown fox....") print "\n----" print "StandardAnalyzer" cls.displayTokensWithFullDetails( StandardAnalyzer(Version.LUCENE_CURRENT), "I'll e-mail you at [email protected]")
def testWithQueryParser(self):
    """The phrase "fox jumps" matches once with either analyzer.

    Position increments are honored since Lucene 1.9, so the synonym
    analyzer no longer breaks phrase matching.
    """
    phrase = '"fox jumps"'
    q = QueryParser(Version.LUCENE_CURRENT, "content",
                    self.synonymAnalyzer).parse(phrase)
    self.assertEqual(1, self.searcher.search(q, 50).totalHits, "!!!! what?!")

    q = QueryParser(Version.LUCENE_CURRENT, "content",
                    StandardAnalyzer(Version.LUCENE_CURRENT)).parse(phrase)
    self.assertEqual(1, self.searcher.search(q, 50).totalHits, "*whew*")
def createIndex(cls, dataDir, indexDir, useCompound):
    """Index every *.properties file under dataDir into a new index at indexDir."""
    store = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)
    for root, _subdirs, names in os.walk(dataDir):
        for name in names:
            if name.endswith('.properties'):
                cls.indexFile(writer, os.path.join(root, name), dataDir)
    writer.optimize()
    writer.close()
def query(query): lucene.initVM() indexDir = "/Tmp/REMOVEME.index-dir" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_30) searcher = IndexSearcher(dir) query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query) MAX = 1000 hits = searcher.search(query, MAX) print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query) for hit in hits.scoreDocs: print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)] doc = searcher.doc(hit.doc)
def build_perfield_analyzer(index_fields):
    """
    Build a PerFieldAnalyzerWrapper mapping each index field to its
    configured Lucene Analyzer, with StandardAnalyzer as the default.
    Fields marked "standard" simply fall through to the default.
    """
    wrapper = PerFieldAnalyzerWrapper(
        StandardAnalyzer(Version.LUCENE_CURRENT))
    for field_name, analyzer_kind in index_fields.iteritems():
        if analyzer_kind != "standard":
            wrapper.addAnalyzer(field_name, ANALYZER_LOOKUP.get(analyzer_kind))
    return wrapper
def search(cls, indexDir, q): fsDir = SimpleFSDirectory(File(indexDir)) searcher = IndexSearcher(fsDir, True) query = QueryParser(Version.LUCENE_CURRENT, "contents", StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q) start = time() hits = searcher.search(query, 50).scoreDocs duration = timedelta(seconds=time() - start) print "Found %d document(s) (in %s) that matched query '%s':" % ( len(hits), duration, q) for hit in hits: doc = searcher.doc(hit.doc) print 'path:', doc.get("path")
def find(self, query, indir): lucene.initVM() INDEXDIR = indir indir = SimpleFSDirectory(File(INDEXDIR)) lucene_analyzer = StandardAnalyzer(Version.LUCENE_30) lucene_searcher = IndexSearcher(indir) my_query = QueryParser(Version.LUCENE_30,"<default field>",\ lucene_analyzer).parse("text:" + query + " OR title:" + query) MAX = 1000 total_hits = lucene_searcher.search(my_query, MAX) print "\nHits: ", total_hits.totalHits, "\n" for hit in total_hits.scoreDocs: print "Hit Score:", "%.4f" % hit.score, "Department:", lucene_searcher.doc( hit.doc).get("department").encode( "utf-8"), "Title:", lucene_searcher.doc( hit.doc).get("title").encode("utf-8") print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
def index(cls, indexDir, dataDir):
    """Index the tree under dataDir into a new index at indexDir.

    Raises IOError when dataDir is missing or not a directory.
    Returns the number of documents indexed.
    """
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError("%s does not exist or is not a directory" % (dataDir))
    store = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    total = writer.numDocs()
    writer.optimize()
    writer.close()
    store.close()
    return total
def luceneIndexer(docdir, indir): """ IndexDocuments from a directory """ lucene.initVM() DIRTOINDEX = docdir INDEXIDR = indir indexdir = SimpleFSDirectory(File(INDEXIDR)) analyzer = StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')): print "Indexing: ", tfile document = Document() content = open(tfile, 'r').read() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def someMethod(self):
    # NOTE(review): this looks like illustrative/demo code for a very old
    # Lucene API (Field.Text / Field.UnStored and the static
    # QueryParser.parse do not exist in Lucene 3.x) -- confirm it is meant
    # to compile against that legacy version.
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)
    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # same document added again via the two-argument overload -- presumably
    # demonstrating both addDocument signatures; writer is never closed here.
    writer.addDocument(doc, analyzer)
    expression = "some query"
    # two ways to parse the same expression: the (legacy) static helper...
    query = QueryParser.parse(expression, "contents", analyzer)
    # ...and an explicit parser instance; `query` is rebound, so only the
    # second result survives.
    parser = QueryParser("contents", analyzer)
    query = parser.parseQuery(expression)
def index(self, doc, title, department, url):
    """Add one document (title/url/department/text, all stored + analyzed)
    to the index at self.indir.

    The first call after construction passes self.init=True, which wipes
    any existing index; every later call appends.
    """
    writer = IndexWriter(SimpleFSDirectory(File(self.indir)),
                         StandardAnalyzer(Version.LUCENE_30),
                         self.init,
                         IndexWriter.MaxFieldLength(512))
    self.init = False
    # Initialize document and index it
    record = Document()
    for field_name, value in (("title", title),
                              ("url", url),
                              ("department", department),
                              ("text", doc)):
        record.add(Field(field_name, value,
                         Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(record)
    writer.optimize()
    writer.close()