def displayResults(self, query, sort):
    searcher = IndexSearcher(self.directory, True)

    fillFields = False
    computeMaxScore = False
    docsScoredInOrder = False
    computeScores = True

    collector = TopFieldCollector.create(sort, 20,
                                         fillFields,
                                         computeScores,
                                         computeMaxScore,
                                         docsScoredInOrder)

    searcher.search(query, None, collector)
    scoreDocs = collector.topDocs().scoreDocs

    print "\nResults for:", query, "sorted by", sort
    print "Title".rjust(30), "pubmonth".rjust(10), \
          "id".center(4), "score".center(15)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title = doc["title"]
        if len(title) > 30:
            title = title[:30]
        print title.encode('ascii', 'replace').rjust(30), \
              doc["pubmonth"].rjust(10), \
              str(scoreDoc.doc).center(4), \
              ("%06f" % (scoreDoc.score)).rjust(12)
        print "  ", doc["category"]
        # print searcher.explain(query, scoreDoc.doc)

    searcher.close()

def search(r, keyword=""): import logging logger = logging.getLogger("search") bench = Benchmark(logger) from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit import lucene, os os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17" lucene.initVM(lucene.CLASSPATH) directory = FSDirectory.open(File(CONFIG.INDEX_PATH)) ROBOT_INDEX = IndexSearcher(directory, True) ROBOT_ANALYZER = StandardAnalyzer() keyword = keyword or r.GET["keyword"] query = QueryParser("context", ROBOT_ANALYZER) query = query.parse('"%s"' % keyword) bench.start_mark("search") hits = ROBOT_INDEX.search(query) count = len(hits) result = [] i = 0 for hit in hits: i += 1 if i > 100: break doc = Hit.cast_(hit).getDocument() result.append(SearchResult(doc, i, keyword)) ROBOT_INDEX.close() et = bench.stop_mark() return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def testTermRangeQuery(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermRangeQuery("title2", "d", "j", True, True)

    topDocs = searcher.search(query, 100)
    self.assertEqual(3, topDocs.totalHits)

    searcher.close()

def main(cls, argv):
    if len(argv) != 2:
        print "Usage: BerkeleyDbSearcher <index dir>"
        return

    dbHome = argv[1]

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)

    env.open(dbHome, (DB_THREAD | DB_INIT_MPOOL |
                      DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        searcher = IndexSearcher(directory, True)

        topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
        print topDocs.totalHits, "document(s) found"

        searcher.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.abort()

    index.close()
    blocks.close()
    env.close()

def getHitCount(self, fieldName, searchString):
    searcher = IndexSearcher(self.dir, True)

    t = Term(fieldName, searchString)
    query = TermQuery(t)
    hitCount = len(searcher.search(query, 50).scoreDocs)

    searcher.close()
    return hitCount

def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles

def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    result = run(searcher, analyzer, command)
    searcher.close()
    return result

def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    store = run(searcher, analyzer, command, prior)
    searcher.close()

    start = (page - 1) * RPP
    end = start + RPP
    return store[start:end], len(store)

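# A minimal, Lucene-free sketch of the paging arithmetic used in Searchfile
# above: a 1-based page number and a results-per-page count (RPP) become a
# half-open slice [start, end) over the full result list. The helper name
# `paginate` is illustrative only and not part of the original code.
def paginate(store, page, RPP):
    start = (page - 1) * RPP
    end = start + RPP
    return store[start:end], len(store)

# e.g. 25 hits at 10 per page: page 3 holds the last 5 hits
hits = list(range(25))
page_hits, total = paginate(hits, 3, 10)
assert page_hits == list(range(20, 25)) and total == 25
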
def testExclusive(self):
    searcher = IndexSearcher(self.directory, True)
    # pub date of TTC was October 1988
    query = NumericRangeQuery.newIntRange("pubmonth",
                                          198805, 198810,
                                          False, False)

    topDocs = searcher.search(query, 100)
    self.assertEqual(0, topDocs.totalHits)

    searcher.close()

def GET(self):
    form1 = login()
    user_data = web.input()
    vm_env.attachCurrentThread()

    STORE_DIR = "F:\\imgindex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)

    a, b, c, d, e = img_func(user_data.keyword, searcher, analyzer)
    searcher.close()
    return render.img_result(form1, a, b, c, d, e)

class LuceneSearch(object):

    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()

    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            # print 'doc matched = ', dir(doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents': contents, 'doc': doc})
        return matches

    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches)) + " results <br/>"
            for match in matches:
                results += '<a href=' + str(match['contents']['dealUrl']) + '>' + str(match['contents']['merchant']) + '</a><br />'
                results += '<p>' + str(match['contents']['shortAnnouncementTitle']) + ',' + str(match['contents']['redemptionLocation']) + '</p><br/>'
        else:
            results = "0 results <br/>"
        return results

    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', match['contents']['redemptionLocation'], ', ', match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-' * 80

def testExclusive(self):
    searcher = IndexSearcher(self.directory, True)
    # pub date of TTC was October 1988
    query = NumericRangeQuery.newIntRange("pubmonth",
                                          Integer(198805), Integer(198810),
                                          False, False)

    topDocs = searcher.search(query, 100)
    self.assertEqual(0, topDocs.totalHits)

    searcher.close()

def begining(command):
    STORE_DIR = "index"
    global vm_env
    vm_env = initVM()
    vm_env.attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = lucene.WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    a = run(command, searcher, analyzer)
    searcher.close()
    return a

def testTerm(self):
    searcher = IndexSearcher(self.directory, True)

    t = Term("subject", "ant")
    query = TermQuery(t)
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "JDwA")

    t = Term("subject", "junit")
    scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
    self.assertEqual(2, len(scoreDocs))

    searcher.close()

def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something pretty hacky here since we are trying to move from
    # 0-based to 1-based indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a BooleanQuery
    # in parts, as opposed to the one MultiFieldQueryParser used for a basic
    # search
    query = None

    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                       index_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)

def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream(doc.get("contents"), StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)

    searcher.close()
    return Docs

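# Dependency-free sketch of the "field:value" command parsing used in
# search_image above, with the jieba segmentation step left out so the
# example runs on its own. Tokens such as "site:example.com" switch the
# target field; everything else is appended to the default "contents"
# field. It mirrors the original behavior, including switching `opt` even
# when the field is not in allowed_opt. The helper name `parse_command`
# is illustrative, not part of the original code.
def parse_command(command, allowed_opt=('site',)):
    command_dict = {}
    opt = 'contents'
    for token in command.split(' '):
        if ':' in token:
            opt, value = token.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            command_dict[opt] = command_dict.get(opt, '') + ' ' + token
    return command_dict

assert parse_command('fox site:example.com') == {'contents': ' fox',
                                                 'site': ' example.com'}
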
def _create_pairs(self, inpDF, colname, idd):
    """
    function to create cartesian pairs of matched-similar text records
    first calls create index function followed by search index row by row
    in a pandas dataframe
    """
    lookup_dir = {}
    for i, row in inpDF.iterrows():
        if row[colname] not in lookup_dir:
            lookup_dir[row[colname]] = []
        else:
            lookup_dir[row[colname]].append(row[idd])

    pairs = []
    directory = self._createIndex(inpDF, colname)
    searcher = IndexSearcher(directory, True)
    matches = inpDF.apply(
        lambda x: self._searchIndex(searcher, x, colname, idd), axis=1)

    captured_candidates = {}
    for match_pair in matches:
        for matched in match_pair:
            # value_index = inpDF[inpDF[colname] == matched[2]].index.tolist()
            value_index = lookup_dir[matched[2]]
            for cell_index in value_index:
                if matched[0] != cell_index:
                    rstring = "-".join(
                        sorted([str(matched[0]), str(cell_index)]))
                    if rstring not in captured_candidates:
                        captured_candidates[rstring] = 1
                        row = []
                        row.append(matched[0])
                        row.append(matched[1])
                        row.append(cell_index)
                        row.append(matched[2])
                        pairs.append(row)

    searcher.close()
    directory.close()

    header = [idd, colname, idd + "_", colname + "_"]
    pairDF = pd.DataFrame(pairs, columns=header)
    return pairDF

def run(command):
    if command == '':
        return None

    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()

    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)

    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits

def testCollecting(self):
    query = TermQuery(Term("contents", "junit"))
    searcher = IndexSearcher(self.directory, True)
    collector = BookLinkCollector(searcher)
    searcher.search(query, collector)

    links = collector.getLinks()
    self.assertEqual("java development with ant",
                     links["http://www.manning.com/antbook"])

    scoreDocs = searcher.search(query, 10).scoreDocs
    self.dumpHits(searcher, scoreDocs)

    searcher.close()

def doSearch(self, searchString, fieldToSearch, luceneDir):
    searchResult = []
    store = SimpleFSDirectory(File(luceneDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(store)
    query = QueryParser(Version.LUCENE_CURRENT, fieldToSearch,
                        analyzer).parse(searchString)
    hits = searcher.search(query, self.MAX)

    print "Found %d documents that matched the query '%s'" % (hits.totalHits, searchString)

    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        # docdict['score'] = hit.score
        # docdict['docid'] = hit.doc
        # docdict['content'] = doc.get("contents").encode("utf-8")
        searchResult.append([doc.get("title").encode("utf-8"),
                             doc.get("contents").encode("utf-8")])

    searcher.close()
    return searchResult

def delete_old(self, index):
    existing_ids = set([book.id for book in Book.objects.all()])

    reader = IndexReader.open(index.index, False)
    searcher = IndexSearcher(reader)
    try:
        num = searcher.docFreq(Term('is_book', 'true'))
        docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
        for result in docs.scoreDocs:
            stored = searcher.doc(result.doc)
            book_id = int(stored.get('book_id'))
            if not book_id in existing_ids:
                print "book id %d doesn't exist." % book_id
                index.remove_book(book_id)
    finally:
        searcher.close()
        reader.close()

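# The stale-id check in delete_old above boils down to a set difference:
# ids present in the index but absent from the database are removed. Plain
# lists stand in for the Lucene hits and the Book queryset; the helper name
# `stale_ids` is illustrative only.
def stale_ids(indexed_ids, existing_ids):
    return sorted(set(indexed_ids) - set(existing_ids))

assert stale_ids([1, 2, 3, 5], [1, 2, 3]) == [5]
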
def testSimple(self):

    class SimpleSimilarity(PythonSimilarity):

        def lengthNorm(_self, field, numTerms):
            return 1.0

        def queryNorm(_self, sumOfSquaredWeights):
            return 1.0

        def tf(_self, freq):
            return freq

        def sloppyFreq(_self, distance):
            return 2.0

        def idfTerms(_self, terms, searcher):
            return 1.0

        def idf(_self, docFreq, numDocs):
            return 1.0

        def coord(_self, overlap, maxOverlap):
            return 1.0

        def scorePayload(_self, docId, fieldName, start, end, payload,
                         offset, length):
            return 1.0

    self.indexSingleFieldDocs([Field("contents", "x", Field.Store.YES,
                                     Field.Index.ANALYZED)])
    searcher = IndexSearcher(self.directory)
    searcher.setSimilarity(SimpleSimilarity())

    query = TermQuery(Term("contents", "x"))
    explanation = searcher.explain(query, 0)
    print explanation

    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))

    self.assertEqual(scoreDocs[0].score, 1.0)
    searcher.close()

def a():
    import os
    from lucene import CJKAnalyzer, Hit

    dire = os.path.dirname(__file__) + '/index'
    analyzer = CJKAnalyzer()
    searcher = IndexSearcher(dire)
    query = QueryParser('summary', analyzer).parse('java')  # TermQuery(Term("type", "restaurant"))
    sort = Sort(SortField("locatisdon", CategoryComparatorSource('java')))
    hits = searcher.search(query, sort)
    print len(hits)

    i = 0
    for hit in hits:
        i += 1
        if i == 10:
            break
        doc = Hit.cast_(hit).getDocument()
        print 'title:', doc.get("author"), 'name:', doc.get("link")
        print Hit.cast_(hit).getScore()

    searcher.close()

def calculateWeight(self, context):
    # try:
    self.termList = context.termList
    ramreader = IndexReader.open(context.ramIndex, True)
    store = SimpleFSDirectory(File(context.luceneDir))
    storereader = IndexReader.open(store)
    searcher = IndexSearcher(store)
    ramsearcher = IndexSearcher(context.ramIndex)

    # Number of documents in the collection
    N = storereader.numDocs()
    # Number of relevant documents
    R = ramreader.numDocs()

    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    for w in self.termList:
        searchString = "'" + w + "'"
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(searchString)

        # Number of relevant documents having the term
        # r = ramsearcher.docFreq(Term("contents", w))
        hits = ramsearcher.search(query, self.MAX)
        r = hits.totalHits

        # Number of documents having the term
        # n = searcher.docFreq(Term("contents", w))
        query = QueryParser(Version.LUCENE_CURRENT, context.searchField,
                            analyzer).parse(searchString)
        hits = searcher.search(query, self.MAX)
        n = hits.totalHits

        if (R - r) > 0 and (n - r) > 0 and (N - n - R + r) > 0:
            weight = (r / (R - r)) / ((n - r) / (N - n - R + r))
        else:
            weight = 0

        if weight > self.weightThreshold:
            self.gramList.append([w, weight])

    searcher.close()
    ramsearcher.close()
    storereader.close()
    ramreader.close()
    # except Exception, e:
    #     print 'error', e

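# The probabilistic term weight computed in calculateWeight above, written
# as a standalone helper so the formula is easy to check: r relevant docs
# containing the term, R relevant docs in total, n docs containing the
# term, N docs in total. Floats are used here to sidestep Python 2 integer
# division; the helper name `term_weight` is illustrative only.
def term_weight(r, R, n, N):
    if (R - r) > 0 and (n - r) > 0 and (N - n - R + r) > 0:
        return (float(r) / (R - r)) / (float(n - r) / (N - n - R + r))
    return 0.0

# e.g. a term in 5 of 10 relevant docs and 50 of 1000 docs overall
assert round(term_weight(5, 10, 50, 1000), 2) == 21.0
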
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()

    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)

    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)

    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits

class Searcher(object):

    def __init__(self):
        self.searcher = IndexSearcher(STORE_DIR)
        self.analyzer = CJKAnalyzer()
        self.catfilter = CatFilter()

    def __del__(self):
        self.searcher.close()

    def search(self, query, category_id=None):
        SHOULD = BooleanClause.Occur.SHOULD
        # MultiFieldQueryParser.setOperator(QueryParser.DEFAULT_OPERATOR_AND)
        parser1 = QueryParser('summary', self.analyzer)
        parser2 = QueryParser('title', self.analyzer)
        parser1.setDefaultOperator(QueryParser.AND_OPERATOR)
        parser2.setDefaultOperator(QueryParser.AND_OPERATOR)
        q1 = parser1.parse(query)
        q2 = parser2.parse(query)

        boolQuery = BooleanQuery()
        boolQuery.add(q1, SHOULD)
        boolQuery.add(q2, SHOULD)

        # camp = CategoryComparatorSource(query)
        # sortfield = SortField("link", camp)
        # sort = Sort(sortfield)

        if category_id:
            self.catfilter.query = query
            self.catfilter.category_id = category_id
            hits = self.searcher.search(boolQuery, self.catfilter)
        else:
            hits = self.searcher.search(boolQuery)
        return hits

    def search_by_field(self, query, field='summary'):
        parser = QueryParser(field, self.analyzer)
        parser.setDefaultOperator(QueryParser.AND_OPERATOR)
        q = parser.parse(query)
        return self.searcher.search(q)

class tfidf(Command):
    """ """

    def __init__(self):
        """ """
        # Number of documents
        self.N = 0
        # Number of terms
        self.m = 0
        # Individual terms
        self.unigramList = None

    def process(self, context):
        self.unigramList = context.termList
        self.ramreader = IndexReader.open(context.ramIndex, True)
        self.ramsearcher = IndexSearcher(context.ramIndex)
        self.N = self.ramreader.numDocs()
        self.m = len(self.unigramList)

        self.createTermDocumentMatrix()

        self.ramsearcher.close()
        self.ramreader.close()

        context.termdocumentmatrix = self.termdocumentMatrix
        print 'finished creating term document matrix'
        self.context = context

    def createTermDocumentMatrix(self):
        self.termdocumentMatrix = np.zeros((self.m, self.N), dtype=int)
        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)

        for index, word in enumerate(self.unigramList):
            searchString = "'" + word[0] + "'"
            query = QueryParser(Version.LUCENE_CURRENT, "contents",
                                analyzer).parse(searchString)
            hits = self.ramsearcher.search(query, self.N)
            for hit in hits.scoreDocs:
                self.termdocumentMatrix[index, hit.doc] = hits.totalHits

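# Dependency-light sketch of the term-document matrix shape built in
# createTermDocumentMatrix above: m terms by N documents, where a cell
# [i, j] is filled from the hits for term i. Plain Python lists stand in
# for the numpy array and the Lucene searcher; the helper name
# `build_matrix` is illustrative only.
def build_matrix(term_hits, N):
    # term_hits[i] is the list of doc ids matched by term i
    matrix = [[0] * N for _ in term_hits]
    for i, docs in enumerate(term_hits):
        for d in docs:
            matrix[i][d] = len(docs)   # the original stores hits.totalHits per cell
    return matrix

assert build_matrix([[0, 2], [1]], 3) == [[2, 0, 2], [0, 1, 0]]
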
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList

    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        # print 'Inside First Except'
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        return

    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        # e = sys.exc_info()[0]
        # print e
        MAX = 2
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print 'total hits'
        # print hits.totalHits

        if hits.totalHits > 0:
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
            analyzer.close()
            searcher.close()
            return
        else:
            # print '-----------'
            # print tweetID
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1
            analyzer.close()
            searcher.close()
            return

        # for hit in hits.scoreDocs:
        #     print hit.score, hit.doc, hit.toString()
        #     doc = searcher.doc(hit.doc)
        #     print doc.get("text").encode("utf-8")
    except lucene.JavaError:
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        analyzer.close()
        searcher.close()
        return

def do_invite(keywords):
    print "invite started!>>>>>>"
    initVM()
    indexDir = "/tmp/luceneindex/doc"
    version = Version.LUCENE_CURRENT
    idxDir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(version)
    searcher = IndexSearcher(idxDir)

    query = QueryParser(version, "description", analyzer).parse(keywords)
    hits = searcher.search(query, 1000)

    indentCandidates = []
    # print len(hits.scoreDocs)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
        # print doc.get("description")
        intents = doc.get("intent")
        # print doc.get("url")
        if intents == None:
            continue
        intents = eval(intents)
        for intent in intents:
            indentCandidates.append(intent)

    searcher.close()

    inviteEmails = []
    # patterns = ["[^A-Za-z0-9_-]*(?P<buzz>([A-Za-z0-9_-]+(\.\w+)*@(\w+\.)+\w{2,3}))", '''qq[^\d]*(?P<buzz>[1-9][0-9]{4,})''']
    for indentCandidate in indentCandidates:
        # print repr(indentCandidate[0])
        emailCandidate = indentCandidate[0]
        if emailCandidate.find("@") == -1:
            qqMail = emailCandidate + "@qq.com"
            inviteEmails.append(qqMail)
        else:
            inviteEmails.append(emailCandidate)

        # remove useless intent
        # indentCandidate = indentCandidate.strip()
        # probability = bayes.checkneedprobability((indentCandidate).decode("ascii","ignore").encode("utf8"))
        # print probability
        # if (probability > 0.5):
        #     continue
        # emailPattern = patterns[0]
        # qqPattern = patterns[1]
        # qqMatches = re.finditer(qqPattern, indentCandidate, re.IGNORECASE | re.DOTALL)
        # emailMatches = re.finditer(emailPattern, indentCandidate, re.IGNORECASE | re.DOTALL)
        # for qqMatch in qqMatches:
        #     qq = qqMatch.group("buzz").strip()
        #     # print qq
        #     qqMail = qq + "@qq.com"
        #     inviteEmails.append(qqMail)
        # for emailMatch in emailMatches:
        #     email = emailMatch.group("buzz").strip()
        #     # print email
        #     inviteEmails.append(email)

    # add haiming and rex mail
    # remove duplicate emails
    toInviteEmails = set(inviteEmails)
    toInviteEmails.add("*****@*****.**")
    toInviteEmails.add("*****@*****.**")
    toInviteEmails.add("*****@*****.**")

    print "invite total" + str(len(toInviteEmails)) + " emails >>>>>>>>>>>"
    # for mail_address in toInviteEmails:
    #     print mail_address
    return toInviteEmails

print "Searching for:", command query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command) #用analyzer来对查询语句进行词法分析和语言处理。 #QueryParser调用parser进行语法分析,形成查询语法树,放到Query中。 scoreDocs = searcher.search(query, 50).scoreDocs #IndexSearcher调用search对查询语法树Query进行搜索,得到结果 print "%s total matching documents." % len(scoreDocs), '\n' for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print "------------------------" print 'path:', doc.get("path") print 'title:', doc.get("title") print 'url:', doc.get("url") print 'name:', doc.get("name") if __name__ == '__main__': STORE_DIR = "index" initVM() print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) #索引文件存放的位置 searcher = IndexSearcher(directory, True) #索引信息读入到内存,创建IndexSearcher准备进行搜索 analyzer = lucene.WhitespaceAnalyzer( Version.LUCENE_CURRENT ) #analyzer用来对查询语句进行词法分析和语言处理的,和IndexFiles.py中使用同样的analyzer。 run(searcher, analyzer) searcher.close()
def queryenhancement(query, swords, k, fList, path, TotalN):
    if not os.path.isdir(path):
        print("No directory named %s" % os.path.abspath(path))
        return
    if len(fList) == 0:
        return ""

    wordcount = {}
    N = len(fList)
    # def length of docs
    doclen = np.zeros(N)
    # def docs query
    docquery = np.zeros(N)
    # proximity to query word
    proxword = {}
    # tokenize query
    queryword = re.findall('[0-9a-zA-Z]+', query)

    for i in range(N):
        fname = fList[i]
        fp = open(path + "/" + str(fname))
        # tokenize doc
        docword = [s.lower() for s in re.findall("[0-9a-zA-Z]+", fp.read())]
        # length of docs
        doclen[i] = len(docword)  # l(D)
        # store the location of query words
        queryloc = []
        for j in range(len(docword)):
            # occurence of words
            if docword[j] not in wordcount:
                wordcount[docword[j]] = np.zeros(N)
                wordcount[docword[j]][i] = 1
            else:
                wordcount[docword[j]][i] += 1
            # record token location
            if docword[j].lower() in queryword:
                docquery[i] += 1  # m(Q,D)
                queryloc.append(j)

        proxanc = [
            filter(lambda y: y < 6 and y > 0, range(x - 5, x + 6))
            for x in queryloc
        ]
        for ancL in proxanc:
            for anc in ancL:
                if docword[anc] not in proxword:
                    proxword[docword[anc]] = np.zeros(N)
                    proxword[docword[anc]][i] = 1
                else:
                    proxword[docword[anc]][i] += 1
        fp.close()

    # exclude stopwords and query words
    for w in wordcount.keys():
        if w in queryword or w in swords:
            wordcount.pop(w)  # c(W,D)
    for w in proxword.keys():
        if w in queryword or w in swords:
            proxword.pop(w)

    # align proximity measurement and word frequency
    for w in wordcount.keys():
        if w not in proxword:
            proxword[w] = np.zeros(N)  # f(W,Q,D)

    rwordocc = {w: np.sum(wordcount[w] != 0) for w in wordcount}  # z(W,Q)

    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    ireader = IndexSearcher(directory, True)
    docfreq = {
        w: ireader.docFreq(Term("contents", w.lower()))
        for w in wordcount
    }  # g(W)
    ireader.close()

    # label the words
    wordtag = np.array(wordcount.keys())
    wordlabel = {x: word for x, word in enumerate(wordtag)}

    score1 = np.zeros([len(wordcount), N])
    score2 = np.zeros([len(wordcount), N])
    y = np.zeros(len(wordcount))
    for i in range(len(wordcount)):
        w = wordlabel[i]
        score1[i] = proxword[w]
        score2[i] = wordcount[w]
        y[i] = np.max([0, (rwordocc[w] * 1. / N - 2. * docfreq[w] / TotalN)])

    score1 *= 1. / docquery
    score2 *= 1. / doclen
    score1 = np.sqrt(score1)
    score2 = np.sqrt(score2)

    vals = np.sum(score1 + (y * score2.T).T, axis=1)
    idx = np.argsort(vals)[::-1]
    print vals[idx[:100]]
    print docquery

    ret = ""
    count = 0
    for i in range(len(idx)):
        # if wordtag[idx[i]] not in query and wordtag[idx[i]] not in swords:
        ret += " " + wordtag[idx[i]]
        count += 1
        if count < k:
            continue
        else:
            break
    # print ret
    return ret

class SynonymAnalyzerTest(TestCase):

    synonymAnalyzer = SynonymAnalyzer(MockSynonymEngine())

    def setUp(self):
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.synonymAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("content",
                      "The quick brown fox jumps over the lazy dogs",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
        self.searcher = IndexSearcher(self.directory, True)

    def tearDown(self):
        self.searcher.close()

    def testJumps(self):
        stream = self.synonymAnalyzer.tokenStream("contents",
                                                  StringReader("jumps"))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

        i = 0
        expected = ["jumps", "hops", "leaps"]
        while stream.incrementToken():
            self.assertEqual(expected[i], term.term())
            if i == 0:
                expectedPos = 1
            else:
                expectedPos = 0
            self.assertEqual(expectedPos, posIncr.getPositionIncrement())
            i += 1

        self.assertEqual(3, i)

    def testSearchByAPI(self):
        tq = TermQuery(Term("content", "hops"))
        topDocs = self.searcher.search(tq, 50)
        self.assertEqual(1, topDocs.totalHits)

        pq = PhraseQuery()
        pq.add(Term("content", "fox"))
        pq.add(Term("content", "hops"))
        topDocs = self.searcher.search(pq, 50)
        self.assertEquals(1, topDocs.totalHits)

    def testWithQueryParser(self):
        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            self.synonymAnalyzer).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        # in Lucene 1.9, position increments are no longer ignored
        self.assertEqual(1, topDocs.totalHits, "!!!! what?!")

        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            StandardAnalyzer(Version.LUCENE_CURRENT)).parse('"fox jumps"')
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits, "*whew*")

    def main(cls):
        query = QueryParser(Version.LUCENE_CURRENT, "content",
                            cls.synonymAnalyzer).parse('"fox jumps"')
        print "\"fox jumps\" parses to ", query.toString("content")

        print "From AnalyzerUtils.tokensFromAnalysis: "
        AnalyzerUtils.displayTokens(cls.synonymAnalyzer, "\"fox jumps\"")
        print ''

    main = classmethod(main)

class PlushBase:
    """Base class."""

    def __init__(self, store_dir=None, verbose=False):
        self.verbose = verbose
        self.store_path = None
        self.searcher = None
        self.index_reader = None
        self.directory = None
        self.analyzers = {}
        self.initAnalyzers()
        self.default_analyzer_id = 'Simple'
        self.fields = []
        self._connected = False
        if store_dir:
            self.openStore(store_dir)

    def __del__(self):
        self.closeStore()
        self._connected = False

    def initDummyStore(self, directory):
        """Open a dummy ramdirectory for testing."""
        writer = IndexWriter(directory, SimpleAnalyzer(), True)
        doc = Document()
        doc.add(Field("name", 'dummy.txt',
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field("path", '/path/to/dummy.txt',
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field("path", '/path/to/another/dummy.txt',
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        doc.add(Field("contents", "foo dummy bar",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)
        writer.optimize()
        writer.close()

    def openStore(self, store_dir):
        """Open a lucene store."""
        if self._connected:
            self.closeStore()
        if store_dir == 'dummy':
            directory = RAMDirectory()
            self.initDummyStore(directory)
            store_path = store_dir
        else:
            store_path = os.path.abspath(store_dir)
            try:
                directory = SimpleFSDirectory(File(store_path))  # TODO , False)
            except JavaError:
                print "Error: %s Not found." % store_path
                return
        try:
            self.searcher = IndexSearcher(directory)
        except JavaError:
            print "Error: '%s' is not a valid lucene store." % store_path
            return
        print 'Opening store: %s' % store_path
        self.directory = directory
        self.store_path = store_path
        # TODO - TaxonomyReader??
        self.index_reader = IndexReader.open(directory)
        self.fields = self.getFieldNames()
        self.fields.sort()
        self._connected = True

    def closeStore(self):
        """Close a lucene store."""
        if self.searcher is not None:
            if self.verbose:
                print "Close searcher."
            self.searcher.close()
        self.directory = None
        self.searcher = None
        self.index_reader = None
        self.fields = []
        self.store_path = None
        self._connected = False

    def maxDoc(self):
        """Maximum doc number."""
        return self.index_reader.maxDoc()

    def numDocs(self):
        """Number of docs in the store."""
        return self.index_reader.numDocs()

    def getFieldNames(self):
        """Return a unique list of field names that exist in this index."""
        fields = {}
        terms = self.index_reader.terms()
        while terms.next():
            fields[terms.term().field()] = True
        return fields.keys()
        # TODO
        # if VERSION.startswith('1.9'):
        #     return self.index_reader.getFieldNames()
        # return self.index_reader.getFieldNames(IndexReader.FieldOption.ALL)

    def getFields(self, doc_num=None):
        """Return fields of a doc."""
        if doc_num is None:
            doc_num = self.maxDoc() - 1
        doc = self.index_reader.document(doc_num)
        return doc.fields()

    def getDoc(self, doc_num=None):
        """Return a lucene doc."""
        if doc_num is None:
            doc_num = self.maxDoc() - 1
        return self.index_reader.document(doc_num)

    def getFieldInfos(self, doc_num=None):
        """Return fields description.

        [(name, stored, index, token, binary, compressed), ...]"""
        fields = []
        doc = self.getDoc(doc_num)
        for name in self.fields:
            # TODO - this form of getFields() is deprecated
            mfields = doc.getFields(name)
            if not mfields:
                fields.append((name, False, False, False, False, False, 'N/A'))
                continue
            for field in mfields:
                fields.append((field.name(), field.isStored(),
                               field.isIndexed(), field.isTokenized(),
                               field.isBinary(), False,
                               field.stringValue()))
                # TODO
                # field.isCompressed(), field.stringValue()))
        return fields

    def search(self, command, field_id="contents", sort_on=None,
               sort_order=False, analyzer_id=None):
        """Do the lucene search."""
        analyzer = self.getAnalyzer(analyzer_id)
        try:
            if VERSION.startswith('1.9'):
                query = QueryParser.parse(command, field_id, analyzer)
            else:
                query = QueryParser(field_id, analyzer).parse(command)
        except JavaError:
            print "Error: Lucene cannot parse this query."
            return None
        if sort_on:
            return self.searcher.search(query, Sort(sort_on, sort_order))
        return self.searcher.search(query)

    def getTermFreqs(self, field=None, max_term=None, pattern=None):
        """Return a list ([(num occurence, term), ..], number of term)"""
        item = self.index_reader.terms()
        min_freq = 0
        freqs = []
        if max_term:
            limit = max_term
        else:
            limit = 1000
        if pattern is not None:
            try:
                pat = re.compile(pattern)
            except RegexError:
                print "Error: '%s' is an invalid regex" % pattern
                return [], 0
        count = 0
        while (item.next()):
            term = item.term()
            if field and term.field() != field:
                continue
            count += 1
            word = term.text()
            freq = item.docFreq()
            if pattern is not None and not pat.search(word):
                continue
            if len(freqs) >= limit and freq < min_freq:
                continue
            freqs.append((-1 * freq, word))
            freqs.sort()
            if len(freqs) > limit:
                freqs.pop()
            min_freq = freqs[0][0]
        item.close()
        freqs = [(-1 * freq, word) for freq, word in freqs]
        return freqs, count

    def initAnalyzers(self):
        """Init all analyzer."""
        # TODO
        # self.analyzers['French'] = FrenchAnalyzer()
        # self.analyzers['German'] = GermanAnalyzer()
        self.analyzers['Keyword'] = KeywordAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Simple'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Stop'] = StopAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Standard'] = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzers['Whitespace'] = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        nxlucene_home = os.getenv('NXLUCENE_HOME', None)
        if nxlucene_home:
            # point to http://svn.nuxeo.org/pub/NXLucene/trunk/src/nxlucene
            nxlucene_home = os.path.normpath(nxlucene_home)
            sys.path.append(nxlucene_home)
            try:
                from analysis import analyzers_map
            except ImportError:
                print "Error: Invalid NXLUCENE_HOME %s" % nxlucene_home
                return
            for key, value in analyzers_map.items():
                self.analyzers['nx' + key] = value
            print "NXLucene analyzers loaded."

    def getAnalyzer(self, analyzer_id=None):
        """Return an analyzer or default."""
        if analyzer_id is None:
            analyzer_id = self.default_analyzer_id
        return self.analyzers.get(analyzer_id)

    def displayAnalyzedQuery(self, text, field_name, analyzer_id=None):
        """Print analyzed tokens."""
        analyzer = self.getAnalyzer(analyzer_id)
        tokens = [token.termText()
                  for token in analyzer.tokenStream(field_name,
                                                    StringReader(text))]
        print " %s analyzer tokens: %s" % (
            analyzer_id or self.default_analyzer_id, ", ".join(tokens))

def search2():
    results0 = []
    results2 = []
    loc = []
    sr = ''
    tmp = ''
    mark = False
    if request.method == 'POST':
        try:
            initVM()
            directory = SimpleFSDirectory(File('albumIndex'))
            searcher2 = IndexSearcher(directory, True)
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

            if "Search" in request.form.values():
                sr = request.form['text']
            elif "Shuffle" in request.form.values():
                mark = True
                while len(loc) < 20:
                    tmp = random.randint(0, searcher2.maxDoc() - 1)
                    if tmp not in loc:
                        loc += [tmp]

            if mark:
                print 'loc=', loc
                ct = 0
                for i in loc:
                    doc = searcher2.doc(i)
                    songs = doc.get('albumsongs')
                    songs = songs.split('!@#$%')
                    urls = doc.get("albumsongURLs")
                    urls = urls.split('!@#$%')
                    results2 += [{'albumnum': doc.get("albumnum"),
                                  'albumname': doc.get('albumname'),
                                  'albumartist': doc.get('albumartist'),
                                  'albumintro': doc.get("albumintro"),
                                  'albumsongs': songs,
                                  'albumsongURLs': urls,
                                  'albumpicURL': doc.get('albumpicURL'),
                                  'albumartistURL': doc.get('albumartistURL'),
                                  'albumURL': doc.get('albumURL'),
                                  'rank': 100}]
            else:
                print request.form.values()
                print 'sr=', sr
                if sr == '':
                    return results0, results2, ""
                ## for i in sr:
                ##     tmp += i + " "
                ## print tmp
                scoreDocs = run2(searcher2, analyzer, sr, 1)  # search exact album
                if scoreDocs != False:
                    doc = scoreDocs
                    songs = doc.get('albumsongs')
                    songs = songs.split('!@#$%')
                    urls = doc.get("albumsongURLs")
                    urls = urls.split('!@#$%')
                    results2 += [{'albumnum': doc.get("albumnum"),
                                  'albumname': doc.get('albumname'),
                                  'albumartist': doc.get('albumartist'),
                                  'albumintro': doc.get("albumintro"),
                                  'albumsongs': songs,
                                  'albumsongURLs': urls,
                                  'albumpicURL': doc.get('albumpicURL'),
                                  'albumartistURL': doc.get('albumartistURL'),
                                  'albumURL': doc.get('albumURL'),
                                  'rank': 100}]
                    results0 = results2
                else:
                    scoreDocs = run2(searcher2, analyzer, sr, 20)  # search 20 albums
                    rank = 100
                    for scoreDoc in scoreDocs:
                        doc = searcher2.doc(scoreDoc.doc)
                        songs = doc.get('albumsongs')
                        songs = songs.split('!@#$%')
                        urls = doc.get("albumsongURLs")
                        urls = urls.split('!@#$%')
                        results2 += [{'albumnum': doc.get("albumnum"),
                                      'albumname': doc.get('albumname'),
                                      'albumartist': doc.get('albumartist'),
                                      'albumintro': doc.get("albumintro"),
                                      'albumsongs': songs,
                                      'albumsongURLs': urls,
                                      'albumpicURL': doc.get('albumpicURL'),
                                      'albumartistURL': doc.get('albumartistURL'),
                                      'albumURL': doc.get('albumURL'),
                                      'rank': rank}]
                        rank -= 5

            conn = MySQLdb.connect(host='localhost', user='******',
                                   passwd='1234', charset="utf8")
            # conn = MySQLdb.connect(host='localhost', user='******', passwd='ee208', charset="utf8")
            conn.select_db('coversearch')
            cursor = conn.cursor()
            for i in results2:
                try:
                    cursor.execute("select zan from albums where id=" + i['albumnum'])
                    zan = cursor.fetchone()[0]
                    i['zan'] = zan
                    i['rank'] += int(zan)
                except:
                    i['zan'] = 0
            results2.sort(key=lambda x: x['rank'], reverse=True)
            results2 = resortalbum(results2, sr)
            conn.commit()
            cursor.close()
            conn.close()
            searcher2.close()
        except Exception, e:
            print 2, e

def search():
    results = []
    results3 = []
    loc = []
    sr = ''
    tmp = ''
    mark = False
    if request.method == 'POST':
        try:
            initVM()
            directory = SimpleFSDirectory(File("songIndex"))
            searcher = IndexSearcher(directory, True)
            directory = SimpleFSDirectory(File("artistIndex"))
            searcher3 = IndexSearcher(directory, True)
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

            if "Search" in request.form.values():
                sr = request.form['text']
            elif "Shuffle" in request.form.values():
                mark = True
                while len(loc) < 20:
                    tmp = random.randint(0, searcher.maxDoc() - 1)
                    if tmp not in loc:
                        loc += [tmp]

            ## if request.form['action'] == "Search":
            ##     sr = request.form['text']
            ## elif request.form['action'] == "Shuffle":
            ##     sr = '1'

            if mark:
                print 'loc=', loc
                for i in loc:
                    doc = searcher.doc(i)
                    results += [{'songname': doc.get("songname"),
                                 'songurl': doc.get('songurl'),
                                 'albumname': doc.get('songalbum'),
                                 'songartist': doc.get('songartist'),
                                 'albumurl': doc.get("songalbumURL"),
                                 'picPath': doc.get('songpicURL'),
                                 }]
            else:
                print request.form.values()
                print 'sr=', sr
                if sr == '':
                    return results, results3, ""
                for i in sr:
                    tmp += i + " "
                ## print tmp
                ## scoreDocs = run2(searcher2, analyzer, sr)
                ## if len(scoreDocs) != 0:
                ##     doc = searcher2.doc(scoreDocs[0].doc)
                ##     results2 += [{'albumnum:', doc.get("albumnum"),
                ##                   'albumname:', doc.get('albumname'),
                ##                   'albumartist:', doc.get('albumartist'),
                ##                   'albumintro:', doc.get("albumintro"),
                ##                   'albumsongs:', doc.get('albumsongs'),
                ##                   'albumsongURLs:', doc.get("albumsongURLs"),
                ##                   'albumpicURL:', doc.get('albumpicURL')}]
                ## else:
                scoreDocs = run3(searcher3, analyzer, sr)
                if scoreDocs == False:
                    scoreDocs = run(searcher, analyzer, sr)
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        results += [{'songname': doc.get("songname"),
                                     'songurl': doc.get('songurl'),
                                     'albumname': doc.get('songalbum'),
                                     'songartist': doc.get('songartist'),
                                     'albumurl': doc.get("songalbumURL"),
                                     'picPath': doc.get('songpicURL')
                                     }]
                else:
                    doc = scoreDocs
                    singeralbums = doc.get('singeralbums')
                    singeralbums = singeralbums.split('!@#$%')
                    singeralbumURLs = doc.get("singeralbumURLs")
                    singeralbumURLs = singeralbumURLs.split('!@#$%')
                    results3 += [{'singername': doc.get("singername"),
                                  'singerplace': doc.get('singerplace'),
                                  'singerintro': doc.get('singerintro'),
                                  'singeralbums': singeralbums,
                                  'singeralbumURLs': singeralbumURLs,
                                  'singerpicURL': doc.get("singerpicURL")
                                  }]

            searcher.close()
        except Exception, e:
            print 1, e

t = c_buffer(strlen * 6)
bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(command), c_int(strlen),
                                        t, c_int(0), 0)
command = t.value.decode('gbk').encode('utf8')
## list = t.value.split()
## print ' '.join(list)
dll.ICTCLAS_Exit()
command = command.decode('utf8')

if command == '':
    return
print
print "Searching for:", command
query = QueryParser(Version.LUCENE_CURRENT, "contents",
                    analyzer).parse(command)
scoreDocs = searcher.search(query, 50).scoreDocs
print "%s total matching documents." % len(scoreDocs)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print 'path:', doc.get("path"), 'title:', doc.get("title"), \
          'url:', doc.get("url"), 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "F:\\index"
    initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()
