def run(searcher, analyzer, command, urlclick): if command == '': return [] res = firstsearch(searcher, analyzer, command) command = ''.join(my_jieba.cut(command)) command = " ".join(jieba.cut(command, cut_all=True)) if len(res) > 0: scoreDocs = res else: querys = BooleanQuery() for k in tag: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(command) if k == 'taste' or k == 'tech': query.setBoost(0.5) querys.add(query, BooleanClause.Occur.SHOULD) scoreDocs = searcher.search(querys, 10000).scoreDocs swxc_res = findres(command, scoreDocs, searcher) formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>") formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>") scorer = QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command)) highlighter1 = Highlighter(formatter_name, scorer) highlighter2 = Highlighter( formatter_name, QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'content', analyzer).parse(command))) highlighter3 = Highlighter( formatter, QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'ingredient', analyzer).parse(command))) highlighter4 = Highlighter( formatter, QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'taste', analyzer).parse(command))) highlighter5 = Highlighter( formatter, QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'tech', analyzer).parse(command))) highlighter6 = Highlighter( formatter, QueryScorer( QueryParser(Version.LUCENE_CURRENT, 'others', analyzer).parse(command))) fragmenter = SimpleFragmenter(1000) highlighter1.setTextFragmenter(fragmenter) highlighter2.setTextFragmenter(fragmenter) highlighter3.setTextFragmenter(fragmenter) highlighter4.setTextFragmenter(fragmenter) highlighter5.setTextFragmenter(fragmenter) highlighter6.setTextFragmenter(fragmenter) results = [] for scoreDoc in scoreDocs: if (len(scoreDocs) > 200 and len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002: continue doc = searcher.doc(scoreDoc.doc) highlighterContent = 
highlighter1.getBestFragment( analyzer, 'name', doc.get('name')) highlighterContent2 = highlighter2.getBestFragment( analyzer, 'content', doc.get('content')) highlighterContent3 = highlighter3.getBestFragment( analyzer, 'ingredient', doc.get('ingredient')) highlighterContent4 = highlighter4.getBestFragment( analyzer, 'taste', doc.get('taste')) highlighterContent5 = highlighter5.getBestFragment( analyzer, 'tech', doc.get('tech')) highlighterContent6 = highlighter6.getBestFragment( analyzer, 'others', doc.get('others')) if highlighterContent: highlighterContent = highlighterContent.replace(' ', '') highlighterContent = highlighterContent.replace(',', ' ') else: highlighterContent = doc.get('name').replace(' ', '') if highlighterContent2: highlighterContent2 = highlighterContent2.replace(' ', '') highlighterContent2 = highlighterContent2.replace(',', ' ') else: highlighterContent2 = doc.get('content').replace(' ', '') if highlighterContent3: highlighterContent3 = highlighterContent3.replace(',', '') else: highlighterContent3 = (doc.get('ingredient')).replace(',', '') if highlighterContent4: pass else: highlighterContent4 = doc.get('taste') if highlighterContent5: pass else: highlighterContent5 = doc.get('tech') if highlighterContent6: highlighterContent6 = highlighterContent6.replace(',', '') else: highlighterContent6 = (doc.get('others')).replace(',', '') results.append( (highlighterContent, doc.get('img'), highlighterContent2, highlighterContent3, highlighterContent4, highlighterContent5, highlighterContent6, doc.get('url'), scoreDoc.score)) for i in range(0, min(20, len(results)) - 1): flag = True for j in range(0, min(20, len(results)) - i - 1): if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[ results[j][7]] < urlclick[results[j + 1][7]]: flag = False results[j], results[j + 1] = results[j + 1], results[j] if flag: break return results, swxc_res
def run(searcher, analyzer, command, judge):
    """Search the index for *command* and collect the stored fields of hits.

    The final query combines, as optional clauses, matches on ``comment``
    (weight 0.5), the ``name``/``not_seg`` query and the ``type`` query
    (weight 2); at most 20 hits are loaded.

    Args:
        searcher: Lucene ``IndexSearcher``.
        analyzer: Lucene analyzer used for query parsing.
        command: The user's query string.
        judge: When equal to ``True``, each hit is extended with three
            entries taken from ``main(hit[0], False)`` and the hits are
            returned.

    Returns:
        ``None`` when *command* is the empty string.  Otherwise a list of
        rows ``[org, type, price, imgsrc, comment_notseg, ISBN, ID,
        *SearchFiles.main(org), *similar]``; the list is empty unless
        ``judge == True``.
    """
    print("\nSearching for: " + command)
    if command == '':
        return None

    commands = " ".join(jieba.cut(command)).split()
    commands_notseg = command.split()

    querys = BooleanQuery()
    querys1 = BooleanQuery()
    querys2 = BooleanQuery()
    for token in commands:
        query = QueryParser(Version.LUCENE_CURRENT, "name",
                            analyzer).parse(token)
        # NOTE(review): this boosts the enclosing BooleanQuery, so only the
        # last token's value survives; every other boost in this function is
        # set on the per-term `query` -- confirm which was intended.
        querys.setBoost(math.sqrt(len(token)))
        querys.add(query, BooleanClause.Occur.MUST)  # segmented-word match
        querys1.add(query, BooleanClause.Occur.SHOULD)

    # This probe only decides whether to fall back to per-character
    # matching; its hits are not used for the final result.
    if len(searcher.search(querys, 50).scoreDocs) == 0:
        querys = BooleanQuery()
        for token in commands:
            for char in token:
                query = QueryParser(Version.LUCENE_CURRENT, "not_seg",
                                    analyzer).parse(char)
                # character-by-character match
                querys.add(query, BooleanClause.Occur.MUST)
                querys1.add(query, BooleanClause.Occur.SHOULD)
        # The original re-ran the search here, but that result was always
        # overwritten before being read, so the redundant query is dropped.

    for token in commands:
        query = QueryParser(Version.LUCENE_CURRENT, "comment",
                            analyzer).parse(token)
        query.setBoost(0.5)
        querys2.add(query, BooleanClause.Occur.SHOULD)  # comment match
    if len(commands) > 1:
        querys2.add(querys1, BooleanClause.Occur.MUST)
    querys2.add(querys, BooleanClause.Occur.SHOULD)

    querys = BooleanQuery()
    for word in commands_notseg:
        query = QueryParser(Version.LUCENE_CURRENT, "type",
                            analyzer).parse(word)
        query.setBoost(2)
        querys.add(query, BooleanClause.Occur.SHOULD)  # tag match
    if len(commands_notseg) > 1:
        querys.add(querys1, BooleanClause.Occur.MUST)
    querys2.add(querys, BooleanClause.Occur.SHOULD)

    scoreDocs = searcher.search(querys2, 20).scoreDocs
    print("%s total matching documents." % len(scoreDocs))

    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        temp = [
            doc.get("org"),
            doc.get("type"),
            doc.get("price"),
            doc.get("imgsrc"),
            doc.get('comment_notseg'),
            doc.get('ISBN'),
            doc.get('ID')
        ]
        print(temp)
        temp.extend(SearchFiles.main(doc.get("org")))
        res.append(temp)

    # NOTE(review): when judge is not True the rows collected above are
    # discarded and an empty list is returned, as in the original -- confirm
    # this is intended (callers of main(..., False) index into the result).
    res1 = []
    # Equality test kept so non-bool flag values behave exactly as before.
    if judge == True:
        for row in res:
            # Look up similar books by the hit's "org" value.
            similar = main(row[0], False)
            if similar:
                # Entries 1..3 are used (entry 0 is skipped unless it is the
                # only one); short lists are padded with their last entry so
                # exactly three values are appended.
                picks = list(similar[1:4])
                picks.extend([similar[-1]] * (3 - len(picks)))
                row.extend(picks)
            # An empty or None lookup used to raise; the row is now kept
            # without similar entries.
            res1.append(row)
    return res1
def superSearch(command, command_dict, urlclick):
    """Run an advanced search against the ``index2.3`` index.

    Args:
        command: Free-text query; may be empty.
        command_dict: Two-item sequence of ``{field: text}`` dicts.  Item 0
            holds clauses every hit must match, item 1 clauses no hit may
            match.
        urlclick: Mapping from result URL to a comparable value (presumably
            a click count); URLs without an entry count as 0.

    Returns:
        A tuple ``(results, swxc_res)`` where ``swxc_res`` is whatever
        ``findres`` returns and ``results`` is a list of tuples
        ``(name, img, content, ingredient, taste, tech, others, url, score)``
        with query matches wrapped in a red ``<span>``.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print("run super search...")
    # The reader and directory were previously opened on every call and
    # never closed; release them once all data has been copied out.
    # NOTE(review): assumes findres() returns data that does not need the
    # searcher after it returns -- confirm.
    try:
        reader = DirectoryReader.open(directory)
        try:
            return _super_search(IndexSearcher(reader), command, command_dict,
                                 urlclick)
        finally:
            reader.close()
    finally:
        directory.close()


def _super_search(searcher, command, command_dict, urlclick):
    """Build the query, collect highlighted rows and re-rank them."""
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    required, excluded = command_dict[0], command_dict[1]

    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    for k, v in required.items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in excluded.items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + required.get("ingredient", ''),
                       scoreDocs, searcher)

    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # The ',' stands in for the space inside the tag so the tag survives the
    # blanket space removal applied to the name below.
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>",
                                         "</span>")
    fragmenter = SimpleFragmenter(1000)

    def make_highlighter(field_formatter, field, text):
        # No highlighter for a field the user did not query.
        if not text:
            return None
        field_query = QueryParser(Version.LUCENE_CURRENT, field,
                                  analyzer).parse(text)
        highlighter = Highlighter(field_formatter, QueryScorer(field_query))
        highlighter.setTextFragmenter(fragmenter)
        return highlighter

    highlighters = {
        'name': make_highlighter(formatter_name, 'name', command),
        'ingredient': make_highlighter(formatter, 'ingredient',
                                       required.get('ingredient')),
        'taste': make_highlighter(formatter, 'taste', required.get('taste')),
        'tech': make_highlighter(formatter, 'tech', required.get('tech')),
    }

    def highlight(field, text):
        # A missing stored field (None) is never handed to the highlighter.
        highlighter = highlighters[field]
        if highlighter is None or text is None:
            return None
        return highlighter.getBestFragment(analyzer, field, text)

    def clicks(url):
        try:
            return urlclick[url]
        except KeyError:
            return 0

    total = len(scoreDocs)
    results = []
    for scoreDoc in scoreDocs:
        score = scoreDoc.score
        # Drop weak hits: relative cut-off for big result sets, absolute
        # cut-off always.
        if (total > 200 and score * total < 200) or score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        name = doc.get('name')
        ingredient = doc.get('ingredient')
        taste = doc.get('taste')
        tech = doc.get('tech')

        name_hit = highlight('name', name)
        if name_hit:
            name_html = name_hit.replace(' ', '').replace(',', ' ')
        else:
            name_html = (name or '').replace(' ', '')
        ingredient_html = (highlight('ingredient', ingredient) or
                           ingredient or '').replace(',', '')

        results.append((
            name_html,
            doc.get('img'),
            (doc.get('content') or '').replace(' ', ''),
            ingredient_html,
            highlight('taste', taste) or taste,
            highlight('tech', tech) or tech,
            (doc.get('others') or '').replace(',', ''),
            doc.get('url'),
            score,
        ))

    # Bubble passes over at most the first 20 rows: neighbours whose scores
    # differ by less than 0.1 are swapped when the later one has the larger
    # urlclick value.  The comparison is not transitive, so this is kept as
    # explicit passes instead of a sort key.
    window = min(20, len(results))
    for done in range(window - 1):
        swapped = False
        for j in range(window - done - 1):
            upper, lower = results[j], results[j + 1]
            if (abs(upper[8] - lower[8]) < 0.1 and
                    clicks(upper[7]) < clicks(lower[7])):
                results[j], results[j + 1] = lower, upper
                swapped = True
        if not swapped:
            break
    return results, swxc_res