def run(searcher, analyzer, command, urlclick):
    """Search the index for ``command`` and return highlighted result rows.

    The raw command is first passed to ``firstsearch``.  The command is then
    re-segmented with ``my_jieba`` and ``jieba``; if ``firstsearch`` produced
    no hits, the segmented command is parsed against every field in ``tag``
    and the per-field queries are combined as SHOULD clauses (the 'taste'
    and 'tech' queries get a boost of 0.5).

    Args:
        searcher: object providing ``search(query, n)`` and ``doc(doc_id)``.
        analyzer: analyzer handed to ``QueryParser`` and to the highlighters.
        command: the query string.
        urlclick: mapping indexed by result URL; its values are compared
            with ``<`` when near-tied results are reordered.
            NOTE(review): presumably click counts, and presumably it has an
            entry for every result URL -- confirm against the caller.

    Returns:
        ``[]`` when ``command`` is the empty string.  Otherwise a tuple
        ``(results, swxc_res)``: ``results`` is a list of 9-tuples
        ``(name, img, content, ingredient, taste, tech, others, url, score)``
        and ``swxc_res`` is whatever ``findres`` returned.
    """
    if command == '':
        return []

    # firstsearch works on the raw command; segmentation happens afterwards.
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))

    if len(res) > 0:
        hits = res
    else:
        combined = BooleanQuery()
        for field in tag:
            field_query = QueryParser(Version.LUCENE_CURRENT, field,
                                      analyzer).parse(command)
            if field in ('taste', 'tech'):
                field_query.setBoost(0.5)
            combined.add(field_query, BooleanClause.Occur.SHOULD)
        hits = searcher.search(combined, 10000).scoreDocs

    swxc_res = findres(command, hits, searcher)

    plain_fmt = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # Name/content fragments later have their spaces removed and their commas
    # turned into spaces, which turns this tag back into "<span style=...>".
    comma_fmt = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")

    # (field, formatter) pairs, in the order the fields appear in a result row.
    field_formatters = (
        ('name', comma_fmt),
        ('content', comma_fmt),
        ('ingredient', plain_fmt),
        ('taste', plain_fmt),
        ('tech', plain_fmt),
        ('others', plain_fmt),
    )
    highlighters = []
    for field, fmt in field_formatters:
        parsed = QueryParser(Version.LUCENE_CURRENT, field,
                             analyzer).parse(command)
        highlighters.append((field, Highlighter(fmt, QueryScorer(parsed))))

    fragmenter = SimpleFragmenter(1000)
    for _, highlighter in highlighters:
        highlighter.setTextFragmenter(fragmenter)

    results = []
    total = len(hits)
    for hit in hits:
        # Drop weak hits: a relative cut-off for large result sets plus an
        # absolute floor on the score.
        weak_for_large_set = total > 200 and total * hit.score < 2
        if weak_for_large_set or hit.score < 0.002:
            continue
        doc = searcher.doc(hit.doc)

        # Compute every fragment before any fallback handling.
        fragments = [
            highlighter.getBestFragment(analyzer, field, doc.get(field))
            for field, highlighter in highlighters
        ]
        name, content, ingredient, taste, tech, others = fragments

        if name:
            name = name.replace(' ', '').replace(',', ' ')
        else:
            name = doc.get('name').replace(' ', '')

        if content:
            content = content.replace(' ', '').replace(',', ' ')
        else:
            content = doc.get('content').replace(' ', '')

        # Commas are stripped from both the fragment and the stored fallback.
        ingredient = (ingredient or doc.get('ingredient')).replace(',', '')
        taste = taste or doc.get('taste')
        tech = tech or doc.get('tech')
        others = (others or doc.get('others')).replace(',', '')

        results.append((name, doc.get('img'), content, ingredient, taste,
                        tech, others, doc.get('url'), hit.score))

        # Bubble pass over the first (at most) 20 rows, run after every
        # append: rows whose scores differ by less than 0.1 are ordered by
        # descending urlclick value.
        window = min(20, len(results))
        for done in range(0, window - 1):
            swapped = False
            for pos in range(0, window - done - 1):
                upper = results[pos]
                lower = results[pos + 1]
                if (abs(upper[8] - lower[8]) < 0.1
                        and urlclick[upper[7]] < urlclick[lower[7]]):
                    swapped = True
                    results[pos], results[pos + 1] = lower, upper
            if not swapped:
                break

    return results, swxc_res
示例#2
0
def run(searcher, analyzer, command, judge):
    """Search books matching ``command`` and optionally attach similar books.

    The command is segmented with ``jieba``; the segmented words are matched
    against the "name" field (falling back to per-character matching on
    "not_seg" when the word match finds nothing) and the "comment" field,
    and the whitespace-separated raw words are matched against "type".
    The top 20 hits of the combined query are turned into rows.

    Args:
        searcher: object providing ``search(query, n)`` and ``doc(doc_id)``.
        analyzer: analyzer handed to every ``QueryParser``.
        command: the query string.
        judge: when equal to ``True``, each row is extended with up to three
            "similar" entries obtained from ``main(row[0], False)``.

    Returns:
        ``None`` when ``command`` is the empty string.  Otherwise a list of
        rows ``[org, type, price, imgsrc, comment_notseg, ISBN, ID, ...]``
        followed by whatever ``SearchFiles.main(org)`` yields and the
        similar entries.  Note that rows are only collected into the
        returned list when ``judge == True``; otherwise the list is empty.
    """
    print("\nSearching for: " + command)
    if command == '':
        return
    commands = " ".join(jieba.cut(command)).split()
    commands_notseg = command.split()
    querys = BooleanQuery()
    querys1 = BooleanQuery()
    querys2 = BooleanQuery()
    for i in commands:
        query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(i)
        # NOTE(review): this boosts the whole BooleanQuery (overwritten on
        # every iteration), not the per-word ``query``; the other loops call
        # ``query.setBoost``.  Possibly a typo -- left unchanged because
        # fixing it alters ranking.
        querys.setBoost(math.sqrt(len(i)))
        querys.add(query, BooleanClause.Occur.MUST)  # segmented-word match
        querys1.add(query, BooleanClause.Occur.SHOULD)
    # This first search only decides whether the per-character fallback is
    # needed; the hits themselves are replaced by the final search below.
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        querys = BooleanQuery()
        for i in commands:
            for j in i:
                query = QueryParser(Version.LUCENE_CURRENT, "not_seg",
                                    analyzer).parse(j)
                # character-by-character match
                querys.add(query, BooleanClause.Occur.MUST)
                querys1.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 50).scoreDocs
    for i in commands:
        query = QueryParser(Version.LUCENE_CURRENT, "comment",
                            analyzer).parse(i)
        query.setBoost(0.5)
        querys2.add(query, BooleanClause.Occur.SHOULD)  # comment match
    if len(commands) > 1:
        querys2.add(querys1, BooleanClause.Occur.MUST)
    querys2.add(querys, BooleanClause.Occur.SHOULD)
    querys = BooleanQuery()
    for i in commands_notseg:
        query = QueryParser(Version.LUCENE_CURRENT, "type", analyzer).parse(i)
        query.setBoost(2)
        querys.add(query, BooleanClause.Occur.SHOULD)  # tag ("type") match

    if len(commands_notseg) > 1:
        querys.add(querys1, BooleanClause.Occur.MUST)
    querys2.add(querys, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys2, 20).scoreDocs
    print("%s total matching documents." % len(scoreDocs))

    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        temp = [
            doc.get("org"),
            doc.get("type"),
            doc.get("price"),
            doc.get("imgsrc"),
            doc.get('comment_notseg'),
            doc.get('ISBN'),
            doc.get('ID'),
        ]
        print(temp)
        temp.extend(SearchFiles.main(doc.get("org")))
        res.append(temp)

    res1 = []
    # Kept as an equality test so that only True (or 1) enables this branch.
    if judge == True:  # noqa: E712
        for temp in res:
            # Look up similar books for this row.  ``main`` is defined
            # elsewhere; an empty or None result now simply adds nothing
            # instead of raising on ``tempres[-1]`` / ``len(None)``.
            tempres = main(temp[0], False) or []
            # Take entries 1..3; with a single entry use that entry itself.
            picks = list(tempres[1:4]) or list(tempres[:1])
            if picks:
                # Pad to three entries by repeating the last one.
                picks.extend([tempres[-1]] * (3 - len(picks)))
            temp.extend(picks)
            res1.append(temp)
    return res1
def _run_super_search(searcher, command, command_dict, urlclick):
    """Build the advanced query, run it on ``searcher`` and format the hits."""
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))

    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    required = command_dict[0]
    for k, v in required.items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs

    swxc_res = findres(command + ' ' + required.get("ingredient", ''),
                       scoreDocs, searcher)

    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # Name fragments later have their spaces removed and their commas turned
    # into spaces, which turns this tag back into "<span style=...>".
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")

    # A highlighter exists only for the parts of the query that were given;
    # None marks "nothing to highlight for this field".
    name_highlighter = None
    if command:
        name_highlighter = Highlighter(
            formatter_name,
            QueryScorer(
                QueryParser(Version.LUCENE_CURRENT, 'name',
                            analyzer).parse(command)))
    field_highlighters = []
    for field in ('ingredient', 'taste', 'tech'):
        wanted = required.get(field)
        highlighter = None
        if wanted:
            highlighter = Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, field,
                                analyzer).parse(wanted)))
        field_highlighters.append((field, highlighter))

    fragmenter = SimpleFragmenter(1000)
    if name_highlighter is not None:
        name_highlighter.setTextFragmenter(fragmenter)
    for _, highlighter in field_highlighters:
        if highlighter is not None:
            highlighter.setTextFragmenter(fragmenter)

    results = []
    total = len(scoreDocs)
    for scoreDoc in scoreDocs:
        # Drop weak hits: a relative cut-off for large result sets plus an
        # absolute floor on the score.
        if (total > 200 and scoreDoc.score * total < 200) \
                or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)

        name = None
        if name_highlighter is not None:
            name = name_highlighter.getBestFragment(analyzer, 'name',
                                                    doc.get('name'))
        fragments = []
        for field, highlighter in field_highlighters:
            if highlighter is None:
                fragments.append(None)
            else:
                fragments.append(
                    highlighter.getBestFragment(analyzer, field,
                                                doc.get(field)))
        ingredient, taste, tech = fragments

        # Fall back to the stored field value when nothing was highlighted.
        if name:
            name = name.replace(' ', '').replace(',', ' ')
        else:
            name = doc.get('name').replace(' ', '')
        if ingredient:
            ingredient = ingredient.replace(',', '')
        else:
            ingredient = doc.get('ingredient').replace(',', '')
        taste = taste or doc.get('taste')
        tech = tech or doc.get('tech')

        results.append(
            (name, doc.get('img'), doc.get('content').replace(' ', ''),
             ingredient, taste, tech, doc.get('others').replace(',', ''),
             doc.get('url'), scoreDoc.score))

        # Bubble pass over the first (at most) 20 rows: rows whose scores
        # differ by less than 0.1 are ordered by descending urlclick value.
        # NOTE(review): this runs after every append, as in the original;
        # hoisting it out of the loop could change the final order, so it is
        # left in place.  It also assumes urlclick has an entry for every
        # result URL -- confirm against the caller.
        window = min(20, len(results))
        for i in range(0, window - 1):
            flag = True
            for j in range(0, window - i - 1):
                if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                        results[j][7]] < urlclick[results[j + 1][7]]:
                    flag = False
                    results[j], results[j + 1] = results[j + 1], results[j]
            if flag:
                break

    return results, swxc_res


def superSearch(command, command_dict, urlclick):
    """Run an advanced search against the 'index2.3' index.

    Args:
        command: free-text part of the query (may be empty); it is segmented
            with ``jieba.cut_for_search`` and matched against
            'nameforsearch' as an optional clause.
        command_dict: sequence of two dicts mapping field name to query
            text.  Entries of ``command_dict[0]`` become MUST clauses
            (boost 0.1); entries of ``command_dict[1]`` become MUST_NOT
            clauses.
        urlclick: mapping indexed by result URL; its values are compared
            with ``<`` when near-tied results are reordered.

    Returns:
        A tuple ``(results, swxc_res)``: ``results`` is a list of 9-tuples
        ``(name, img, content, ingredient, taste, tech, others, url, score)``
        and ``swxc_res`` is whatever ``findres`` returned.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    try:
        print("run super search...")
        reader = DirectoryReader.open(directory)
        try:
            # The reader and directory used to be leaked on every call; they
            # are now released once the results have been materialised.
            # NOTE(review): assumes the value returned by findres does not
            # need the searcher after this function returns -- confirm.
            return _run_super_search(IndexSearcher(reader), command,
                                     command_dict, urlclick)
        finally:
            reader.close()
    finally:
        directory.close()