Пример #1
0
    def main(cls, argv):
        """Highlight occurrences of 'ipsum' in cls.text and emit an HTML
        page to stdout, marking hits with <span class="highlight">."""

        # Scorer ranks fragments by how well they match the term query.
        scorer = QueryScorer(TermQuery(Term("f", "ipsum")))
        formatter = SimpleHTMLFormatter('<span class="highlight">', "</span>")
        highlighter = Highlighter(formatter, scorer)
        # Break the text into fragments of at most 50 characters.
        highlighter.setTextFragmenter(SimpleFragmenter(50))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        stream = analyzer.tokenStream("f", StringReader(cls.text))
        # Up to 5 best fragments, joined with "...".
        fragments = highlighter.getBestFragments(stream, cls.text, 5, "...")

        for chunk in ("<html>",
                      "<style>\n",
                      ".highlight {\n",
                      " background: yellow\n",
                      "}\n",
                      "</style>",
                      "<body>",
                      fragments,
                      "</body></html>\n"):
            stdout.write(chunk)
        stdout.flush()
Пример #2
0
    def testHighlighting(self):
        """A single-term query highlights 'fox' with the default <B> tags."""
        source = "The quick brown fox jumps over the lazy dog"

        highlighter = Highlighter(QueryScorer(TermQuery(Term("field", "fox"))))

        stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "field", StringReader(source))

        expected = "The quick brown <B>fox</B> jumps over the lazy dog"
        self.assertEqual(expected,
                         highlighter.getBestFragment(stream, source))
Пример #3
0
    def testHighlighting(self):
        """getBestFragment wraps the matched term in <B>...</B>."""
        content = "The quick brown fox jumps over the lazy dog"

        scorer = QueryScorer(TermQuery(Term("field", "fox")))
        marker = Highlighter(scorer)

        analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
        stream = analyzer.tokenStream("field", StringReader(content))

        self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                         marker.getBestFragment(stream, content))
Пример #4
0
def search_image(command):
    """Search the image index for `command` and return a list of result
    dicts with url/imgurl/urltitle plus a highlighted `contents` snippet.

    A single-space command is treated as empty and yields no results.
    """
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Split the query into "opt:value" pairs; any other token is segmented
    # with jieba and appended to the current option (default: contents).
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for token in command.split(' '):
        if ':' in token:
            opt, value = token.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(token)
            command_dict[opt] = command_dict.get(opt,
                                                 '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            # Site restriction is a leading-wildcard match on the field.
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # BUG FIX: the first argument is the *field name*; the original
        # passed the document text itself (doc.get("contents")) here.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
Пример #5
0
	def get(self, searchterm):
		"""Render search results for `searchterm`: Lucene text hits with
		highlighted fragments, plus word details and related people
		fetched from PostgreSQL."""

		# Lucene search for reference/word.
		query = QueryParser(Version.LUCENE_35, "text",
		                    self.analyzer).parse(searchterm)
		hits = self.searcher.search(query, 100000)

		# BUG FIX: QueryScorer was built twice and the first instance was
		# printed as leftover debug output; build it once and use it for
		# both the highlighter and the fragmenter.
		query_score = QueryScorer(query)
		HighlightFormatter = SimpleHTMLFormatter()
		highlighter = Highlighter(HighlightFormatter, query_score)

		# Break text into fragments of 64 characters.
		fragmenter = SimpleSpanFragmenter(query_score, 64)
		highlighter.setTextFragmenter(fragmenter)

		# Collect highlighted hits.
		wordhits = []
		for hit in hits.scoreDocs:
			doc = self.searcher.doc(hit.doc)
			text = doc.get("text")
			ts = self.analyzer.tokenStream("text", StringReader(text))
			wordhit = {}
			wordhit['reference'] = doc.get('reference')
			wordhit['text'] = highlighter.getBestFragments(ts, text, 3)
			wordhit['score'] = hit.score
			wordhits.append(wordhit)

		# postgresql search for word details
		cursor = yield gen.Task(self.db.execute, "select * from worddetailsbyword(%s);", (searchterm,))
		worddetails = cursor.fetchall()

		# postgresql search for person hits
		cursor = yield gen.Task(self.db.execute, "select * from getrelatedbyname(%s);", (searchterm.capitalize(),))
		p_res = cursor.fetchall()

		# Rows with p[4] == 0 start a new person record; every other row
		# is a relationship belonging to the current person.
		people = []
		if len(p_res) > 0:
			person = {}
			for p in p_res:
				if p[4] == 0:
					# BUG FIX: was `len(person.keys()) is not 0`, which
					# relies on int identity; use truthiness instead.
					if person:
						people.append(person)
					person = { 'name': p[1], 'surname': p[2], 'information': p[3], 'relationships': [] }
				else:
					person['relationships'].append({'name':p[1],'relationshiptype':p[5]})
			people.append(person)
		self.render("search.html",worddetails=worddetails,wordhits=wordhits,people=people)
Пример #6
0
    def testHits(self):

        searcher = self.getSearcher()
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream("title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)
    
            print fragment
Пример #7
0
    def testHits(self):

        searcher = IndexSearcher(self.directory, True)
        query = TermQuery(Term("title", "action"))
        scoreDocs = searcher.search(query, 50).scoreDocs

        scorer = QueryScorer(query)
        highlighter = Highlighter(scorer)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            title = doc["title"]
            stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
                "title", StringReader(title))
            fragment = highlighter.getBestFragment(stream, title)

            print fragment
Пример #8
0
def run(command,pageindex=1,pagesize=15):
    global searcher, analyzer,old_command,old_res_list
    global STORE_DIR,directory,searcher,analyzer
    if command == '':
        return

    print "Searching for:", command  
    
    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k,v in command_dict.iteritems():            
        if(k=='site'):
            t = Term('url','*'+v.strip()+'*')
            query = WildcardQuery(t)

        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    
    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    
    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>")
    queryToHigh = QueryParser(Version.LUCENE_CURRENT,"lrc",analyzer).parse(command_dict['content'])
    hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex-1)*pagesize
    end = start+pagesize
    print start,end
    for scoreDoc in scoreDocs[start:end+10]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('url'))
        res.append(doc.get('music_name'))
        res.append(doc.get('artist'))
        res.append(doc.get('album_name'))
        res.append(doc.get('lrc'))
        output = hlter.getBestFragment(analyzer,"lrc",clear(doc.get('lrc')))
        res.append(output)
        res.append(doc.get('musicID'))
        if(res[5]!=None):
            res_list.append(res) 
        if(len(res_list)==8):
            break
    return res_list,len(scoreDocs)
Пример #9
0
def run(command, pageindex=1, pagesize=15):
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory, searcher, analyzer
    if command == '':
        return

    print "Searching for:", command  #朱莉与茱莉娅

    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if (k == 'site'):
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>",
                                              "</font>")

    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])

    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end + 1]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
Пример #10
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command  #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if (k == 'site'):
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,
                                    analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs

        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>",
                                                  "</font>")

        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
Пример #11
0
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command #朱莉与茱莉娅

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k,v in command_dict.iteritems():            
            if(k=='site'):
                t = Term('url','*'+v.strip()+'*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        
        print "%s total matching documents." % len(scoreDocs)
        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")

        queryToHigh = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(command_dict['contents'])

        hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site')
            print 'title:',doc.get('title'),
            print 'url:',doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer,"contents",ori_text)
            print output
Пример #12
0
    def main(cls, argv):
        """Render cls.text as an HTML page on stdout with every match for
        'ipsum' wrapped in a highlight span."""
        query = TermQuery(Term("f", "ipsum"))
        highlighter = Highlighter(
            SimpleHTMLFormatter('<span class="highlight">', "</span>"),
            QueryScorer(query))
        # At most 50 characters per fragment.
        highlighter.setTextFragmenter(SimpleFragmenter(50))

        stream = StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "f", StringReader(cls.text))
        best = highlighter.getBestFragments(stream, cls.text, 5, "...")

        write = stdout.write
        write("<html>")
        write("<style>\n")
        write(".highlight {\n")
        write(" background: yellow\n")
        write("}\n")
        write("</style>")
        write("<body>")
        write(best)
        write("</body></html>\n")
        stdout.flush()
Пример #13
0
    # NOTE(review): fragment — the enclosing scope starts outside this view;
    # searcher, FIELD_CONTENTS, FIELD_PATH and QUERY_STRING are presumably
    # defined there (verify against the full file).
    # Analyzer used both for parsing the query and re-tokenizing hits below.
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser over the contents field.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query from the externally supplied query string.
    query = queryParser.parse(QUERY_STRING)

    topDocs = searcher.search(query, 50)

    # Get top hits (at most 50, per the search call above).
    scoreDocs = topDocs.scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    # Default SimpleHTMLFormatter marks hits with <B>...</B>.
    HighlightFormatter = SimpleHTMLFormatter()
    query_score = QueryScorer (query)

    highlighter = Highlighter(HighlightFormatter, query_score)

    # Set the fragment size. We break text in to fragment of 64 characters
    fragmenter  = SimpleSpanFragmenter(query_score, 64)
    highlighter.setTextFragmenter(fragmenter)

    # Print each hit's path and up to 3 highlighted fragments joined by "...".
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        text = doc.get(FIELD_CONTENTS)
        ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
        print doc.get(FIELD_PATH)
        print highlighter.getBestFragments(ts, text, 3, "...")
    print ""
Пример #14
0
def search(request):

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    ret = {}
    maxLength = 38

    search_content = request.GET.get('content')
    if len(search_content) > maxLength:
        pass

    query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword",
                        analyzer).parse(search_content)
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    ret['NumOfDocs'] = str(len(scoreDocs)) + "total matching documents."

    print ret['NumOfDocs']

    conn = pymysql.connect(host='localhost',
                           user=user,
                           password=password,
                           db=db_name,
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)

    rst = ''
    ret['search_list'] = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        _id = str(doc.get("id"))
        print _id
        sql = 'select * from webpage where id=%s'

        with conn.cursor() as cursor:
            cursor.execute(sql, (_id))
            rst = cursor.fetchone()

        titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(rst['title']))
        titleFragment = highlighter.getBestFragment(titleStream, rst['title'])
        if titleFragment is None:
            titleFragment = rst['title']

        contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "content", StringReader(rst['content']))
        contentFragment = highlighter.getBestFragments(contentStream,
                                                       rst['content'], 5,
                                                       '...')

        ret['search_list'].append({
            'title': titleFragment,
            'url': rst['url'],
            'content': contentFragment
        })
    #searcher.close()
    conn.close()

    return render(request, 'tjut/result.html', {
        'search_list': ret['search_list'],
        'search_content': search_content
    })