def perform_search(self, searchterm, results_per_page, page):
        # Disabled earlier variant, kept for reference: use a single QueryParser
        # over "content" when the search term contains an explicit "field:" prefix.
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
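A quick standalone sketch of the pagination arithmetic above (numbers made up): the searcher fetches results_per_page * (page + 1) documents and the loop skips the first results_per_page * page of them.

# Illustration only: with results_per_page = 10 and page = 2, Lucene is asked
# for 30 documents and the loop skips the first 20, keeping documents 21-30.
results_per_page = 10
page = 2
fetch_count = results_per_page + (results_per_page * page)  # 30
offset = results_per_page * page                            # 20
assert fetch_count - offset == results_per_page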
Example #2
 def findPropertyURIs(self, propertyType, max):
     uris = list() # list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if max != None:
             freq = max.intValue()
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
         print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return uris
Example #3
 def findDirectTypes(self, instanceUri, max):
     dTypes = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
         query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = 0
         if max != None:
             freq = max
         else:
             freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     logging.debug("there are " + str(len(dTypes)) + " unique direct types")
     return dTypes
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir
    
    # Create a searcher based on the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    
    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    # Create a QueryParser with "keywords" as the default field
    # Variable holding the search constraints
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    # Join the given parameters into a single query string
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    # Return a JArray with the query results
    return searcher.search(query, 200).scoreDocs
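A possible invocation of buscar, assuming the PyLucene imports used above are in scope; the index path and search terms are made-up examples:

# Hypothetical usage; the JVM must be started once per process.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
score_docs = buscar('indice/', ['educacao', 'saude'])
print len(score_docs)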
Example #5
    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                # print '4. Right after the regex handling : ', query
                like_query = queryparser.parse(query)
                # print '5. Right after the Lucene parser : ', like_query

                hits = self.searcher.search(like_query, result_num).scoreDocs
                # filterScoreDosArray = hits.topDocs().scoreDocs;

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                    result.append(doc.get("answer_id"))

            except Exception as e:
                print "AnswerSearcher: Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result
Example #6
File: retriever.py Project: kevkid/YIF
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
Example #7
    def more_like_this2(self, item_doc, result_num):
        similar_questions = []
        if not item_doc:
            item_doc = ResultItem(None, 1.0, "No Title", 0)
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)
        queryparser = QueryParser(Version.LUCENE_CURRENT, "term",
                                  self.analyzer)

        if query:
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query, result_num).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    similar_questions.append(doc.get("question_id"))

            except Exception as e:
                print "Question Searcher: Error: %s" % e
                # write_search_log("Question Searcher: Error: %s" % e + "\n")
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        # self.searchermgr.release(self.searcher)
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return similar_questions

    # def release(self, searcher):
Example #8
    def Qsearch(self,query):
        words = seg.segment(query.strip())
        #words = self.segmentor.segment(query.strip())
        #print ' '.join(words)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
        parser.setPhraseSlop(0)
        # "\"" + ' '.join(words) + "\"~0" means the words must be contiguous
        query = parser.parse("\"" + ' '.join(words) + "\"~0")
        topDocs = self.searcher.search(query, 50)
        #print "%s total matching documents." % topDocs.totalHits
        #return topDocs.totalHits

        for hit in topDocs.scoreDocs:
            #print"Hit Score: ",hit.score, "Hit Doc:",hit.doc, "HitString:",hit.toString()
            doc= self.searcher.doc(hit.doc)
            #print doc.get("name").encode("utf-8")
        #print "----------------------------------------"
        t = Term('contents',' '.join(words))
        #termDocs = ireader.termDocs(t)
        #for tt in termDocs:
        #       print ireader.document(termDocs.docs).getFieldable('name'), termDocs.freq()
        #print self.reader.totalTermFreq(t)
        return self.reader.totalTermFreq(t)
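A minimal sketch of the phrase-slop idea used in Qsearch, assuming an analyzer is already constructed (names here are illustrative):

# Slop 0 requires the quoted terms to appear adjacently and in order;
# a larger slop lets the terms sit up to that many positions apart.
parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
parser.setPhraseSlop(0)
strict_query = parser.parse('"big data"~0')  # "big" immediately followed by "data"
loose_query = parser.parse('"big data"~3')   # terms within 3 positions of each other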
	def search(self, index_dir):
		# Get handle to index directory
		directory = SimpleFSDirectory(File(index_dir))

		# Creates a searcher searching the provided index.
		ireader  = DirectoryReader.open(directory)

		# Implements search over a single IndexReader.
		# Use a single instance and use it across queries
		# to improve performance.
		searcher = IndexSearcher(ireader)

		# Get the analyzer
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

		# Constructs a query parser. We specify what field to search into.
		queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

		# Create the query
		query = queryParser.parse(self.query)

		# Run the query and get top 50 results
		topDocs = searcher.search(query, self.retrieve_count)

		# Get top hits
		scoreDocs = topDocs.scoreDocs

		doc_ids = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_ids.append(doc.get(FIELD_PATH))
		return [int(item) for item in doc_ids]
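A sketch of the reuse advice in the comments above: open the reader and searcher once, then run many queries against the same IndexSearcher (paths and field names are illustrative):

directory = SimpleFSDirectory(File("/tmp/index"))
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
for q in ["first query", "second query"]:
    hits = searcher.search(parser.parse(q), 10).scoreDocs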
Example #10
    def retrieve_sents(self):

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print("Found %d sentences (in %s) that matched query '%s':" % (len(scoreDocs), duration, query),
                  file=sys.stderr)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Example #11
    def more_like_this2(self, so_items):

        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        query = ""
        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)

            if so_item.doc:
                query += self.document_to_query(so_item.doc)

            query += self.code_as_text()

        if query:
            print "-" * 30
            print "UNified Query: %s" % query
            print "-" * 30
            try:
                like_query = queryparser.parse(query)

                hits = self.searcher.search(like_query, 10).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)

                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    print("file__", doc.get("file"), "file_content",
                          doc.get("file_content"), "line_numbers",
                          doc.get("line_numbers"))
                    file_path = "/extdsk/FaCoY/Git_data/G" + doc.get(
                        "file")[24:]
                    print(file_path)
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code

                        github_result.append(item)
                    #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
            except Exception as e:
                print "GitSearcher: Error: %s" % e
                print(traceback.format_exc())

        # print Counter(files).most_common(5)

        return github_result
Example #12
class SearchImgs(object):
    def __init__(self, store_dir, analyzer, preprocess = lambda x: x):
        '''
        Input: `store_dir`: directory storing the Lucene index
               `analyzer`: analyzer required to split the query
               `preprocess`: user-defined preprocess function
        '''
        # Initialize `IndexSearcher`
        self.dir = SimpleFSDirectory(File(store_dir).toPath())
        self.searcher = IndexSearcher(DirectoryReader.open(self.dir))
        self.preprocess = preprocess

        # Initialize `QueryParser`
        self.parser = QueryParser("description", analyzer)


    def search_command(self, command):
        '''
        Interface for other programs to search in a particular index.

        Input: `command`: raw query in the str format
        Output: list of documents found in the index
        '''
        command = self.preprocess(command)
        score_docs = self.search(command)
        return self.output(score_docs)


    def search(self, command):
        '''
        Search for the query in the Lucene index.

        Input: `command`: keyword to be searched
        Output: score_docs satisfying the requirement
        '''
        query = self.parser.parse(command)
        return self.searcher.search(query, 50).scoreDocs


    def output(self, score_docs):
        '''
        Highlight and return the search results.

        Input: `score_docs`: search results from the index
        Output: list of documents info found in the index,
                details includes `title`, `url` and `description` and `action_url`
        '''
        results = []
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            result = {
                'title': doc.get('url_title'),
                'url': doc.get('img_url'),
                'description': doc.get('description').replace(' ', ''),
                'action_url': doc.get('url')
            }
            results.append(result)
        return results
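Hypothetical wiring of SearchImgs; the index path, analyzer choice, and query are examples only:

analyzer = StandardAnalyzer()
engine = SearchImgs('index/imgs', analyzer, preprocess=lambda x: x.strip())
for item in engine.search_command('sunset beach'):
    print(item['title'], item['url'])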
    def more_like_this2(self, limit, item_doc, score_logs_for_each, user_query,
                        flag):  #flag = UQ(1) or not(0)
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
            # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None)

        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    if content:
                        item = BenchResultItem(doc.get("file"), content,
                                               matched_terms,
                                               hit.score, item_doc,
                                               doc.get("line_numbers"),
                                               hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.release()
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return bench_result, score_logs_for_each
Example #14
    def pairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = QueryParser("content_section", self.analyzer)
        query1 = parser.parse(QueryParser.escape(title))
        query2 = parser.parse(QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits
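A sketch of the clause semantics used above: a FILTER clause must match but contributes nothing to the score, while a SHOULD clause is optional and does contribute (field and terms are illustrative):

builder = BooleanQuery.Builder()
builder.add(TermQuery(Term("content_section", "lucene")), BooleanClause.Occur.FILTER)
builder.add(TermQuery(Term("content_section", "python")), BooleanClause.Occur.SHOULD)
combined = builder.build()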
Example #15
class SearchIndex(object):
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream(
                "contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream,
                                                     doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
Example #17
 def simpleSearchID(self, query, sim):
     """
     Method that searches through documents using only the id_section Field.
     searchDir : the path to the folder that contains the index.
     """
     # Now search the index:
     parser = QueryParser("id_section", self.analyzer)
     query = parser.parse(QueryParser.escape(query))
     self.searcher.setSimilarity(sim)
     hits = self.searcher.search(query, 6).scoreDocs
     return hits
    def query_parser_filter(self, field_values, field_filter=['Vector']):
        """
		Filtering queries according to field values
		:param field_values: values of the fields
		:param field_filter: fields to filter
		"""
        assert len(field_filter) == len(
            field_values), "Number of fields different from number of values"
        for i in range(len(field_filter)):
            query_parser = QueryParser(field_filter[i], self.analyzer)
            query = query_parser.parse(field_values[i])
            self.constrained_query.add(query, BooleanClause.Occur.FILTER)
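Hypothetical usage of query_parser_filter; it assumes the object keeps a BooleanQuery.Builder in constrained_query, which the add(..., Occur.FILTER) call implies:

engine.constrained_query = BooleanQuery.Builder()
engine.query_parser_filter(["glove"], field_filter=["Vector"])
filtered_query = engine.constrained_query.build()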
    def more_like_this3(self, limit, score_logs_for_each, user_query):
        query = ""
        bench_result = []
        # if not item_doc:
        # 	item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))
        # if item_doc.doc:
        # 	query += self.document_to_query(item_doc.doc)

        query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    score_logs_for_each += str(round(hit.score, 2)) + '\n'
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        # self.searchermgr.release(self.searcher)
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return bench_result, score_logs_for_each
    def query_parser_must(self, field_values, field_must=['Text']):
        """
		The values that the fields must match
		:param field_values: values of the fields
		:param field_must: fields that must match
		"""
        assert len(field_must) == len(
            field_values), "Number of fields different from number of values"
        for i in range(len(field_must)):
            query_parser = QueryParser(field_must[i], self.analyzer)
            query = query_parser.parse(field_values[i])
            self.constrained_query.add(query, BooleanClause.Occur.MUST)
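A sketch combining the two helpers above: MUST clauses constrain matching and contribute to scoring, FILTER clauses only constrain; engine, its searcher, and the field values are assumptions:

engine.constrained_query = BooleanQuery.Builder()
engine.query_parser_must(["neural networks"], field_must=["Text"])
engine.query_parser_filter(["glove"], field_filter=["Vector"])
query = engine.constrained_query.build()
hits = engine.searcher.search(query, 10).scoreDocs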
Example #21
class LuceneRanker(object):
    def __init__(self, tfidf_path, strict=True):
        lucene.initVM()
        analyzer = StandardAnalyzer()
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
        self.searcher = IndexSearcher(reader)

        self.parser = QueryParser("text", analyzer)
        self.parser.setDefaultOperator(QueryParser.Operator.OR)

    def closest_docs(self, query, k=1):
        """Closest docs by dot product between query and documents
        in tfidf weighted word vector space.
        """
        query = self.parser.parse(
            query.replace('/', '//').replace('?', '').replace('"', ''))
        hits = self.searcher.search(query, k)
        docids = []
        docs = []
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.searcher.doc(hit.doc)
            docs.append(unicode(doc['text']))
            docids.append(unicode(doc['title']))
        return docids, docs

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests multithreaded."""
        # get highest scoring document for multiple queries
        batch = []
        for i, q in enumerate(queries):
            if i % 100 == 0:
                print(i)

            t0 = time.time()
            docids, docs = self.closest_docs(q, k)
            batch.append((docids, docs))
        return batch

    def parse(self, query):
        return None

    def text2spvec(self, query):
        return None

    def get_doc_index(self, doc_id):
        return 0

    def get_doc_id(self, doc_index):
        return 0

    def __exit__(self, *args):
        pass
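Hypothetical usage of LuceneRanker; the index path and queries are examples only:

ranker = LuceneRanker('/path/to/tfidf_index')
docids, docs = ranker.closest_docs('who wrote hamlet', k=5)
batch = ranker.batch_closest_docs(['first question', 'second question'], k=3)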
Example #22
def define_search_params(STORE_DIR, FIELD_CONTENTS, TERM):

    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)

    # Get the analyzer
    analyzer = WhitespaceAnalyzer()
    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(FIELD_CONTENTS, analyzer)

    # Create the query
    query = queryParser.parse(TERM)
    return searcher, reader, query
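A possible call to define_search_params; directory, field, and term are made up:

searcher, reader, query = define_search_params('/tmp/index', 'contents', 'lucene')
for score_doc in searcher.search(query, 50).scoreDocs:
    print(searcher.doc(score_doc.doc))
reader.close()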
Example #23
def getQueryBuiler():
    # builder = QueryBuilder(analyzer)
    boolean_query = BooleanQuery.Builder()

    # print(args.search)

    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query
    
    for i in range(len(args.search)):
        curSearch = args.search[i].split(' ')

        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            # IntPoint range bounds must be ints, not the raw strings
            query = IntPoint.newRangeQuery(curSearch[2], int(curSearch[3]), int(curSearch[4]))
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate, upperDate, True, True)

        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            print('raise exception')
            # raise Exception
    # exit()
    # parser = QueryParser('method1', analyzer)
    # query = parser.parse('options')
    # boolean_query.add(query, BooleanClause.Occur.MUST)

    # parser = QueryParser('response_code', analyzer)
    # query = IntPoint.newRangeQuery('response_code', 200, 300)
    # boolean_query.add(query, BooleanClause.Occur.MUST)

    # lowerDate = handleDate("19/Jul/2020:05:40:00 +0000")
    # upperDate = handleDate("19/Jul/2020:06:45:04 +0000")
    # query = TermRangeQuery.newStringRange("date_time", lowerDate, upperDate, True, True)
    # boolean_query.add(query, BooleanClause.Occur.MUST)


    return boolean_query
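Hypothetical args.search values matching the parsing above; each entry is a space-separated string of occurrence, clause kind, field, and value(s):

args.search = [
    'must query method1 options',
    'should intrange response_code 200 300',
    'filter termrange date_time 19/Jul/2020:05:40:00 19/Jul/2020:06:45:04',
]
query = getQueryBuiler().build()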
Example #24
def retrieve_wiki(text_query, searcher, analyzer):    
    txt =text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", 
                            analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')    
    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS,
                                  analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(
                                QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(
                                self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(
                                first_movie=major_movie,
                                second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(
                                    first_movie=minor_movie,
                                    second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Example #27
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)

        return tables

    def close(self):
        self.directory.close()
        self.reader.close()
Example #28
    def doc_search(self, keywords):

        analyzer = StandardAnalyzer()
        parser = QueryParser('Title', analyzer)
        query = parser.parse(keywords)

        try:
            collector = TopScoreDocCollector.create(3000)
            self.lSearcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

        except RuntimeError:
            print "Score docoment run fail"
        self.hits = hits
        return hits
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer);

    query = parser.parse(qry);
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}
    
    f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
    def perform_search(self, searchterm):
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)

        query = parser.parse(searchterm)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start

        print scoreDocs
        print duration
Example #31
    def more_like_this(self, so_items):

        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)

            query = ""
            if so_item.doc:
                query = self.document_to_query(so_item.doc)

            query += self.code_as_text()
            if query:
                print "-" * 30
                print "Query: %s" % query
                print "-" * 30
                try:
                    like_query = queryparser.parse(query)

                    hits = self.searcher.search(like_query, 10).scoreDocs

                    for i, hit in enumerate(hits):
                        doc = self.searcher.doc(hit.doc)
                        matched_terms = self.get_matched_keywords2(
                            like_query, hit.doc)

                        # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]

                        item = GithubResultItem(doc.get("file"),
                                                decompress(
                                                    doc.get("file_content")),
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code

                        github_result.append(item)
                        #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
                except Exception as e:
                    print "Error: %s" % e

        # print Counter(files).most_common(5)

        return github_result
Example #32
    def more_like_this2(self, limit, item_doc, user_query,
                        flag):  #flag = UQ(1) or not(0)
        results = []
        query = ""
        if flag == 1:
            query += user_query
            # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None)

        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    temp += 1
                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms, hit.score,
                                                item_doc,
                                                doc.get("line_numbers"),
                                                hit.doc)
                        results.append(item)

            except Exception as e:
                print "GitHub Searcher Error: %s" % e
                print(traceback.format_exc())

        return results
Example #33
    def search_phrase(self, term, phrase):
        print('Phrase search')
        self.hits = []
        index_list = []
        parser = QueryParser('text', self.analyzer)
        query = parser.parse(term)

        hits = self.searcher.search(query, 1000).scoreDocs
        if hits is None:
            return

        for hit in hits:
            index = []
            doc = self.searcher.doc(hit.doc)
            text = doc.get("text")
            phrases = doc.get("phrase")

            # processing with saved text and phrase
            terms = text.split()
            phrases = phrases.split()
            flag = 1  # set to 0 if any occurrence of the term carries a different phrase tag
            index = []  # positions of the searched term in the text (there may be several)
            for i in range(len(terms)):
                if term == terms[i]:
                    index.append(i)
                    if not phrase == phrases[i]:
                        flag = 0
                        break
            if flag == 1:
                self.hits.append(text)
                index_list.append(index)
        self.recover_sentence(index_list)
        hits_copy = self.hits
        self.hits = []
        # add font tags for terms
        for hit in hits_copy:
            simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
            highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'text', hit)
            if highLightText is not None:
                self.hits.append(highLightText)

        return self.hits[:40]
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer);

    query = parser.parse(qry);
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    print rootdir
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        year = entry['publ_year']

        fname = short_title + CONTENT_EXT
        results[fname] = year
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    print os.path.abspath(os.path.pardir)
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer);

    query = parser.parse(qry);
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc);
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
        
    return results
Example #37
 def findTopClasses(self):
     propertyURI = RDFS.SUBCLASSOF
     allClasses = list()
     topClasses = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
         # for (String classUri : allClasses) {
         indexus = 0
         while indexus < len(allClasses):
             classUri = allClasses[indexus]
             logging.info("Checking whether " + classUri + " is a top class.")
             # search inst and pred retrieve class
             # if class exists that means it is not top class otherwise add to
             # topClasses
             classes = self.searchForClass(classUri, propertyURI)
             logging.info("top classes:" + str(len(classes)))
             if classes is not None and len(classes) > 0:
                 logging.info("This is not a top class...")
             else:
                 topClasses.append(classUri)
                 logging.info("Adding " + classUri + " to top classes.")
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return topClasses
Example #38
    def closest_docs(self, question_, k=5):
        """Closest docs by dot product between query and documents
        in tfidf weighted word vector space.
        """
        doc_scores = []
        doc_ids = []
        doc_texts = []
        words = self.parse(utils.normalize(question_))
        query = ' '.join(words)
        if not query:
            logger.warning('has no query!')
            return doc_ids, doc_scores, doc_texts

        # bq_builder = BooleanQuery.Builder()
        # title_query = TermQuery(Term("title", query))
        # # boosted_title_query = BoostQuery(title_query, 2)
        # bq_builder.add(TermQuery(Term("text", query)), BooleanClause.Occur.SHOULD)
        # bq_builder.add(title_query, BooleanClause.Occur.SHOULD)
        # lucene_query = bq_builder.build()

        # lucene_query = self.query_parser.parse(query, ["title", "text"],
        #                                        [BooleanClause.Occur.SHOULD, BooleanClause.Occur.MUST],
        #                                        self.analyzer)
        # lucene_query = 'title:"{0}"^2 OR "{0}"'.format(query)

        self.env.attachCurrentThread()
        query_parser = QueryParser("text", self.analyzer)
        search_results = self.searcher.search(query_parser.parse(query),
                                              k).scoreDocs
        for search_result in search_results:
            doc = self.searcher.doc(search_result.doc)
            doc_id = doc["id"] + ", title=" + doc["title"]
            doc_score = search_result.score
            text = doc["text"]
            doc_ids.append(doc_id)
            doc_scores.append(doc_score)
            doc_texts.append(text)
            # print('id:', doc_id, 'ds:', doc_score, 'text:', text)
        # logger.debug('question_d:%s, query:%s, doc_ids:%s, doc_scores:%s'
        #              % (question_, query, doc_ids, doc_scores))
        return doc_ids, doc_scores, doc_texts
def search(termo, **args):
	
	indexDir = os.environ.get('MANDEX') or '3iteracao'
	fsDir = SimpleFSDirectory(File(indexDir))
	searcher = IndexSearcher(DirectoryReader.open(fsDir))
	
	analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
	parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
	parser.setDefaultOperator(QueryParser.Operator.OR)
	query = parser.parse(termo + ' '.join(args.values()))
	start = datetime.now()
	scoreDocs = searcher.search(query, 50).scoreDocs
	duration = datetime.now() - start

	politicos = []
	for scoreDoc in scoreDocs:	    
	    doc = searcher.doc(scoreDoc.doc)
	    table = dict((field.name(), field.stringValue()) for field in doc.getFields())	   
	    politicos.append(table)

	return politicos
Example #40
def get_document_vector(searcher, reader, document_id, \
      id_field, text_field):
    '''
    Given a document id, fetch the tf-idf vector of the document.
    '''
    tc_dict = {}  # Counts of each term
    dc_dict = {}  # Number of docs associated with each term
    tfidf_dict = {}  # TF-IDF values of each term in the doc
    # Get the document id.
    query_parser = QueryParser(id_field, WhitespaceAnalyzer())
    score_docs = searcher.search(query_parser.parse(str(document_id)),
                                 1).scoreDocs
    if len(score_docs) > 0:
        # get the tf-idf vector.
        termVector = reader.getTermVector(score_docs[0].doc, text_field)
        termsEnumvar = termVector.iterator()
        termsref = BytesRefIterator.cast_(termsEnumvar)
        N_terms = 0
        try:
            while (termsref.next()):
                termval = TermsEnum.cast_(termsref)
                fg = termval.term().utf8ToString()  # Term in unicode
                if len(fg) > 3 and not fg.isdigit():
                    tc = termval.totalTermFreq()  # Term count in the doc

                    # Number of docs having this term in the index
                    dc = reader.docFreq(Term(text_field, termval.term()))
                    N_terms = N_terms + 1
                    tc_dict[fg] = tc
                    dc_dict[fg] = dc
        except:
            print('error in term_dict')

        # Compute TF-IDF for each term
        for term in tc_dict:
            tf = float(tc_dict[term]) / N_terms  # float() guards against integer division under Python 2
            idf = 1 + math.log(reader.numDocs() / (dc_dict[term] + 1))
            tfidf_dict[term] = tf * idf

    return tfidf_dict
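A worked example of the weighting above (numbers made up): a term occurring 4 times among 200 kept terms, appearing in 10 of 1000 indexed documents:

import math
tf = 4.0 / 200                          # 0.02
idf = 1 + math.log(1000 / (10 + 1.0))   # ~5.51
print(tf * idf)                         # ~0.11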
Example #41
    def search(self, term, window=2):
        self.hits = []
        index_list = []
        sort_para = term

        parser = QueryParser('text', self.analyzer)
        query = parser.parse(term)
        print(query)

        # Jump to multi-terms search if there are several words
        if self.multi_terms(query):
            self.search_multi_terms(query)
            return self.hits[:40]

        hits = self.searcher.search(query, 1000).scoreDocs

        for hit in hits:
            index = []
            doc = self.searcher.doc(hit.doc)
            text = doc.get("text")
            self.hits.append(text)
            # save indexes of target term in each document
            terms = text.split()
            for i in range(len(terms)):
                if term == terms[i]:
                    index.append(i)
            index_list.append(index)

        self.recover_sentence(index_list, window)
        hits_copy = self.hits
        self.hits = []
        for hit in hits_copy:
            # prefixHTML / suffixHTML are assumed to be module-level markup constants
            simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
            highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'text', hit)
            if highLightText is not None:
                self.hits.append(highLightText)
        print('search over')
        return self.hits[:40]
Example #42
    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Scoring documents failed"
                hits = []
            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Scoring documents failed"
                hits = []
            self.hits = hits
            self.field = field
            return hits
Example #43
	def more_like_this2(self, limit, score_logs_for_each, user_query, flag):
		bench_result = []
		query = ""
		if flag == 1:
			query += user_query

		query = remove_unified_stop_lists(query)
		queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer)
		if query:
			try:
				parsed_query = queryparser.parse(query)
				hits = self.searcher.search(parsed_query, limit).scoreDocs
				temp = 1
				for i, hit in enumerate(hits):
					doc = self.searcher.doc(hit.doc)
					# assumes a fixed directory layout: the 10th path component
					# (index 9) is the file name
					matched = doc.get('file').split('/')[9].split('.')[0]
					score_logs_for_each += str(matched) + '\t' + str(round(hit.score, 2)) + '\n'
					matched_terms = self.get_matched_keywords2(parsed_query, hit.doc)
					temp += 1

					file_path = doc.get("file")
					content = None
					try:
						with open(file_path) as f:
							content = f.read()
					except IOError:
						# unreadable or missing source files are skipped
						pass

					if content:
						item = BenchResultItem_UQ(doc.get("file"), content, matched_terms, hit.score, doc.get("line_numbers"), hit.doc)
						bench_result.append(item)

			except Exception as e:
				print "BenchSearcher Error: %s" % e
				print(traceback.format_exc())

		return bench_result, score_logs_for_each
Example #44
    def extract_phrase_query(self, q, field, slop=0, boost=5):
        phrases = re.findall(r'"([^"]*)"', q)
        if len(phrases) == 0:
            return None, q

        q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
        if self.verbose:
            print "Detected phrases: ", phrases

        bq = BooleanQuery()
        for phrase in phrases:
            # pq = PhraseQuery()
            # for term in filter(None, phrase.split(' ')):
            #     pq.add(Term(field, term))
            qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
            # parse phrase - this may or may not be desired
            # pq = qparser.parse(field + ':"' + phrase + '"')
            pq = qparser.parse('%s "%s"~%d^%.1f' %
                               (phrase, phrase, slop, boost))
            # phrase queries have high priority
            bq.add(pq, BooleanClause.Occur.MUST)
            # bq.add(pq, BooleanClause.Occur.SHOULD)

        return bq, q
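
A sketch of how a caller inside the same class might combine the returned phrase query with the leftover free terms, using the same pre-5.0 API as the snippet (searcher is assumed to be available):

bq, remainder = self.extract_phrase_query('"machine learning" tutorial', 'text')
if bq is None:
    # no quoted phrases; parse the raw query instead
    bq = QueryParser(Version.LUCENE_CURRENT, 'text', self.analyzer).parse(remainder)
elif remainder:
    # append the remaining free terms as an optional clause
    parser = QueryParser(Version.LUCENE_CURRENT, 'text', self.analyzer)
    bq.add(parser.parse(remainder), BooleanClause.Occur.SHOULD)
hits = searcher.search(bq, 50).scoreDocs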
Example #45
    def more_like_this2(
        self, item_doc, result_num
    ):  # Walk the incoming question doc, build the final query, and search
        # the question index for similar entries.
        similar_questions = []
        if not item_doc:
            # fall back to a placeholder result item
            item_doc = ResultItem(None, 1.0, "No Title", 0)
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)
        queryparser = QueryParser(Version.LUCENE_CURRENT, "term",
                                  self.analyzer)

        if query:  # at this point the unified query has already been tokenized and stemmed
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(
                    like_query,
                    result_num).scoreDocs  # top results per similar question (e.g. 3 each from 3 questions = 9 total)

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    similar_questions.append(doc.get("question_id"))

            except Exception as e:
                print "Question Searcher: Error: %s" % e
                # write_search_log("Question Searcher: Error: %s" % e + "\n")
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        # self.searchermgr.release(self.searcher)
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return similar_questions
Example #46
def get_doc_list(TERM, searcher, reader):

    FIELD_CONTENTS = "text"
    DOC_NAME = "identifier"
    STORE_DIR = "./full_index1"

    # Get the analyzer
    analyzer = WhitespaceAnalyzer()
    
    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(FIELD_CONTENTS, analyzer)
    
    # Create the query
    query = queryParser.parse(TERM)
    
    #lucene.initVM()
    #searcher, reader, query = define_search_params(STORE_DIR, FIELD_CONTENTS, TERM)
    
    # fieldInfos = MultiFields.getMergedFieldInfos(reader)
    # print(fieldInfos)
    # for fieldInfo in fieldInfos.iterator():
    # print(fieldInfo.name)
    # Run the query and get documents that contain the term
    return searcher.search(query, reader.numDocs())
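
Note that get_doc_list returns Lucene's TopDocs object rather than a list of documents; a short usage sketch with the field names from the snippet:

top_docs = get_doc_list('wicked', searcher, reader)
for score_doc in top_docs.scoreDocs:
    doc = searcher.doc(score_doc.doc)
    print(doc.get('identifier'), score_doc.score)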
class TASearcher():
    def __init__(self, queries=[], criteria=[], conjunctions=[], orderby=["ta"], ascending=True, limit=10000):
        vm.attachCurrentThread()

        self.queries = [query for query in queries if len(query.strip()) > 0]
        self.criteria = criteria
        self.conjunctions = conjunctions
        self.orderby = orderby
        self.ascending = ascending
        self.queryString = ""
        self.limit = limit

        self.fields = fields  # 'fields' is assumed to be defined at module level
        self.analyzer = PorterStemmerAnalyzer()
        self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer)
        self.queryParser.setAllowLeadingWildcard(True)
        self.queryParser.setDefaultOperator(QueryParser.Operator.AND)
        indexDir = settings.LUCENE_INDEX_DIRECTORY
        self.index = MMapDirectory(File(indexDir))

    def createQueryString(self):
        # Simple
        if len(self.criteria) == 0:
            self.queryString = "(%s) OR freetext-normalized:(%s)" % (self.queries[0], self.queries[0])
        # Advanced
        else:
            queryPairs = []
            criteriaQueries = zip(self.criteria, self.queries)
            self.criteria = dict(criteriaQueries).keys()
            for criterion, query in criteriaQueries:
                if criterion in ("volume", "number", "category-label", "pubtype", "author-sort"):
                    queryPairs.append("%s:%s" % (criterion, query))
                elif criterion == "year":
                    queryPairs.append("year-start:%s OR year-end:%s" % (query, query))
                else:
                    queryPairs.append('%s:%s OR %s-normalized:%s' % (criterion, query, criterion, query))
            # queryPairs = ["%s:%s"%(criterion,query.replace(" ", "+")) for criterion, query in zip(criteria, queries)]
            try:
                queryString = "%s %s" % (queryPairs[0], " ".join(
                    ["%s (%s)" % (conj, pair) for conj, pair in zip(self.conjunctions, queryPairs[1:])]))
                self.queryString = queryString
                return queryString
            except IndexError:
                # no usable query pairs were built; fall back to a bare field name
                self.queryString = "freetext"
                return self.queryString

    def getQueryString(self):
        return self.queryString

    def _getHits(self):
        reader = IndexReader.open(self.index)
        searcher = IndexSearcher(reader)

        # sort by volume and entry number (treat the values as integers)
        sortDict = {
            "ta": (("volume", SortField.Type.INT), ("number", SortField.Type.INT)),
            "year": (("year-start", SortField.Type.INT), ("year-end", SortField.Type.INT)),
            "author-title": (("author-sort", SortField.Type.STRING), ("title-sort", SortField.Type.STRING)),
            "title": (("title-sort", Locale.GERMAN),),
            "author": (("author-sort", Locale.GERMAN),),
        }

        sortFields = []

        reverse = not self.ascending

        for name in self.orderby:
            for fieldName, typeNum in sortDict.get(name, []):
                sortFields.append(SortField(fieldName, typeNum, reverse))

        if len(sortFields) == 0:
            sortFields = [SortField("volume", SortField.Type.INT), SortField("number", SortField.Type.INT)]

        sort = Sort(sortFields)

        topDocs = searcher.search(self.query, None, 80000, sort)
        hits = topDocs.scoreDocs
        self.hits = hits
        self.searcher = searcher

        lang = translation.get_language()
        if lang != "de":
            lang = "en"

        facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}}

        # Highlighting
        highlighter = Highlighter(SimpleHTMLFormatter('<span class="highlight">', '</span>'), QueryScorer(self.query))

        hitObjects = []
        for hit in hits:
            fields = {}  # fresh dict per hit
            doc = searcher.doc(hit.doc)
            # print unicode(doc)
            fields["score"] = hit.score
            fields["volume"] = doc["volume"]
            fields["number"] = doc["number"]
            fields["id"] = doc["id"]
            fields["title"] = doc["title"]
            fields["author"] = doc["author"]
            fields["authors"] = [field.stringValue() for field in doc.getFields("author")]
            for author in fields["authors"]:  # XXX
                facets["author"][author] = facets["author"].get(author, 0) + 1  # XXX

            fields["categories"] = [field.stringValue() for field in doc.getFields("category-%s" % lang)]
            for cat in fields["categories"]:
                facets["category-%s" % lang][cat] = facets["category-%s" % lang].get(cat, 0) + 1
            maxNumFragmentsRequired = 2
            fragmentSeparator = "...";
            pubtype = doc["pubtype"]
            fields["pubtype"] = pubtype
            facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1
            fields["city"] = doc["city"]
            fields["year"] = doc["year-start"]
            if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]:
                fields["year"] += " - " + doc["year-end"]
            highlightFields = ("title", "author", "city", "year", "category")

            if "freetext" in self.criteria:
                for fieldName in highlightFields:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        continue

            for fieldName in highlightFields:
                if fieldName in self.criteria or fieldName + "-de" in self.criteria or fieldName + "-en" in self.criteria:
                    try:
                        tokenStream = self.analyzer.tokenStream(fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(tokenStream, fields[fieldName], maxNumFragmentsRequired,
                                                              fragmentSeparator)
                        if len(newVal) > 0:
                            # fields[fieldName] = re.sub(r'</span>\s*<span class="highlight">', ' ', newVal)
                            fields[fieldName] = newVal
                    except:
                        continue
            """if "author" in self.criteria:
                try:
                    tokenStream = self.analyzer.tokenStream("author", lucene.StringReader(fields["author"]))
                    fields["author"] = highlighter.getBestFragments(tokenStream, fields["author"], maxNumFragmentsRequired, fragmentSeparator)
                except:
                        pass"""

            hitObjects.append(
                Hit(fields["id"], fields["volume"], fields["number"], fields["title"], fields["author"], fields["city"],
                    fields["year"], fields["categories"], fields["pubtype"], fields["score"]))

        facetsToDelete = []
        for facet in facets:
            if len(facets[facet]) < 2:
                facetsToDelete.append(facet)
                continue
            values = sorted(facets[facet].items(), key=itemgetter(0))
            values = sorted(values, key=itemgetter(1), reverse=True)
            facets[facet] = values[:25]
        for facet in facetsToDelete:
            del facets[facet]
        self.facets = facets
        reader.close()
        self.hitObjects = hitObjects
        return hitObjects

    def search(self):
        self.createQueryString()
        querystr = self.getQueryString()
        self.query = self.queryParser.parse(querystr)
        return self._getHits()

    def getAll(self):
        self.query = MatchAllDocsQuery()
        return self._getHits()
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene.
    2004 by Yura Smolsky ;)
    """

    FIELD_NAME = "contents"
    texts = [ "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem.  Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented."
              "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem."
              "From http://cognexus.org/id42.htm"
              "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems.  Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches."
              "This text has a typo in referring to whicked problems" ];

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)

        self.parser = QueryParser(Version.LUCENE_CURRENT, self.FIELD_NAME,
                                  StandardAnalyzer(Version.LUCENE_CURRENT))

    def setUp(self):
        super(HighlighterTestCase, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)

        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0

    def testSimpleHighlighter(self):

        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2

        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  "...")
            print "\t", result

        # Not sure we can assert anything here - just running to check we don't
        # throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):

        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found"
                      %(self.numHighlights)))
        
    def doSearching(self, queryString):

        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)

        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):
        
        formatter = TestFormatter(self)
        
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME,
                                                    StringReader(text))

            result = highlighter.getBestFragments(tokenStream,
                                                  text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result
            
    def countHighlightTerm(self):

        self.numHighlights += 1 # update stats used in assertions
        
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
def run(command,searcher, aWrapper):

    print

    if command == '':
        return
    
    #debug
    #print "Searching for:"+command

    #query = MultiFieldQueryParser(Version.LUCENE_CURRENT,['subject_id','summary'],analyzer).parse(command); 
    #query = MultiFieldQueryParser.parse(command,['subject_id','summary'],analyzer); 

    #'''
    #MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer)
    #'''
    #parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, JArray('string')(['subject_id','summary']),analyzer)
    #query = MultiFieldQueryParser.parse(parser, command_jarr)

    # create a QueryParser; the default search field is "title"
    parser = QueryParser(Version.LUCENE_CURRENT, "title", aWrapper)
    # a PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing and query parsing
    query = parser.parse(command)

    print query.toString()

    #test the analyzerWrapper
    #printTokens(aWrapper,command,'title')
    #printWrappedAnalyzer(aWrapper)

    # rank all matching docs by relevance
    #sortField = SortField('boost',SortField.Type.FLOAT,True)  # True means descending
    #sort = Sort(sortField)
    '''
    Error with:
    > query = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
    > ["payload","subject"], analyzer).parse(command)

    I think there's a bug with the method binding.  MultiFieldQueryParser has several static parse
    methods, plus the inherited regular method from QueryParser.  It looks like all of them are
    being resolved as if they were static.  As a workaround, you can call it like this:

    parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT, ["payload","subject"],
    analyzer)
    lucene.MultiFieldQueryParser.parse(parser, command)
    '''

    #occ=[BooleanClause.Occur.SHOULD , BooleanClause.Occur.SHOULD]
    #query = MultiFieldQueryParser.parse(command_list,['subject_id','summary'],occ,analyzer)

    #query = QueryParser(Version.LUCENE_CURRENT, FIELD,analyzer).parse(command)

    #scoreDocs = searcher.search(query, 50,sort).scoreDocs
    scoreDocs = searcher.search(query, 50).scoreDocs




    # retList = []
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     score = scoreDoc.score
    #     #print 'subject_id:', doc.get('subject_id')
    #     #print 'title:', doc.get('title')

    #     tmpDict = {
    #     'subject_id':doc.get('subject_id'),
    #     'title':doc.get('title'),
    #     'directors':doc.get('directors'),
    #     'summary':doc.get('summary'),
    #     'image_small':doc.get('image_small'),
    #     'boost':doc.get('boost'),
    #     'user_tags':doc.get('user_tags'),
    #     'year':doc.get('year'),
    #     'score':score
    #     }
    #     retList.append(tmpDict)

    maxDict = utils.maxDict
    movieDictList = utils.scoreDocs2dictList(scoreDocs, searcher)
    retList = utils.reRank(movieDictList, maxDict, command)


    # manual re-ranking
    #retList = sorted(retList, key=operator.itemgetter('boost'), reverse=True)

    del searcher
    return retList
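
The long docstring above quotes a known PyLucene quirk: all of MultiFieldQueryParser's parse() overloads resolve as if they were static, so calling parse() on an instance fails. A minimal sketch of the workaround the quote describes (the field names are illustrative):

parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                               ['title', 'summary'], analyzer)
# call parse through the class, passing the parser instance explicitly
query = MultiFieldQueryParser.parse(parser, command)
scoreDocs = searcher.search(query, 50).scoreDocs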
Example #50
 def searchStemFirst(self, annotation):
     annotations = list()
     pocString = QueryParser.escape(annotation.getText())
     preparePocStringOriginal = "\"" + pocString + "\""
     preparePocStringLowercase = "\"" + pocString.lower() + "\""
     try:
         maxSynonyms = 0
         # Analyzer stemmedAnalyser =
         # AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
         # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
         # synonymMap, maxSynonyms);
         stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
         analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
         stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser)
         query = stemParser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         stemHits = result.scoreDocs
         allHits = stemHits
         # if(stemHits.length == 0) {
         # search lowercased exact
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
         query = parser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         lowHits = result.scoreDocs
         allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(lowHits) # ArrayUtils.addAll(allHits, lowHits)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         # }
         # if(allHits.length == 0) {
         # search exact
         exactParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
         query = exactParser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(result.scoreDocs) #ArrayUtils.addAll(allHits, result.scoreDocs)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         # }
         # for (ScoreDoc hit : allHits) {
         indexus = 0
         while indexus < len(allHits):
             hit = allHits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             ann = Annotation()
             features = dict()
             features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
             features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
             features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
             features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
             features["score"] = hit.score
             ann.setFeatures(features)
             ann.setEndOffset(annotation.getEndOffset())
             ann.setStartOffset(annotation.getStartOffset())
             ann.setSyntaxTree(annotation.getSyntaxTree())
             ann.setText(annotation.getText())
             annotations.append(ann)
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return annotations
Example #51
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print template.substitute(table)
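
CustomTemplate only switches string.Template's placeholder prefix from $ to #, so a --format value can reference stored fields directly. A small standalone illustration (field names hypothetical):

from string import Template

class CustomTemplate(Template):
    delimiter = '#'

template = CustomTemplate('#title -- #keywords')
print template.substitute({'title': 'Lucene in Action', 'keywords': 'lucene search'})
# prints: Lucene in Action -- lucene search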
Example #52
# imports assumed for the package-style (4.x) PyLucene API used below
import os
import sys

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version


# This script will prepare a CSV for Cassandra DB
if __name__ == '__main__':
    lucene.initVM()

    base_dir = os.path.abspath(os.path.curdir)
    index_file = os.path.join(base_dir, INDEX_DIR)  # INDEX_DIR is assumed to be defined elsewhere
    store = SimpleFSDirectory(File(index_file))

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    reader = DirectoryReader.open(store)  # DirectoryReader replaces IndexReader.open in 4.x
    searcher = IndexSearcher(reader)

    query_parser = QueryParser(Version.LUCENE_CURRENT, "netflix_id", analyzer)

    with open(sys.argv[1], 'r') as ratings:
        for line in ratings:
            user_id, netflix_id, score = line.split(",")

            query = query_parser.parse(netflix_id)

            scoreDocs = searcher.search(query, 1).scoreDocs
            if scoreDocs:
                doc = searcher.doc(scoreDocs[0].doc)
                film_id = doc.getField("id").stringValue()
                print "{0},{1},{2}".format(user_id, film_id, score),
Example #53
File: views.py  Project: asxzy/weiso
def search(request):
    query = request.GET.get('q', None)
    page = int(request.GET.get('page', 1))
    perPage = 5
    nodes = []
    usage = {}
    usage["time"] = time.time()

    if not query:
        count = 0
        nodes = []
        keywords = []
    else:
        #conn = ReplicaSetConnection('localhost', replicaSet='jlu')
        conn = MongoClient('localhost')
        db = conn.sina
        #db.read_preference = ReadPreference.SECONDARY
        CACHE = db.cache
        keywords = query.split(' ')
        cache = CACHE.find_one({"query":keywords,"page":page})
        if cache is None:
            print "query cache not found"
            VM_ENV.attachCurrentThread()
            fsDir = SimpleFSDirectory(File(settings.ROOT_DIR+'/index'))
            searcher = IndexSearcher(DirectoryReader.open(fsDir))

            analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, 'text', analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            lucene_query = parser.parse(query)

            scoreDocs = searcher.search(lucene_query, 3000000).scoreDocs


            ids = []

            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                for field in doc.getFields():
                    ids.append(field.stringValue())
            print "got ids from lucene",len(ids)

            ids = [int(x) for x in ids]
            NODES = conn.sina.nodes
            count = 0
            for n in NODES.find({"node_id":{"$in":ids}}).sort("in_degree",-1).skip((page-1)*perPage):
                count += 1
                print "doing",n["node_id"],count,"/",perPage
                n["js"] = similarity(n["node_id"],topk=10)
                nodes.append(n)
                if len(nodes) == perPage:
                    break
            count = len(ids)
            CACHE.insert({"query":keywords,"page":page,"cache":nodes,"count":len(ids)})
            usage["isCache"] = False
        else:
            print "found query cache"
            usage["isCache"] = True
            nodes = cache["cache"]
            count = cache["count"]
        pagenav = {}
        if page == 1:
            pagenav["has_pre"] = None
        else:
            pagenav["has_pre"] = page - 1
        if page > count/perPage:
            pagenav["has_next"] = None
        else:
            pagenav["has_next"] = page + 1
        pagenav["page"] = page
        usage["time"] = time.time() - usage["time"]

    return {
        'q' : request.GET.get('q', ''),
        'keywords' : keywords,
        'nodes' : nodes,
        'count' : count,
        'page' : pagenav,
        'usage' : usage,
    }
def run(command, searcher, aWrapper, use_custom_parser=False, debug=False):


    if not os.path.isdir(query_log_dir):  # query_log_dir is assumed to be a module-level setting
        os.mkdir(query_log_dir)

    search_start = time.time()
    query_log_file = os.path.join(query_log_dir, 'query_log.%s' % datetime.now().strftime('%Y-%m-%d'))
    fw = open(query_log_file, 'a+')
    cur_time = datetime.now()
    cur_time = cur_time + timedelta(hours=8)  # shift to China time (UTC+8); the server clock is in another zone
    fw.write('\n*********query-log,time=%s*************\n' % cur_time.strftime('%Y-%m-%d %H:%M:%S') )
    fw.write('raw_str=%s\n' % unicode_to_str(command))

    if command == '':
        return

    #debug
    #print "Searching for:"+command

    #query = MultiFieldQueryParser(Version.LUCENE_CURRENT,['subject_id','summary'],analyzer).parse(command); 
    #query = MultiFieldQueryParser.parse(command,['subject_id','summary'],analyzer); 

    #'''
    #MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer)
    #'''
    #parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, JArray('string')(['subject_id','summary']),analyzer)
    #query = MultiFieldQueryParser.parse(parser, command_jarr)

    if debug:
        print 'before query parser: ', command

    command = custom_parser.parse(command) if use_custom_parser else command
    fw.write('parsed_str=%s\n' % unicode_to_str(command))
    if debug:
        print 'after query parser: ', command
    # create a QueryParser; the default search field is "title"
    parser = QueryParser(Version.LUCENE_CURRENT, "title", aWrapper)
    # a PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing and query parsing
    query = parser.parse(command)
    if debug:
        print 'after lucene QueryParser: ', query.toString().encode('utf8')
    fw.write('lucene_str=%s\n' % unicode_to_str(query.toString()))
    #test the analyzerWrapper
    #printTokens(aWrapper,command,'title')
    #printWrappedAnalyzer(aWrapper)

    # rank all matching docs by relevance
    #sortField = SortField('boost',SortField.Type.FLOAT,True)  # True means descending
    #sort = Sort(sortField)
    '''
    Error with:
    > query = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
    > ["payload","subject"], analyzer).parse(command)

    I think there's a bug with the method binding.  MultiFieldQueryParser has several static parse
    methods, plus the inherited regular method from QueryParser.  It looks like all of them are
    being resolved as if they were static.  As a workaround, you can call it like this:

    parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT, ["payload","subject"],
    analyzer)
    lucene.MultiFieldQueryParser.parse(parser, command)
    '''

    #occ=[BooleanClause.Occur.SHOULD , BooleanClause.Occur.SHOULD]
    #query = MultiFieldQueryParser.parse(command_list,['subject_id','summary'],occ,analyzer)

    #query = QueryParser(Version.LUCENE_CURRENT, FIELD,analyzer).parse(command)

    #scoreDocs = searcher.search(query, 50,sort).scoreDocs
    retN = 50
    start_time = time.time()
    scoreDocs = searcher.search(query, retN).scoreDocs
    cost_time = time.time() - start_time




    # retList = []
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     score = scoreDoc.score
    #     #print 'subject_id:', doc.get('subject_id')
    #     #print 'title:', doc.get('title')

    #     tmpDict = {
    #     'subject_id':doc.get('subject_id'),
    #     'title':doc.get('title'),
    #     'directors':doc.get('directors'),
    #     'summary':doc.get('summary'),
    #     'image_small':doc.get('image_small'),
    #     'boost':doc.get('boost'),
    #     'user_tags':doc.get('user_tags'),
    #     'year':doc.get('year'),
    #     'score':score
    #     }
    #     retList.append(tmpDict)

    maxDict = utils.maxDict
    movieDictList = utils.scoreDocs2dictList(scoreDocs, searcher)
    retList = utils.reRank(movieDictList, maxDict, command)

    # manual re-ranking
    #retList = sorted(retList, key=operator.itemgetter('boost'), reverse=True)
    fw.write('***********return list(search/total=%.2fs/%.2fs)***************\n' % (cost_time, time.time() - search_start))
    for r in retList:
        line = '%s: %s, boost->%s||score=%s\n' % (r['subject_id'], r['title'], r['boost'], r['score'])
        fw.write(unicode_to_str(line))
    fw.write('**************************************************************\n\n')

    del searcher
    return retList[:20] if debug else retList
Example #55
 def searchIndex(self, annotation, specialTreatment):
     if specialTreatment:
         return self.searchStemFirst(annotation)
     annotations = list() #ArrayList[Annotation]()
     try:
         maxSynonyms = 0
         stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
         # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
         # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
         # synonymMap, maxSynonyms);
         analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
         pocString = QueryParser.escape(annotation.getText())
         preparePocString = "\"" + pocString + "\""
         preparePocStringLowercase = "\"" + pocString.lower() + "\""
         query = parser.parse(preparePocString)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         if freq <= 0:
             # search lowercased exact
             lowerCasedParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
             query = lowerCasedParser.parse(preparePocStringLowercase)
             # logging.info("Searching for: " + query.toString());
             result = self._searcher.search(query, 1)
             freq = result.totalHits
             if freq > 0:
                 result = self._searcher.search(query, freq)
             hits = pyJava.JArray2List(result.scoreDocs)
             logging.debug("For " + str(query) + " : " + str(result.totalHits))
         if len(hits) == 0 and " " not in preparePocStringLowercase:
             # search stemmed
             stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser)
             query = stemParser.parse(preparePocStringLowercase)
             # logging.info("Searching for: " + query.toString());
             result = self._searcher.search(query, 1)
             freq = result.totalHits
             if freq > 0:
                 result = self._searcher.search(query, freq)
             hits = pyJava.JArray2List(result.scoreDocs)
             logging.info("For " + str(query) + " : " + str(result.totalHits))
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             ann = Annotation()
             features = dict()
             features[FreyaConstants.CLASS_FEATURE_LKB]=doc.get(FreyaConstants.CLASS_FEATURE_LKB)
             features[FreyaConstants.INST_FEATURE_LKB]=doc.get(FreyaConstants.INST_FEATURE_LKB)
             features[FreyaConstants.PROPERTY_FEATURE_LKB]=doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
             features["string"]=doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
             features[FreyaConstants.SCORE]=hit.score
             ann.setFeatures(features)
             ann.setEndOffset(annotation.getEndOffset())
             ann.setStartOffset(annotation.getStartOffset())
             ann.setSyntaxTree(annotation.getSyntaxTree())
             ann.setText(annotation.getText())
             annotations.append(ann)
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return annotations
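
The method above tries three fields in a fixed order -- exact content, lowercased exact content, then stemmed content (the stemmed pass only for single-word strings) -- and stops at the first level with hits. A condensed sketch of that fallback logic, assuming the same pre-5.0 API and parsers built as above:

def search_with_fallback(searcher, text, parsers):
    # parsers is an ordered list, e.g. [exactParser, lowerCasedParser, stemParser]
    phrase = '"' + QueryParser.escape(text).lower() + '"'
    for parser in parsers:
        query = parser.parse(phrase)
        total = searcher.search(query, 1).totalHits
        if total > 0:
            return searcher.search(query, total).scoreDocs
    return []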