Example #1
 def search(self, field: str):
     sear = self._search
     if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
         query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
     elif self._commandInfo.getKey()[0] == '#':
         query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
         query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
         bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
         bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
         query = BooleanQuery.Builder().add(bc1).add(bc2).build()
     elif self._commandInfo.getKey()[0] in ['$', '+']:
         bq = BooleanQuery.Builder()
         for w in self._commandInfo.getWordList():
             queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
             bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
             bq.add(bc)
         query = bq.build()
     else:
         query = ''
     hits = sear.search(query, 999999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         id = doc.get(field+'_id')
         if doc_hit(res, self._commandInfo):
             sentences = re.split('[!?!?。]', res)
             # drop empty strings produced by the split
             sentences = [s for s in sentences if s != '']
             for sentence in sentences:
                 if key_filter(self._commandInfo, sentence):
                     self._doc[id] = res
                     self._resultSentencesList.append((id, sentence))
     return self
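
These snippets all presuppose a running JVM and an open IndexSearcher. A minimal setup sketch for recent PyLucene versions (the index path and field name are placeholders; many of the older examples below pass Version.LUCENE_CURRENT and File(...) instead of Paths):

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()  # start the JVM once per process

directory = SimpleFSDirectory(Paths.get("index"))  # placeholder index path
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser("text", StandardAnalyzer()).parse("hello world")
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc).get("text"))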
Example #2
    def search_pair_score_multithread_part(self, q_doc_int):

        # print(q_doc_int)
        spl=q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
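
The try/except above guards a step that recurs in many of these examples: raw user text containing AND, OR, or NOT is parsed as query syntax by QueryParser. A sketch of that defensive pattern extracted into a helper (the name safe_parse is our own):

from org.apache.lucene.queryparser.classic import QueryParser

def safe_parse(parser, q, fallback='dummy'):
    # Neutralize boolean operators, escape special characters, and fall
    # back to a harmless query if parsing still fails.
    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
    try:
        return parser.parse(QueryParser.escape(q))
    except Exception:
        return parser.parse(fallback)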
Example #3
    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)

            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)

                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })

            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))

            queries_result.append(query_results)

        return queries_result
Example #4
    def more_like_this2(self, item_doc, result_num):
        similar_questions = []
        if not item_doc:
            item_doc = ResultItem(None, 1.0, "No Title", 0)
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)
        queryparser = QueryParser(Version.LUCENE_CURRENT, "term",
                                  self.analyzer)

        if query:
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query, result_num).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    similar_questions.append(doc.get("question_id"))

            except Exception as e:
                print "Question Searcher: Error: %s" % e
                # write_search_log("Question Searcher: Error: %s" % e + "\n")
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        # self.searchermgr.release(self.searcher)
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return similar_questions

Example #5
def get_evidence(searcher, analyzer, claim):
    escaped_string = QueryParser.escape(claim)
    query = QueryParser("text", analyzer).parse(escaped_string)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    claim = nlp(claim)
    claim_evid = []
    line_no = []
    sim_score = []
    final_evidence = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        norm_doc = doc.get("text")
        norm_doc = nlp(norm_doc)
        val = claim.similarity(norm_doc)
        try:
            int(doc.get("Sno"))
            claim_evid.append(doc.get("keyterm"))
            line_no.append(int(doc.get("Sno")))
            sim_score.append(val)
        except ValueError:
            pass  # skip documents without a numeric Sno
        
    if len(sim_score)>5:
        for val in range(0,5):
            index = sim_score.index(max(sim_score))
            claim = claim_evid.pop(index)
            line = line_no.pop(index)
            final_evidence.append([claim , line])
            del sim_score[index]
    else:
        for i in range(len(sim_score)):
            final_evidence.append([claim_evid[i] , int(line_no[i])])
    return final_evidence
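
The selection loop above rescans sim_score with index/pop to pick the five best hits; assuming the same three parallel lists, heapq.nlargest expresses the same idea in one pass:

import heapq

# Keep the five highest-similarity (keyterm, line number) pairs.
top = heapq.nlargest(5, zip(sim_score, claim_evid, line_no))
final_evidence = [[keyterm, line] for _, keyterm, line in top]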
Example #6
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
Example #7
 def findLiteral(self, instanceUri, propertyURI):
     labels = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
         flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
         labelOrTitleUris = "\"" + propertyURI + "\""
         queries = ["\"" + QueryParser.escape(instanceUri) + "\"", QueryParser.escape(labelOrTitleUris)]
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return labels
Example #8
def searcher(directory, analyzer, queries_file):
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    log = open("log.txt", "a")
    for line_number in range(length):
        if lines[line_number].startswith("<num>"):
            query_id = lines[line_number][14:].strip()
        elif lines[line_number].startswith("<desc>"):
            a_query = lines[line_number + 1].strip()
            a_query = stop_words(a_query)
        if a_query != '':
            # searching the index
            reader = DirectoryReader.open(directory)
            searcher = IndexSearcher(reader)
            # parse the query
            parser = QueryParser("DocParagraph", analyzer)
            query = parser.parse(a_query)
            # return 50 queries are required by the assignment
            hits = searcher.search(query, 50).scoreDocs
            # rank counter 1 through 50
            rank_counter = 1
            for hit in hits:
                result = searcher.doc(hit.doc)
                # write search result to log text file
                to_log = str(query_id) + " " + "Q" + str(
                    query_counter) + " " + str(result.get(
                        "DocID")) + " " + str(rank_counter) + " " + str(
                            hit.score) + " " + "Alex's" + "\n"
                log.write(to_log)
                rank_counter += 1
            query_counter += 1
            a_query = ''
    log.close()
Example #9
    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                # print '4. Right after the regex handling : ', query
                like_query = queryparser.parse(query)
                # print '5. Right after the Lucene parser : ', like_query

                hits = self.searcher.search(like_query, result_num).scoreDocs
                # filterScoreDosArray = hits.topDocs().scoreDocs;

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                    result.append(doc.get("answer_id"))

            except Exception as e:
                print "AnswerSearcher: Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result
Example #10
 def findDirectTypes(self, instanceUri, max):
     dTypes = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
         query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = 0
         if max != None:
             freq = max
         else:
             freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     logging.debug("there are " + str(len(dTypes)) + " unique direct types")
     return dTypes
Example #11
File: retriever.py Project: kevkid/YIF
def get_image_pmcid(pmcid, classes = ""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"

    MAX = 10000
    #hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr) 
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
Example #12
File: syntax.py Project: zoudajia/rencos
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
Example #13
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=["subreddit", "author", "text"]):
    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]

        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
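
A hedged invocation sketch (the index path and query text are placeholders; qe=True triggers the interactive relevance-feedback branch above):

search_index("reddit.index", "best mechanical keyboard")           # plain top-10 search
search_index("reddit.index", "best mechanical keyboard", qe=True)  # with Rocchio expansion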
Example #14
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     if similarity == "boolean":
         self.searcher.setSimilarity(BooleanSimilarity())
     elif similarity == "tf":
         self.searcher.setSimilarity(TFSimilarity())
     elif similarity == "tfidf":
         self.searcher.setSimilarity(ClassicSimilarity())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
Example #15
    def search_synonym(self, query):
        self.hits_dict = {}
        self.hits = []
        similar_terms = self.w2v_model.most_similar(query)
        parser = QueryParser('text', self.analyzer)
        query = parser.parse(query)

        for s_term in similar_terms[:20]:
            s_term_query = parser.parse(s_term[0])
            hits = self.searcher.search(s_term_query, 1000).scoreDocs
            hit_count = 0
            for hit in hits:
                doc = self.searcher.doc(hit.doc)
                text = doc.get('text')
                terms = text.split()
                sentence = ''
                for term in terms:
                    sentence += term
                simpleHTMLFormatter = SimpleHTMLFormatter(
                    prefixHTML, suffixHTML)
                highlighter = Highlighter(simpleHTMLFormatter,
                                          QueryScorer(query))
                highLightText = highlighter.getBestFragment(
                    self.analyzer, 'text', sentence)
                if highLightText is not None:
                    self.hits.append(highLightText)
                    hit_count += 1
                if hit_count >= 3:
                    break

            if len(self.hits) > 0:
                self.hits_dict[s_term] = self.hits
                self.hits = []

        return self.hits_dict
Example #16
	def search(self, index_dir):
		# Get handle to index directory
		directory = SimpleFSDirectory(File(index_dir))

		# Creates a searcher searching the provided index.
		ireader  = DirectoryReader.open(directory)

		# Implements search over a single IndexReader.
		# Use a single instance and use it across queries
		# to improve performance.
		searcher = IndexSearcher(ireader)

		# Get the analyzer
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

		# Constructs a query parser. We specify what field to search into.
		queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

		# Create the query
		query = queryParser.parse(self.query)

		# Run the query and get top 50 results
		topDocs = searcher.search(query, self.retrieve_count)

		# Get top hits
		scoreDocs = topDocs.scoreDocs

		doc_ids = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_ids.append(doc.get(FIELD_PATH))
		return [int(item) for item in doc_ids]
Example #17
    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        # if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #     terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out
Example #18
    def Qsearch(self,query):
        words = seg.segment(query.strip())
        #words = self.segmentor.segment(query.strip())
        #print ' '.join(words)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
        parser.setPhraseSlop(0)
        # "\"" + ' '.join(words) + "\"~0" requires the words to be contiguous
        query = parser.parse("\"" + ' '.join(words) + "\"~0")
        totalHits = self.searcher.search(query, 50)
        #print "%s total matching documents." % totalHits.totalHits
        #return totalHits.totalHits

        for hit in totalHits.scoreDocs:
            #print"Hit Score: ",hit.score, "Hit Doc:",hit.doc, "HitString:",hit.toString()
            doc= self.searcher.doc(hit.doc)
            #print doc.get("name").encode("utf-8")
        #print "----------------------------------------"
        t = Term('contents',' '.join(words))
        #termDocs = ireader.termDocs(t)
        #for tt in termDocs:
        #       print ireader.document(termDocs.docs).getFeildable('neme'),termDocs.freq()
        #print self.reader.totalTermFreq(t)
        return self.reader.totalTermFreq(t)
Example #19
 def findPropertyURIs(self, propertyType, max):
     uris = list() # list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if max != None:
             freq = max.intValue()
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
         print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return uris
Example #20
 def findSubClasses(self, classUri): #RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!!
     propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
     subClasses = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
         flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
         subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
         queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,queries, fields,flags,analyzer)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return subClasses
Example #21
def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k,
                            envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []

    query_highlight = QueryParser(Version.LUCENE_CURRENT, k,
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(
        SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3)))
        key_text = re.sub('\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
Example #22
    def retrieve_sents(self):

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Example #23
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir
    
    # Create a searcher over the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    
    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    # Create a QueryParser that defaults to the "keywords" field;
    # it holds the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    # Join the supplied arguments into a single query of required terms
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    # Return a JArray with the results of the query
    return searcher.search(query, 200).scoreDocs
Example #24
    def search(**kwargs):
        vm_env.attachCurrentThread()
        query = BooleanQuery() 

        print("Searched keywords:")
        for field_name, keywords in kwargs.items():
            # assert field_name in SearchConfig.searchable_fields

            # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
            keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))
            
            for kw in keywords:
                print(kw)

            # construct query
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)

            if field_name == 'keywords':
                for kw in keywords:
                    q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                    query.add(q, BooleanClause.Occur.SHOULD)

        # search
        scoreDocs = searcher.search(query, 50).scoreDocs

        return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]
Example #25
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get("D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    for object in objects:
        tokens = object.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[object] = set.intersection(*doc_sets)

    return docs_lookup, reader
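
A hedged usage sketch (the object strings are placeholders; the reader is returned so the caller can close it):

docs_lookup, reader = find_all_text_occurrences(['Barack Obama', 'Berlin'])
print(docs_lookup['Berlin'])  # set of Lucene doc ids whose text contains the token
reader.close()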
Example #26
    def search(self, query_str, restriction=2):
        self.attachCurrentThread()

        # parse the query
        result_contexts = []
        # the presence of '/' indicates part-of-speech tags
        if '/' in query_str:
            # with POS tags, dispatch to search_phrases
            result_contexts = self.search_phrases(query_str)
        else:
            # without POS tags, dispatch to search_terms
            result_contexts = self.search_terms(
                QueryParser("context", self.analyzer).parse(query_str))

        # reassemble the search results into articles and return them
        self.recover_to_article(query_str, result_contexts, restriction)

        final_result = []
        # highlight the query-related passages in the search results
        simpleHTMLFormatter = SimpleHTMLFormatter(u"<b><font color='red'>",
                                                  u"</font></b>")
        for index, recovered_query in enumerate(self.recovered_queries):
            # highlight with our own processed query, which carries the positional
            # constraints, rather than with the raw user query
            recovered_query = recovered_query.replace("/", ",")
            highlighter = Highlighter(
                simpleHTMLFormatter,
                QueryScorer(
                    QueryParser("context",
                                self.analyzer).parse(recovered_query)))
            highLightText = highlighter.getBestFragment(
                self.analyzer, 'context', self.recovered_contexts[index])
            if highLightText is not None:
                final_result.append(highLightText)

        return final_result
Example #27
 def searchForClass(self, inst, pred):
     classUris = list()
     fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
     flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
     queries = ["\"" + QueryParser.escape(inst) + "\"", "\"" + QueryParser.escape(pred) + "\""]
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
         result = self._searcher.search(query, 1)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
     except Exception as e:#ParseException(e):
         print e.message
         logging.error("Error")
     return classUris
Example #28
    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            # print(c)
            return c
Example #29
def run(searcher, analyzer, keyword, way):
    while True:
        try:
            command = keyword.encode('utf8')
        except UnicodeDecodeError:
            command = keyword
        if command == '':
            return []
        if way=='contents':
            query = QueryParser(Version.LUCENE_CURRENT, "contents",analyzer).parse(command)
        elif way=='tags':
            query = QueryParser(Version.LUCENE_CURRENT, "tag",analyzer).parse(command)            
        scoreDocs = searcher.search(query, 50).scoreDocs
        result=[]
        result.append(command)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            item=[]
            item.append(doc.get('title'))
            item.append(doc.get('url'))
            item.append(doc.get('price'))
            item.append(doc.get('imgurl'))
            item.append(doc.get('wellrate'))
            item.append(doc.get('comment'))
            item.append(doc.get('tag'))
            #print doc.get('comment').encode('utf8')
            result.append(item)
	#print result
        
        return result
Example #30
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND', '\\AND').replace('OR',
                                              '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)

    return candidates
Example #31
 def __init__(self, indexDir):
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
     self.nameQueryParser = QueryParser('name', StandardAnalyzer())
     self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
     self.idQueryParser = QueryParser('id', StandardAnalyzer())
     self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)
Example #32
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "genre",
                           analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "year",
                           analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
Example #33
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR',
                                                          '\\OR').replace(
                                                              'NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out
Example #34
    def more_like_this2(self, so_items):

        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        query = ""
        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)

            if so_item.doc:
                query += self.document_to_query(so_item.doc)

            query += self.code_as_text()

        if query:
            print "-" * 30
            print "UNified Query: %s" % query
            print "-" * 30
            try:
                like_query = queryparser.parse(query)

                hits = self.searcher.search(like_query, 10).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)

                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    print("file__", doc.get("file"), "file_content",
                          doc.get("file_content"), "line_numbers",
                          doc.get("line_numbers"))
                    file_path = "/extdsk/FaCoY/Git_data/G" + doc.get(
                        "file")[24:]
                    print(file_path)
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)  # code

                        github_result.append(item)
                    #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
            except Exception as e:
                print "GitSearcher: Error: %s" % e
                print(traceback.format_exc())

        # print Counter(files).most_common(5)

        return github_result
Example #35
    def __init__(self, tfidf_path, strict=True):
        lucene.initVM()
        analyzer = StandardAnalyzer()
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
        self.searcher = IndexSearcher(reader)

        self.parser = QueryParser("text", analyzer)
        self.parser.setDefaultOperator(QueryParser.Operator.OR)
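
A retrieval method to pair with this constructor might look like the sketch below (the method name and the default k are our assumptions; the escape call mirrors the other examples):

    def closest_docs(self, query_text, k=5):
        # Parse with the OR-default parser configured above and return
        # the stored text of the top-k hits.
        query = self.parser.parse(QueryParser.escape(query_text))
        hits = self.searcher.search(query, k).scoreDocs
        return [self.searcher.doc(hit.doc).get('text') for hit in hits]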
Example #36
    def getDoc(self, doc, sentenseid):

        query = QueryParser.escape(doc + ' ' + str(sentenseid))
        query = QueryParser('docname', self.analyzer).parse(query)
        score = self.searcher.search(query, 1).scoreDocs

        doc = self.searcher.doc(score[0].doc)
        return doc.get('docname'), doc.get('contents')
Example #37
 def retrieve(self, term, sid):
     query = term + ' ' + str(sid)
     query = self.repalcer(query)
     query = QueryParser.escape(query)
     query = QueryParser('name-sid', self.analyzer).parse(query)
     score = self.searcher.search(query, 1).scoreDocs
     doc = self.searcher.doc(score[0].doc)
     return doc.get('name-sid'), doc.get('contents')
Example #38
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # todo: tune BM25 parameters (k1, b)

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #39
    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "description"
        ]:  # this field is set to "description"; since field:term pairs are appended at the very end, there is a lot of duplication
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    #tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    #CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    #stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    #stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    #stopwords
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    #query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called"
        ]:  # "extends", "annotations", "literals"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # check whether the unified query actually gets cleaned up at this point
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query
Example #40
def retrieve_wiki(text_query, searcher, analyzer):    
    txt =text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", 
                            analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')    
Example #41
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}
    
    f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
Example #42
    def perform_search(self, searchterm):
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)

        query = parser.parse(searchterm)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start

        print scoreDocs
        print duration
Example #43
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Example #44
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))

    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)

    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)

        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs

    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
Example #45
    def run(self, searcher, analyzer, rawQuery):
        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery))  # escape special characters 
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example #46
 def form_new_query_from_rf(self, relevant_doc_ids):
     firstSet  = True
     new_query = set()
     for id in relevant_doc_ids:
         doc          = self.searcher.doc(id)
         contents     = re.sub('[/\*&^%$#@?\'`":()<>]', " ", doc.get("title")).strip()
         query        = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer).parse(contents)
         keywords     = query.toString().split("contents:")
         keywords_set = set()
         for k in keywords:
             if k.strip() != "":
                 keywords_set.add(k)
         if firstSet:
             new_query = set(keywords_set)
             firstSet = False
         else:
             new_query = new_query & set(keywords_set)
     return " ".join(new_query)
Example #47
    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader  = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Example #48
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
Example #49
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    print rootdir
    
    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)

        short_title = entry['short_title']
        year = entry['publ_year']

        fname = short_title + CONTENT_EXT
        results[fname] = year

    return results
Example #50
0
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)


    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)
        
        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )

        del self.searcher
        
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
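BooleanQuery stopped being mutable in Lucene 5, so the add() calls above no longer work against newer PyLucene builds. A sketch of the same duplicate filter under the Builder API (a hypothetical drop-in for the method above, assuming the same imports plus a Lucene 5+ runtime):

    def addDuplicatesQuery(self, query):
        # Lucene 5+: clauses are assembled through BooleanQuery.Builder.
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        builder = BooleanQuery.Builder()
        builder.add(not_duplicate, BooleanClause.Occur.MUST)
        builder.add(query, BooleanClause.Occur.MUST)
        return builder.build()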
Example #51
0
 def findTopClasses(self):
     propertyURI = RDFS.SUBCLASSOF
     allClasses = list()
     topClasses = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         for hit in hits:
             doc = self._searcher.doc(hit.doc)
             allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
         # for (String classUri : allClasses) {
         for classUri in allClasses:
             logging.info("Checking whether " + classUri + " is a top class.")
             # search inst and pred retrieve class
             # if class exists that means it is not top class otherwise add to
             # topClasses
             classes = self.searchForClass(classUri, propertyURI)
             if classes is not None and len(classes) > 0:
                 logging.info("Not a top class: " + str(len(classes)) + " parent class(es) found.")
             else:
                 topClasses.append(classUri)
                 logging.info("Adding " + classUri + " to top classes.")
     except Exception as e:  # CorruptIndexException(e):
         print e.message
         logging.error("Error while finding top classes")
     return topClasses
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File
    
    print os.path.abspath(os.path.pardir)
    
    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)
    raw_results = searcher.search(query, limit)
    
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')

        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
        
    return results
def search(termo, **args):
	
	indexDir = os.environ.get('MANDEX') or '3iteracao'
	fsDir = SimpleFSDirectory(File(indexDir))
	searcher = IndexSearcher(DirectoryReader.open(fsDir))
	
	analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
	parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
	parser.setDefaultOperator(QueryParser.Operator.OR)
	query = parser.parse(' '.join([termo] + list(args.values())))
	start = datetime.now()
	scoreDocs = searcher.search(query, 50).scoreDocs
	duration = datetime.now() - start

	politicos = []
	for scoreDoc in scoreDocs:	    
	    doc = searcher.doc(scoreDoc.doc)
	    table = dict((field.name(), field.stringValue()) for field in doc.getFields())	   
	    politicos.append(table)

	return politicos
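A hypothetical call, assuming the index was built with fields matching the module-level `field` variable; keyword-argument values are simply appended to the query text, and each hit comes back as a field-name/value dict:

politicos = search('reforma tributaria', partido='PT')
for p in politicos[:3]:
    print p.get('nome')  # 'nome' is a hypothetical stored field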
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
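Collecting results_per_page * (page + 1) hits and skipping the offset re-scores every earlier page on each request. Lucene 4's IndexSearcher.searchAfter keeps deep paging cheap; a minimal sketch, assuming the caller keeps the last ScoreDoc of the previous page:

def next_page(searcher, query, last_hit, per_page=10):
    # last_hit: final ScoreDoc of the previous page, or None for the first page
    if last_hit is None:
        top_docs = searcher.search(query, per_page)
    else:
        top_docs = searcher.searchAfter(last_hit, query, per_page)
    return top_docs.scoreDocs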
Example #55
0
	def getQueryParser(self):
		
		analyzers = self.getSearchingAnalyzers()

		fieldAnalyzers = HashMap()
		fieldAnalyzers.put('name', analyzers['name'])
		fieldAnalyzers.put('parent', analyzers['parent'])
		fieldAnalyzers.put('content', analyzers['default'])
		fieldAnalyzers.put('id', analyzers['id'])
		analyzerWrapper = PerFieldAnalyzerWrapper(analyzers['default'], fieldAnalyzers)

		queryParser = QueryParser(Version.LUCENE_CURRENT, 'content', analyzerWrapper)

		queryParser.setAutoGeneratePhraseQueries(PHRASE_QUERY_BY_DEFAULT)
		queryParser.setPhraseSlop(PHRASE_SLOP)
		queryParser.setFuzzyMinSim(FUZZY_MIN_SIM)
		queryParser.setDefaultOperator(DEFAULT_OPERATOR)

		return queryParser
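With the PerFieldAnalyzerWrapper in place, a single parsed query analyzes each field with its own analyzer; a hypothetical use of the parser returned above:

parser = index.getQueryParser()  # `index` is a hypothetical owner instance
# 'name:...' terms go through analyzers['name'], bare terms through
# analyzers['default'] (the 'content' analyzer).
query = parser.parse('name:report* AND content:"quarterly results"')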
Example #56
0
    def __BM25(self,searcher,rawQuery):
        '''retrieve documents with a single query'''
        if 'Standard' in self.indexFile:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer with default stop words
        elif 'Porter' in self.indexFile:
            analyzer = PorterStemmerAnalyzer()
        else:
            raise ValueError('cannot infer analyzer from index file name: %s' % self.indexFile)

        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery)) # escape special characters
        scoreDocs = searcher.search(query, 100).scoreDocs
        docList = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            docList.append(doc.get("name"))
        return docList
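Despite its name, __BM25 scores with whatever similarity the supplied searcher already carries. Making it actually use BM25 is one call on the searcher; a sketch with positional arguments, since the JCC-generated wrappers do not accept Python keywords (imports as in the surrounding examples):

from org.apache.lucene.search.similarities import BM25Similarity

searcher = IndexSearcher(reader)  # `reader` opened elsewhere
searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1, b
# every search through this searcher, including __BM25 above, now uses BM25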
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        apply each feature function to the list of hit scores
        :param hists:
        """
        def doc_score(hists):
            """
            yield the score of each hit
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1, b -- todo: tune BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
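Because feature_type is a list of functions applied to the raw score list, one call can produce several aggregate features at once. A hypothetical invocation, assuming the module-level index, version, analyzer and hitsPerPage are already initialised:

features = lucene_retrieval(
    'photosynthesis light reaction',
    [max, min, lambda scores: sum(scores) / len(scores)],  # best, worst, mean hit score
    use_BM25=True)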
Example #58
0
    def extract_phrase_query(self, q, field, slop=0, boost=5):
        phrases = re.findall(r'"([^"]*)"', q)
        if len(phrases) == 0:
            return None, q

        q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
        if self.verbose:
            print "Detected phrases: ", phrases

        bq = BooleanQuery()
        for phrase in phrases:
            # pq = PhraseQuery()
            # for term in filter(None, phrase.split(' ')):
            #     pq.add(Term(field, term))
            qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
            # parse phrase - this may or may not be desired
            # pq = qparser.parse(field + ':"' + phrase + '"')
            pq = qparser.parse('%s "%s"~%d^%.1f' %
                               (phrase, phrase, slop, boost))
            # phrase queries have high priority
            bq.add(pq, BooleanClause.Occur.MUST)
            # bq.add(pq, BooleanClause.Occur.SHOULD)

        return bq, q
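The format string expands into plain Lucene query syntax, where ~n sets the phrase slop and ^x the boost; for example:

q = '%s "%s"~%d^%.1f' % ('machine learning', 'machine learning', 1, 5)
print q  # machine learning "machine learning"~1^5.0
# the bare terms recall loosely, while the boosted phrase clause (slop 1)
# dominates the ranking whenever the exact phrase occurs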
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return the text of the last collected hit ('_NONE_' if no hits)
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when pre-process answers, `none of the above` -> '' cause error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1, b -- todo: tune BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
    def __init__(self, queries=[], criteria=[], conjunctions=[], orderby=["ta"], ascending=True, limit=10000):
        vm.attachCurrentThread()

        self.queries = [query for query in queries if len(query.strip()) > 0]
        self.criteria = criteria
        self.conjunctions = conjunctions
        self.orderby = orderby
        self.ascending = ascending
        self.queryString = ""
        self.limit = limit

        self.fields = fields
        self.analyzer = PorterStemmerAnalyzer()
        self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer)
        self.queryParser.setAllowLeadingWildcard(True)
        self.queryParser.setDefaultOperator(QueryParser.Operator.AND)
        indexDir = settings.LUCENE_INDEX_DIRECTORY
        self.index = MMapDirectory(File(indexDir))