Example #1
 def findLiteral(self, instanceUri, propertyURI):
     labels = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
         flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
         labelOrTitleUris = "\"" + propertyURI + "\""
         queries = ["\"" + QueryParser.escape(instanceUri) + "\"", QueryParser.escape(labelOrTitleUris)]
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return labels
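Most of the examples in this listing follow the same core pattern: escape the raw user string with QueryParser.escape, parse the escaped string into a Query, and hand that Query to an IndexSearcher. A minimal, self-contained sketch of that pattern (assuming PyLucene 6+ class paths and a hypothetical index directory; older releases take a Version argument in the constructors):

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
directory = SimpleFSDirectory(Paths.get("/path/to/index"))  # hypothetical index location
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = StandardAnalyzer()

raw = 'foo AND (bar'  # contains query syntax that would otherwise fail to parse
query = QueryParser("text", analyzer).parse(QueryParser.escape(raw))
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc).get("text"))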
Example #2
    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        # if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #     terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out
Example #3
 def searchForClass(self, inst, pred):
     classUris = list()
     fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
     flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
     queries = ["\"" + QueryParser.escape(inst) + "\"", "\"" + QueryParser.escape(pred) + "\""]
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
         result = self._searcher.search(query, 1)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
     except Exception as e:#ParseException(e):
         print e.message
         logging.error("Error")
     return classUris
Example #4
 def findSubClasses(self, classUri): #RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!!
     propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
     subClasses = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
         flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
         subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
         queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
         query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,queries, fields,flags,analyzer)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return subClasses
Example #5
    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            # print(c)
            return c
Example #6
    def search_pair_score_multithread_part(self, q_doc_int):

        # print(q_doc_int)
        spl=q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
Example #7
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR',
                                                          '\\OR').replace(
                                                              'NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out
Example #8
    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "description"
        ]:  # the field here is set to "description"; since "field:term" pairs are appended at the very end, there is a lot of duplication
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    #tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    #CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    #stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    #stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    #stopwords
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    #query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called"
        ]:  # "extends", "annotations", "literals"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())  # check whether the unified query gets cleaned up at this point...
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query
Example #9
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string is parsed for each field,
    rather than matching the same words on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
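The MUST/SHOULD comment above is the crux of boolean composition: MUST clauses are required for a document to match, SHOULD clauses only contribute to the score. Newer Lucene releases replaced the mutable BooleanQuery constructor used here with a builder, as the pairSearch and multiFieldsPairSearch examples further down show. A small sketch of that form, assuming PyLucene 6+ class paths and hypothetical field names and query strings:

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery

# assumes lucene.initVM() has already been called, as in the other examples
analyzer = StandardAnalyzer()
text_query = QueryParser("text", analyzer).parse(QueryParser.escape("solar energy"))
subject_query = QueryParser("corpus_name", analyzer).parse(QueryParser.escape("physics"))

builder = BooleanQuery.Builder()
builder.add(text_query, BooleanClause.Occur.MUST)       # required clause
builder.add(subject_query, BooleanClause.Occur.SHOULD)  # optional clause, boosts score
query = builder.build()  # pass to searcher.search(query, n) as in the examples above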
Example #10
	def document_to_query(self, doc):
		""" Given a document it transforms the source code related fields to a lucene query string """
		query = ""
		for field in ["description"]:
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					# tokenize
					term = self.tokenize_string(StandardAnalyzer(), term)
					# CamelCase
					temp = []
					for t in term:
						temp += self.camel_case_split(t)
					# stopwords
					temp_2 = []

					for t in temp:
						if t not in english_stop_words:
							temp_2.append(t)
					# stemming
					temp_3 = []
					for t in temp_2:
						temp_3.append(stem(t))
					# stopwords
					temp_4 = []

					for t in temp_3:
						if t not in english_stop_words:
							temp_4.append(t)
					# query generation
					for term in temp_4:
						query += "%s:%s " % (field, term)

		for field in ["typed_method_call", "methods", "used_classes", "class_instance_creation", "methods_called",
					  "annotations", "literals"]:  # "used_classes", , "literals" , "extends"
			for val in doc.getFields(field):
				if val.stringValue().strip():
					term = QueryParser.escape(val.stringValue())
					java_stoplist = ["java.lang.Object", 'void', 'Global', 'boolean', 'String', 'int', 'char', 'float',
									 'double', 'write', 'close', 'from', 'println', 'StringBuilder', 'write',
									 'toString',
									 'close', 'mkdir', 'exists']

					if term not in java_stoplist:
						query += "%s:%s " % (field, term)

		if len(doc.getFields("code_hints")) > 0:
			hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
			hints_str = " ".join(hints)
			for term in hints:
				if term:
					term = QueryParser.escape(term)
					if term not in english_stop_words:
						# print "Including 'code_hints' from Doc_To_Query TERMs... //", term
						query += "code_hints:%s " % term
		return query
Example #11
    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  #"used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())

                    # Filter out noisy terms
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        # idf = self.get_IDF(field, term)

                        # print self.get_DF(field, term), term, field
                        #query += "%s:%s^%s " % (field, term, idf)
                        query += "%s:%s " % (field, term)

                    #print "term: %s idf: %s" % (term, self.get_minimum_IDF())

                #query += "%s:%s " % (field, term)
                #print "%s:%s^%s" % (field, term, self.getIDF(field, term))
        # for hint in doc.getFields("code_hints"):
        # 	tokens = utils.tokenize(hint.stringValue())
        # 	for token in tokens:
        # 		#print token
        # 		token = QueryParser.escape(token)
        # 		if token.strip():
        # 			print "HINTS", token
        # 			query += "code:%s^5.0 " % (token)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    print "TERM", term
                    # if term[0].isupper():
                    # 	query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                    # elif "(" in term or "." in term or "#" in term: # Heuristic to boost only code identifiers
                    # 	query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)

                    #query += "code:%s^5.0 " % (term)

        return query
Example #12
 def findPropertyURIs(self, propertyType, max):
     uris = list() # list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if max != None:
             freq = max.intValue()
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
         print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return uris
Example #13
    def searchDocs(self, command, topK=30):

        if command == '':
            return

#         print("Searching for:", command)

        parser = PythonMultiFieldQueryParser(['name', 'contents'],
                                             self.analyzer)

        query = parser.parse(
            QueryParser.escape(command), ['name', 'contents'],
            [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
            self.analyzer)

        scoreDocs = self.searcher.search(query, topK).scoreDocs
        #         print("%s total matching documents." % len(scoreDocs))

        docName = []
        docContents = []

        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            docName.append(doc.get("docname"))
            docContents.append(doc.get("contents"))

#             print('docname:', doc.get("docname"), 'name:', doc.get("name"), 'content:', doc.get("contents"))

        return docName, docContents
Example #14
 def findDirectTypes(self, instanceUri, max):
     dTypes = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
         query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = 0
         if max != None:
             freq = max
         else:
             freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     logging.debug("there are " + str(len(dTypes)) + " unique direct types")
     return dTypes
Example #15
def get_evidence(searcher, analyzer, claim):
    escaped_string = QueryParser.escape(claim)
    query = QueryParser("text", analyzer).parse(escaped_string)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    claim = nlp(claim)
    claim_evid = []
    line_no = []
    sim_score = []
    final_evidence = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        norm_doc = doc.get("text")
        norm_doc = nlp(norm_doc)
        val = claim.similarity(norm_doc)
        try:
            int(doc.get("Sno"))
            claim_evid.append(doc.get("keyterm"))
            line_no.append(int(doc.get("Sno")))
            sim_score.append(val)
        except ValueError:
            pass  # skip hits whose "Sno" field is not an integer
        
    if len(sim_score)>5:
        for val in range(0,5):
            index = sim_score.index(max(sim_score))
            claim = claim_evid.pop(index)
            line = line_no.pop(index)
            final_evidence.append([claim , line])
            del sim_score[index]
    else:
        for i in range(0, len(sim_score)-1):
            final_evidence.append([claim_evid[i] , int(line_no[i])])
    return final_evidence
Example #16
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))

    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)

    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)

        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs

    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
Example #17
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND', '\\AND').replace('OR',
                                              '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)

    return candidates
Example #18
    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)

            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)

                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })

            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))

            queries_result.append(query_results)

        return queries_result
Example #19
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Example #20
 def code_as_text(self, query):
     new_query = " "
     for term in self.tokenize_string(self.porter_analyzer, query):
         if term:
             term = QueryParser.escape(term)
             new_query += "description:%s " % (term)
     return new_query
Example #21
File: syntax.py  Project: zoudajia/rencos
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
Example #22
    def getDoc(self, doc, sentenseid):

        query = QueryParser.escape(doc + ' ' + str(sentenseid))
        query = QueryParser('docname', self.analyzer).parse(query)
        score = self.searcher.search(query, 1).scoreDocs

        doc = self.searcher.doc(score[0].doc)
        return doc.get('docname'), doc.get('contents')
Example #23
    def run(self, searcher, analyzer, rawQuery):
        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery))  # escape special characters 
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
Example #24
 def retrieve(self, term, sid):
     query = term + ' ' + str(sid)
     query = self.repalcer(query)
     query = QueryParser.escape(query)
     query = QueryParser('name-sid', self.analyzer).parse(query)
     score = self.searcher.search(query, 1).scoreDocs
     doc = self.searcher.doc(score[0].doc)
     return doc.get('name-sid'), doc.get('contents')
Example #25
    def code_as_text(self):
        """ Extends a query by matching query keywords in source code as text"""

        query = " "
        for term in tokenize_string(self.porter_analyzer, self.query):
            if term:
                term = QueryParser.escape(term)
                query += "code:%s " % (term)

        return query
Example #26
    def pairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = QueryParser("content_section", self.analyzer)
        query1 = parser.parse(QueryParser.escape(title))
        query2 = parser.parse(QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits
Example #27
    def perfume_search(command,tb_data_line,f):
        query = QueryParser(Version.LUCENE_CURRENT, "name",
                            analyzer).parse(QueryParser.escape(command))
        scoreDocs = searcher.search(query, 1).scoreDocs
        # print "%s total matching documents." % len(scoreDocs)
        contents=tb_data_line.split('\t')
        name=contents[0]
        url=contents[1]
        price=str(contents[2])
        post=str(contents[3])
        sales=str(contents[4][:-3])
        comments=str(contents[5])
        place=contents[6]
        shop=contents[7]
        img=contents[-1]

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDocs[0].doc)
            data = {}
            data.setdefault('name', name)
            data.setdefault('url', url)
            data.setdefault('price', price)
            data.setdefault('post', post)
            data.setdefault('sales', sales)
            data.setdefault('comments', comments)
            data.setdefault('place', place)
            data.setdefault('shop', shop)
            data.setdefault('img', img)

            data.setdefault('xssd_name', doc.get('name'))
            data.setdefault('perfumer', doc.get('perfumer'))
            data.setdefault('tune', doc.get('tune'))
            data.setdefault('xssd_url', doc.get('url'))
            data.setdefault('brand', doc.get('brand'))
            data.setdefault('rate:', float(doc.get('rate')))
            data.setdefault('xssd_comments', doc.get('comment'))
            if doc.get('former_scents') != None:
                former = doc.get('former_scents')
                mid = doc.get('mid_scents')
                last = doc.get('last_scents')
                data.setdefault('former', former)
                data.setdefault('mid', mid)
                data.setdefault('last', last)
                scents = former + ' ' + mid + ' ' + last
                data.setdefault('scents', scents)
            else:
                data.setdefault('scents', doc.get('scents'))

            for k, v in data.items():
                if v == None:
                    f.write('None' + '\t')
                else:
                    f.write(str(v) + '\t')
            f.write('\n')
Example #28
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #29
 def simpleSearchID(self, query, sim):
     """
     Method that searches through documents using only content_section Field
     searchDir : the path to the folder that contains the index.
     """
     # Now search the index:
     parser = QueryParser("id_section", self.analyzer)
     query = parser.parse(QueryParser.escape(query))
     self.searcher.setSimilarity(sim)
     hits = self.searcher.search(query, 6).scoreDocs
     return hits
Example #30
    def explain(self, query, fields, doc):

        if not self.searcher:
            self.open_searcher()

        query = QueryParser.escape(query)

        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)

        return self.searcher.explain(query, doc)
Example #31
 def document_to_query(self):
     """ Given a document it transforms the source code related fields to a lucene query string"""
     query = ""
     for field in [
             "typed_method_call", "methods", "extends", "used_classes",
             "class_instance_creation", "methods_called", "annotations",
             "literals"
     ]:  #"used_classes", , "literals"
         for val in self.ast[field]:
             term = QueryParser.escape(val)
             query += "%s:%s " % (field, term)
     return query
Example #32
    def multiFieldsSearch(self, query, sim):
        lucene.getVMEnv().attachCurrentThread()

        parser = MultiFieldQueryParser(
            ["content_section", "title_section", 'title_article'],
            self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits
Example #33
    def multiFieldsPairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title))
        query2 = MultiFieldQueryParser.parse(parser,
                                             QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits
Example #34
def retrieve_wiki(text_query, searcher, analyzer):    
    txt =text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", 
                            analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')    
Example #35
def retrieve_wiki(text_query, searcher, analyzer):
    txt = text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')
Example #36
    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS,
                                  analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(
                                QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(
                                self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(
                                first_movie=major_movie,
                                second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(
                                    first_movie=minor_movie,
                                    second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Example #37
 def search_scores(self, query, topk=10):
     query = self.repalcer(query)
     query = QueryParser.escape(query)
     query1 = QueryParser('name', self.analyzer).parse(query)
     query2 = QueryParser('name-contents', self.analyzer).parse(query)
     # print(query2)
     scores1 = self.searcher.search(query1, 30).scoreDocs
     scores2 = self.searcher.search(query2, 30).scoreDocs
     name1 = []
     name2 = []
     for score1 in scores1:
         doc1 = self.searcher.doc(score1.doc)
         t = doc1.get('name')
         if t not in name1:
             name1.append(t)
         if len(name1) > 1:
             break
     # print(name1)
     name2.append(name1[0])
     docnames = []
     doccontents = []
     s = []
     maxscore = scores2[0].score
     t_doc = self.searcher.doc(scores2[0].doc)
     for score2 in scores2:
         doc2 = self.searcher.doc(score2.doc)
         tname = doc2.get('name')
         # print(tname)
         # print(tname,name1)
         if score2.score == maxscore:
             docnames.append(doc2.get('name-sid'))
             doccontents.append(doc2.get('contents'))
             s.append(score2.score)
             # print(docnames)
         elif tname in name1 and score2.score > maxscore - 5:
             docnames.append(doc2.get('name-sid'))
             doccontents.append(doc2.get('contents'))
             s.append(score2.score)
             # print(docnames)
         # print(score2.score)
         # print(maxscore)
         if len(docnames) > 2:
             break
     if len(docnames) == 1 and scores1[0].score > maxscore - 10:
         docnames.append(self.searcher.doc(scores1[0].doc).get('name-sid'))
         doccontents.append(
             self.searcher.doc(scores1[0].doc).get('contents'))
         s.append(scores1[0].score)
     assert len(docnames) != 0
     return docnames, doccontents, s
Example #38
    def __BM25(self,searcher,rawQuery):
        '''retrieve documents with a single query'''
        if 'Standard' in self.indexFile:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # build a standard analyzer with default stop words
        if 'Porter' in self.indexFile:
            analyzer = PorterStemmerAnalyzer()

        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery)) # escape special characters
        scoreDocs = searcher.search(query, 100).scoreDocs
        docList = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            docList.append(doc.get("name"))
        return docList
Example #39
    def search_pair_score_singlethread(self, q, doc_int, searcher):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp

        out.append(c)

        return out
Example #40
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #41
    def multiFieldsSearch(self, query, sim):
        """
        Method that searches through documents using content_section and title_article Fields
        searchDir : the path to the folder that contains the index.
        """
        # Now search the index:
        lucene.getVMEnv().attachCurrentThread()

        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits
Example #42
def annotate_all_questions(analyzer, searcher):
    df = pd.concat(map(dm.ALLEN_AI_OBQA, list(OBQAType)))
    annotations = {}
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        for answer in row.answers:
            sent = row.question + " " + answer
            query_string = QueryParser.escape(sent)
            query = QueryParser("contents", analyzer).parse(query_string)
            hits = searcher.search(query, 75).scoreDocs
            closest = [
                searcher.doc(score_doc.doc).get("contents")
                for score_doc in hits
            ]  # noqa: E501:
            annotations[sent] = closest
    pickle.dump(annotations, open("annotations.pkl", "wb"))
    print("Annotations written to annotations.pkl")
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when answers are pre-processed, 'none of the above' becomes '' and would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #44
    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader  = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                    content = re.sub('[^A-Za-z0-9 ]+', '', content)
                    while True:
                        try:
                            query = queryParser.parse(QueryParser.escape(content))
                        except Exception as e:
                            self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                            print self.boolean_query.maxClauseCount
                            continue
                        break

                    topDocs = searcher.search(query, len(filenames))
                    scoreDocs = topDocs.scoreDocs
                    for scoreDoc in scoreDocs:
                        doc = searcher.doc(scoreDoc.doc)
                        movie_id = int(doc.get(FIELD_PATH))
                        if movie_id <= major_movie.id:
                            continue
                        minor_movie = models.Movie.objects.get(pk=movie_id)
                        try:
                            similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                            if not similarity:
                                similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                            similarity.synopsis = scoreDoc.score
                            similarity.save()
                        except Exception as e:
                            print major_movie.id, minor_movie.id
                            raise e
                print u"{0} completed.".format(major_movie.id)
Example #45
 def findTopClasses(self):
     propertyURI = RDFS.SUBCLASSOF
     allClasses = list()
     topClasses = list()
     try:
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
         query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
             indexus += 1
         # for (String classUri : allClasses) {
         indexus = 0
         while indexus < len(allClasses):
             classUri = allClasses[indexus]
             logging.info("Checking whether " + classUri + " is a top class.")
             # search inst and pred retrieve class
             # if class exists that means it is not top class otherwise add to
             # topClasses
             classes = self.searchForClass(classUri, propertyURI)
             logging.info("top classes:" + str(len(classes)))
             if classes is not None and len(classes) > 0:
                 logging.info("This is not a top class...")
             else:
                 topClasses.append(classUri)
                 logging.info("Adding " + classUri + " to top classes.")
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return topClasses
Example #46
 def searchStemFirst(self, annotation):
     annotations = list()
     pocString = QueryParser.escape(annotation.getText())
     preparePocStringOriginal = "\"" + pocString + "\""
     preparePocStringLowercase = "\"" + pocString.lower() + "\""
     try:
         maxSynonyms = 0
         # Analyzer stemmedAnalyser =
         # AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
         # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
         # synonymMap, maxSynonyms);
         stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
         analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
         stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser)
         query = stemParser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         stemHits = result.scoreDocs
         allHits = stemHits
         # if(stemHits.length == 0) {
         # search lowercased exact
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
         query = parser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         lowHits = result.scoreDocs
         allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(lowHits) # ArrayUtils.addAll(allHits, lowHits)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         # }
         # if(allHits.length == 0) {
         # search exact
         exactParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
         query = exactParser.parse(preparePocStringLowercase)
         result = self._searcher.search(query, 1)
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(result.scoreDocs) #ArrayUtils.addAll(allHits, result.scoreDocs)
         logging.info("For " + str(query) + " : " + str(result.totalHits))
         # }
         # for (ScoreDoc hit : allHits) {
         indexus = 0
         while indexus < len(allHits):
             hit = allHits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             ann = Annotation()
             features = dict()
             features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
             features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
             features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
             features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
             features["score"] = hit.score
             ann.setFeatures(features)
             ann.setEndOffset(annotation.getEndOffset())
             ann.setStartOffset(annotation.getStartOffset())
             ann.setSyntaxTree(annotation.getSyntaxTree())
             ann.setText(annotation.getText())
             annotations.append(ann)
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return annotations
Example #47
 def testSearcher(self):
     query=QueryParser(Version.LUCENE_CURRENT, "class", StandardAnalyzer(Version.LUCENE_CURRENT)).parse(QueryParser.escape('http\://www.mooney.net/geo#River'))
     print query
     hits = self._searcher.search(query, 50)
     for hit in hits.scoreDocs:
         print hit.score, hit.doc, hit.toString()
         doc = self._searcher.doc(hit.doc)
         print doc.get("class").encode("utf-8")
Example #48
 def searchIndex(self, annotation, specialTreatment):
     if specialTreatment:
         return self.searchStemFirst(annotation)
     annotations = list() #ArrayList[Annotation]()
     try:
         maxSynonyms = 0
         stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
         # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
         # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
         # synonymMap, maxSynonyms);
         analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
         parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
         pocString = QueryParser.escape(annotation.getText())
         preparePocString = "\"" + pocString + "\""
         preparePocStringLowercase = "\"" + pocString.lower() + "\""
         query = parser.parse(preparePocString)
         result = self._searcher.search(query, 1)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         freq = result.totalHits
         if freq > 0:
             result = self._searcher.search(query, freq)
         hits = pyJava.JArray2List(result.scoreDocs)
         logging.debug("For " + str(query) + " : " + str(result.totalHits))
         if freq <= 0:
             # search lowercased exact
             lowerCasedParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
             query = lowerCasedParser.parse(preparePocStringLowercase)
             # logging.info("Searching for: " + query.toString());
             result = self._searcher.search(query, 1)
             freq = result.totalHits
             if freq > 0:
                 result = self._searcher.search(query, freq)
             hits = pyJava.JArray2List(result.scoreDocs)
             logging.debug("For " + str(query) + " : " + str(result.totalHits))
         if len(hits) == 0 and preparePocStringLowercase.find(" ") < 0:
             # search stemmed
             stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser)
             query = stemParser.parse(preparePocStringLowercase)
             # logging.info("Searching for: " + query.toString());
             result = self._searcher.search(query, 1)
             freq = result.totalHits
             if freq > 0:
                 result = self._searcher.search(query, freq)
             hits = pyJava.JArray2List(result.scoreDocs)
             logging.info("For " + str(query) + " : " + str(result.totalHits))
         # for (ScoreDoc hit : hits) {
         indexus = 0
         while indexus < len(hits):
             hit = hits[indexus]
             doc = self._searcher.doc(hit.doc)
             self._searcher.explain(query, hit.doc)
             ann = Annotation()
             features = dict()
             features[FreyaConstants.CLASS_FEATURE_LKB]=doc.get(FreyaConstants.CLASS_FEATURE_LKB)
             features[FreyaConstants.INST_FEATURE_LKB]=doc.get(FreyaConstants.INST_FEATURE_LKB)
             features[FreyaConstants.PROPERTY_FEATURE_LKB]=doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
             features["string"]=doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
             features[FreyaConstants.SCORE]=hit.score
             ann.setFeatures(features)
             ann.setEndOffset(annotation.getEndOffset())
             ann.setStartOffset(annotation.getStartOffset())
             ann.setSyntaxTree(annotation.getSyntaxTree())
             ann.setText(annotation.getText())
             annotations.append(ann)
             indexus += 1
     except Exception as e:#CorruptIndexException(e):
         print e.message
         logging.error("Error")
     return annotations