Пример #1
0
    def testEquality(self):
        """Two BooleanQueries built from identical clause trees must be equal.

        Builds a query of two SHOULD terms plus a nested SHOULD query twice,
        independently, and asserts structural equality via ``Query.equals``.
        """
        b1 = BooleanQuery.Builder()
        b1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        b1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
        # NOTE: the original called b1.build() here as well; that result was
        # immediately overwritten below, so the dead store was removed.

        n1 = BooleanQuery.Builder()
        n1.add(TermQuery(Term("field", "nestedvalue1")),
               BooleanClause.Occur.SHOULD)
        n1.add(TermQuery(Term("field", "nestedvalue2")),
               BooleanClause.Occur.SHOULD)
        nested1 = n1.build()
        b1.add(nested1, BooleanClause.Occur.SHOULD)
        bq1 = b1.build()

        b2 = BooleanQuery.Builder()
        b2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
        b2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)

        n2 = BooleanQuery.Builder()
        n2.add(TermQuery(Term("field", "nestedvalue1")),
               BooleanClause.Occur.SHOULD)
        n2.add(TermQuery(Term("field", "nestedvalue2")),
               BooleanClause.Occur.SHOULD)
        nested2 = n2.build()
        b2.add(nested2, BooleanClause.Occur.SHOULD)
        bq2 = b2.build()

        # assertTrue replaces the deprecated assert_ alias (removed in 3.12).
        self.assertTrue(bq1.equals(bq2))
Пример #2
0
def search_html(searcher, analyzer):
    """Interactive console search loop over the HTML index.

    Reads queries from stdin until an empty line is entered. Each query is
    parsed with parse_command into field->text pairs; the "content" field is
    segmented with jieba before parsing. All per-field queries are AND-ed
    (Occur.MUST) into one BooleanQuery, and the top 10 hits are printed.
    """
    while True:
        print("Hit enter with no input to quit.")
        command = input("Query:")
        os.system("clear")
        if command == "":
            return
        print("Searching for:", command)
        command_dict = parse_command(command)
        builder = BooleanQuery.Builder()
        for k, v in command_dict.items():
            if k == "content":
                # Segment Chinese text so the analyzer sees word tokens.
                cutted = [x for x in jieba.cut_for_search(v) if x.strip()]
                v = " ".join(cutted)
                print("After segmentation:", v)
            query = QueryParser(k, analyzer).parse(v)
            builder.add(query, BooleanClause.Occur.MUST)
        # BUG FIX: the original then replaced this combined query with a new
        # BooleanQuery containing only the *last* sub-query from the loop,
        # silently discarding every other field constraint.
        querys = builder.build()
        scoreDocs = searcher.search(querys, 10).scoreDocs
        print("{} total matching documents.".format(len(scoreDocs)))
        for num, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            print(
                "\n#{num}:\nTitle:{title}\nURL:{url}\nSite:{site}\nPath:{path}\nFile Name:{name}\n"
                .format(num=num + 1,
                        title=doc.get("title"),
                        url=doc.get("url"),
                        path=doc.get("path"),
                        name=doc.get("name"),
                        site=doc.get("site")))
Пример #3
0
 def search(self, field: str):
     """Search *field* according to the current command and collect hits.

     Builds a query depending on the command key ('-'/'~'/none: single word;
     '#': two words AND-ed; '$'/'+': all words AND-ed), runs it, splits each
     matching document into sentences, keeps those passing key_filter, and
     records them in self._doc / self._resultSentencesList.

     Returns self so calls can be chained (fluent style).
     """
     sear = self._search
     key = self._commandInfo.getKey()
     if len(key) == 0 or key[0] in ['-', '~']:
         query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
     elif key[0] == '#':
         query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
         query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
         bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
         bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
         query = BooleanQuery.Builder().add(bc1).add(bc2).build()
     elif key[0] in ['$', '+']:
         bq = BooleanQuery.Builder()
         for w in self._commandInfo.getWordList():
             queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
             bq.add(BooleanClause(queryx, BooleanClause.Occur.MUST))
         query = bq.build()
     else:
         # Preserved from the original: unknown keys fall through with an
         # empty-string query (sear.search will then fail loudly).
         query = ''
     hits = sear.search(query, 999999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         doc_id = doc.get(field + '_id')
         if doc_hit(res, self._commandInfo):
             # BUG FIX: the original used a lazy ``map(...)`` that was never
             # consumed (and compared an *index* to ''), so empty sentences
             # were never removed. Filter them out explicitly.
             sentences = [s for s in re.split('[!?!?。]', res) if s != '']
             for sentence in sentences:
                 if key_filter(self._commandInfo, sentence):
                     self._doc[doc_id] = res
                     self._resultSentencesList.append((doc_id, sentence))
     return self
Пример #4
0
    def testParenthesisMust2(self):
        """(t1 OR t2) SHOULD + (c1 OR c2) MUST should match exactly one doc."""
        term_group = BooleanQuery.Builder()
        for sub in (self.t1, self.t2):
            term_group.add(BooleanClause(sub, BooleanClause.Occur.SHOULD))
        q3 = term_group.build()

        cat_group = BooleanQuery.Builder()
        for sub in (self.c1, self.c2):
            cat_group.add(BooleanClause(sub, BooleanClause.Occur.SHOULD))
        q4 = cat_group.build()

        outer = BooleanQuery.Builder()
        outer.add(q3, BooleanClause.Occur.SHOULD)
        outer.add(q4, BooleanClause.Occur.MUST)
        self.assertEqual(1, self.search(outer.build()))
Пример #5
0
def search_html(query_string, limit=10):
    """Search the HTML index for *query_string* and return result dicts.

    The query string is split into field->text pairs by parse_command; the
    "content" field is segmented with jieba. All field queries are AND-ed
    (Occur.MUST). For each hit, the stored HTML file is read, converted to
    text, and a ~20-word highlighted snippet around the first matching query
    token is produced (falling back to the first 100 characters).

    Returns a list of {"title", "url", "content"} dicts.
    """
    command_dict = parse_command(query_string)
    vm_env.attachCurrentThread()
    builder = BooleanQuery.Builder()
    cutted_query = None
    for k, v in command_dict.items():
        if k == "content":
            cutted_query = [x for x in jieba.cut_for_search(v) if x.strip()]
            v = " ".join(cutted_query)
        query = QueryParser(k, analyzer).parse(v)
        builder.add(query, BooleanClause.Occur.MUST)
    # BUG FIX: the original rebuilt the query from only the *last* loop
    # sub-query, discarding every other field constraint.
    querys = builder.build()
    scoreDocs = searcher["html"].search(querys, limit).scoreDocs
    result = list()
    for num, scoreDoc in enumerate(scoreDocs):
        doc = searcher["html"].doc(scoreDoc.doc)
        single_result = {
            "title": doc.get("title"),
            "url": doc.get("url"),
        }
        with open(doc.get("path"), mode="r", encoding="utf8") as file:
            content = file.read()
        html2text = HTML2Text()
        html2text.ignore_links = True
        html2text.ignore_images = True
        content = html2text.handle(content)
        cutted_content = jieba.cut(content)
        flag = False  # True once a query token was found in the content
        if cutted_query:
            word_num, cnt = 20, 0
            for x in cutted_content:
                if not x.strip():
                    continue
                if not flag and x in cutted_query:
                    flag = True
                    content = ""
                if flag and cnt < word_num:
                    cnt += 1
                    content += (x if x not in cutted_query else
                                "<span class='highlight'>{0}</span>".format(x))
                elif cnt >= word_num:
                    break
        single_result["content"] = content if flag else content[:100]
        result.append(single_result)
    return result
    def get_most_similar(self, sentence, do_log=False):
        """Find the indexed passage most similar to *sentence* using BM25.

        Each whitespace token of *sentence* that is not a stopword (global
        ``sw``) becomes a SHOULD TermQuery on the "pa" field. The index is
        searched with BM25 similarity (parameters from ``Config``).

        Returns ``(top_scoreDoc, int(id))`` for the best hit, or
        ``(None, -1)`` when nothing matched.

        NOTE(review): the DirectoryReader opened here is never closed, and a
        new one is opened per call — a potential resource leak; confirm
        whether the reader should be cached or closed.
        """
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        # BM25 parameters come from the project Config.
        simi = BM25Similarity(Config.k1, Config.b)
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
Пример #7
0
def run(searcher, analyzer):
    """Run one hard-coded demo query against *searcher* and print the hits.

    The query is parsed into field->text pairs by parseCommand; each pair
    becomes a MUST clause of one BooleanQuery. Prints path/name/title/url
    for up to 50 matching documents.
    """
    print()
    print("Hit enter with no input to quit.")
    # Query is hard-coded for this demo run (interactive input disabled).
    command = '环保节能社会 site:guancha.cn'
    if command == '':
        return

    print()
    print("Searching for:", command)

    command_dict = parseCommand(command)
    print(command_dict)

    builder = BooleanQuery.Builder()
    for field, text in command_dict.items():
        builder.add(QueryParser(field, analyzer).parse(text),
                    BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(builder.build(), 50).scoreDocs
    print("%s total matching documents." % len(scoreDocs))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print("------------------------")
        print('path:', doc.get("path"))
        print('name:', doc.get("name"))
        print('title:', doc.get('title'))
        print('url:', doc.get('url'))
Пример #8
0
def combine_queries(q1, q2):
    '''Combine the two given queries into a BooleanQuery with the AND
    operator.'''
    builder = BooleanQuery.Builder()
    for sub_query in (q1, q2):
        # Both sub-queries are mandatory (logical AND).
        builder.add(sub_query, BooleanClause.Occur.MUST)
    return builder.build()
 def get_or_query(self, queries):
     """Combine *queries* into one BooleanQuery with OR (SHOULD) semantics."""
     builder = BooleanQuery.Builder()
     for sub_query in queries:
         # Any clause may match; scoring rewards documents matching more.
         builder.add(sub_query, BooleanClause.Occur.SHOULD)
     return builder.build()
Пример #10
0
    def testFlat(self):
        """A flat disjunction of all four test queries matches one document."""
        builder = BooleanQuery.Builder()
        for sub in (self.t1, self.t2, self.c1, self.c2):
            builder.add(BooleanClause(sub, BooleanClause.Occur.SHOULD))
        self.assertEqual(1, self.search(builder.build()))
Пример #11
0
    def get_and_query(self, queries):
        """Combine *queries* into one BooleanQuery with AND (MUST) semantics.

        NOTE(review): the original comment claimed Similarity.coord() is
        disabled, but ``setDisableCoord(False)`` leaves coord *enabled*
        (and the method no longer exists on BooleanQuery.Builder in recent
        Lucene versions) — confirm the intended coord behavior.
        """
        bq_builder = BooleanQuery.Builder()
        bq_builder.setDisableCoord(False)

        for q in queries:
            # Every clause is mandatory (logical AND).
            bq_builder.add(q, BooleanClause.Occur.MUST)
        bq = bq_builder.build()

        return bq
    def __init__(self, index_path=os.path.join(ROOT_DIR, 'corpus/indexRI')):
        """Initialize the Lucene search components.

        Opens the on-disk index at *index_path* and prepares an analyzer,
        reader, searcher, an empty BooleanQuery.Builder for accumulating
        constraints, and the project query parser.

        :param index_path: filesystem path of the Lucene index directory
        """
        self.analyzer = StandardAnalyzer()
        # Directory -> reader -> searcher: each layer wraps the previous one.
        self.index = SimpleFSDirectory(File(index_path).toPath())
        self.reader = DirectoryReader.open(self.index)
        self.searcher = IndexSearcher(self.reader)
        # Accumulates clauses until get_results() builds and resets it.
        self.constrained_query = BooleanQuery.Builder()
        self.parser = Parser()
Пример #13
0
def expandQuery(ixreader, result, nrRelevant):
    """Build an expanded query from the top *nrRelevant* relevant hits.

    For each of the first *nrRelevant* entries of *result* (scoreDocs), a
    MoreLikeThis query is generated and OR-ed (Occur.SHOULD) into one
    BooleanQuery.

    :param ixreader: IndexReader backing the MoreLikeThis instance
    :param result: iterable of scoreDocs (objects with a ``.doc`` id)
    :param nrRelevant: how many top hits to expand from
    :return: the built BooleanQuery
    """
    mlt = MoreLikeThis(ixreader)
    relevant = []
    # BUG FIX: the original sliced result[0:nrRelevant - 1], which dropped
    # the last relevant hit (off-by-one).
    for hit in result[:nrRelevant]:
        stored_doc = ixreader.doc(hit.doc)
        # NOTE(review): mlt.like() is given the stored Document here;
        # Lucene's MoreLikeThis.like(int) expects a doc number — confirm
        # which PyLucene overload is intended.
        relevant.append(mlt.like(stored_doc))

    querybuilder = BooleanQuery.Builder()
    for like_query in relevant:
        querybuilder.add(like_query, BooleanClause.Occur.SHOULD)

    return querybuilder.build()
Пример #14
0
    def search(self, command_dict):
        '''
        Search for the query in the Lucene index.

        Input: `command_dict`: dict containing preprocessed query
        Output: score_docs satisfying the requirement
        '''
        builder = BooleanQuery.Builder()
        for field, text in command_dict.items():
            # Every field constraint is mandatory (logical AND).
            parsed = QueryParser(field, self.analyzer).parse(text)
            builder.add(parsed, BooleanClause.Occur.MUST)
        return self.searcher.search(builder.build(), 50).scoreDocs
Пример #15
0
 def _luceneQueryBuilder(self,
                         prefix,
                         sets=None,
                         setsMask=None,
                         partition=None):
     """Assemble a BooleanQuery.Builder from the given filter criteria.

     prefix adds a mandatory term on PREFIX_FIELD; *sets* adds a mandatory
     OR-group over SETS_FIELD; every entry of *setsMask* adds a mandatory
     SETS_FIELD term; *partition* adds a mandatory OR-group of integer
     range queries on HASH_FIELD. With no criteria at all, a
     MatchAllDocsQuery clause is added so the query matches everything.

     Returns the builder (not a built query).
     """
     builder = BooleanQuery.Builder()
     clause_count = 0
     if prefix:
         builder.add(TermQuery(Term(PREFIX_FIELD, prefix)),
                     BooleanClause.Occur.MUST)
         clause_count += 1
     if sets:
         # Any one of the requested sets may match (OR semantics).
         any_set = BooleanQuery.Builder()
         for set_spec in sets:
             any_set.add(TermQuery(Term(SETS_FIELD, set_spec)),
                         BooleanClause.Occur.SHOULD)
         builder.add(any_set.build(), BooleanClause.Occur.MUST)
         clause_count += 1
     for masked_set in (setsMask or []):
         # Every masked set is individually mandatory (AND semantics).
         builder.add(TermQuery(Term(SETS_FIELD, masked_set)),
                     BooleanClause.Occur.MUST)
         clause_count += 1
     if partition:
         range_queries = [
             IntPoint.newRangeQuery(HASH_FIELD, start, stop - 1)
             for start, stop in partition.ranges()
         ]
         if len(range_queries) == 1:
             partition_query = range_queries[0]
         else:
             any_range = BooleanQuery.Builder()
             for range_query in range_queries:
                 any_range.add(range_query, BooleanClause.Occur.SHOULD)
             partition_query = any_range.build()
         builder.add(partition_query, BooleanClause.Occur.MUST)
         clause_count += 1
     if clause_count == 0:
         # No criteria at all: match every document.
         builder.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
     return builder
Пример #16
0
def getQueryBuiler():
    """Build a BooleanQuery.Builder from the global ``args.search`` specs.

    Each spec is a space-separated string:
        "<occur> <type> <field> <arg1> [<arg2>]"
    where <occur> is must/should/filter/must_not and <type> is one of
    query, intrange, or termrange. With no specs, a MatchAllDocsQuery
    MUST clause is added.

    Returns the (unbuilt) BooleanQuery.Builder.
    """
    boolean_query = BooleanQuery.Builder()

    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query

    for spec in args.search:
        curSearch = spec.split(' ')

        # BUG FIX: reset per iteration; the original silently reused the
        # previous iteration's query (or raised NameError) on an unknown
        # spec type.
        query = None
        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            # BUG FIX: IntPoint range bounds must be ints, not strings.
            query = IntPoint.newRangeQuery(curSearch[2],
                                           int(curSearch[3]),
                                           int(curSearch[4]))
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate,
                                                  upperDate, True, True)

        if query is None:
            print('raise exception')
            continue

        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            print('raise exception')

    return boolean_query
Пример #17
0
 def query_section(self, section):
     """Collect all sub-documents of self._id belonging to *section*.

     Matches ids of the form "<self._id>.<anything>" whose 'section' field
     equals *section*, copies their stored fields into self._resDict keyed
     by id, and finally records the current document itself.

     Returns self so calls can be chained (fluent style).

     NOTE(review): the *section* parameter is shadowed by the loop variable
     ``section = doc.get('section')`` below, so the final
     ``res_dict[self._id] = {'document': section}`` uses the *last hit's*
     section value, not the argument — confirm whether that is intended.
     """
     searcher = self._searcher
     # Sub-document ids look like "<id>.<suffix>"; match them by regexp.
     query_doc = RegexpQuery(Term('id', self._id + '\\..+'))
     query_section = TermQuery(Term('section', section))
     query = BooleanQuery.Builder()
     bc1 = BooleanClause(query_doc, BooleanClause.Occur.MUST)
     bc2 = BooleanClause(query_section, BooleanClause.Occur.MUST)
     query = query.add(bc1).add(bc2).build()
     top_docs = searcher.search(query, 1000000)
     hits = top_docs.scoreDocs
     res_dict = {}
     for hit in hits:
         doc = searcher.doc(hit.doc)
         id = doc.get('id')  # NOTE(review): shadows the builtin ``id``
         document = doc.get('document')
         section = doc.get('section')
         author = doc.get('author')
         dynasty = doc.get('dynasty')
         type = doc.get('type')  # NOTE(review): shadows the builtin ``type``
         text = doc.get('text')
         color = doc.get('color')
         area = doc.get('area')
         zhujie = doc.get('zhujie')
         detail = doc.get('detail')
         res_dict[id] = {}
         # Only copy fields that are actually present (non-empty).
         if document:
             res_dict[id]['document'] = document
         if section:
             res_dict[id]['section'] = section
         if author:
             res_dict[id]['author'] = author
         if dynasty:
             res_dict[id]['dynasty'] = dynasty
         if type:
             res_dict[id]['type'] = type
         if text:
             res_dict[id]['text'] = text
         if color:
             res_dict[id]['color'] = color
         if area:
             res_dict[id]['area'] = area
         if zhujie:
             res_dict[id]['zhujie'] = zhujie
         if detail:
             res_dict[id]['detail'] = detail
     res_dict[self._id] = {'document': section}
     self._resDict = res_dict
     return self
    def get_results(self, nb_results=1000):
        """Run the accumulated constrained query and return matching hits.

        Builds the query collected in self.constrained_query, searches for
        up to *nb_results* documents, resets the builder for the next query,
        and returns a de-duplicated list of {field_name: value} dicts.

        :param nb_results: maximum number of documents to retrieve
        :return: list of de-duplicated field dicts
        """
        score_docs = self.searcher.search(self.constrained_query.build(),
                                          nb_results).scoreDocs
        # Start a fresh builder so subsequent queries begin empty.
        self.constrained_query = BooleanQuery.Builder()

        hits = []
        for score_doc in score_docs:
            stored = self.reader.document(score_doc.doc)
            hits.append({field.name(): field.stringValue()
                         for field in stored.getFields()})

        return self.remove_duplicates(hits)
Пример #19
0
    def _parse_query(self, field_name, query):
        """Analyze *query* and build an OR-combined set of phrase queries.

        Runs the analyzer over *query*, turns every emitted token into a
        PhraseQuery on *field_name* (splitting multi-word tokens on spaces,
        one position per word), and combines the phrases with SHOULD.
        """
        stream = self.analyzer.tokenStream("dummy", StringReader(query))
        term_attr = stream.getAttribute(CharTermAttribute.class_)
        stream.reset()
        tokens = []
        while stream.incrementToken():
            tokens.append(term_attr.toString())
        stream.end()
        stream.close()

        bool_builder = BooleanQuery.Builder()
        for token in tokens:
            phrase = PhraseQuery.Builder()
            for position, word in enumerate(token.split(' ')):
                phrase.add(Term(field_name, word), position)
            bool_builder.add(phrase.build(), BooleanClause.Occur.SHOULD)
        return bool_builder.build()
Пример #20
0
    def pairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # pair = (title_with_underscores, content)
        title = pair[0].replace('_', ' ')
        content = pair[1]

        parser = QueryParser("content_section", self.analyzer)
        title_query = parser.parse(QueryParser.escape(title))
        content_query = parser.parse(QueryParser.escape(content))

        # Title must match but does not affect scoring (FILTER);
        # content contributes to the score (SHOULD).
        builder = BooleanQuery.Builder()
        builder.add(title_query, BooleanClause.Occur.FILTER)
        builder.add(content_query, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        return self.searcher.search(builder.build(), 6).scoreDocs
Пример #21
0
 def searchAncient(self, field):
     """Search the ancient-text index on *field* and collect matching hits.

     Combines the main search word with any extra field constraints from
     self._fields (all MUST), then for each hit extracts the detail dict,
     applies doc_hit / key_filter, and optionally filters by upper-level
     section/document titles. Matches are appended to self._match and
     self._resultSentencesList.

     Returns self so calls can be chained (fluent style).
     """
     indexDir = SimpleFSDirectory(Paths.get(self._dir))
     sear = IndexSearcher(DirectoryReader.open(indexDir))
     bq = BooleanQuery.Builder()
     q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
     bq.add(BooleanClause(q, BooleanClause.Occur.MUST))
     search_fields = self._fields
     for name in search_fields:
         # section/document are handled separately via the title filter.
         if name == 'section' or name == 'document':
             continue
         queryx = QueryParser(name, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[name]))
         bq.add(BooleanClause(queryx, BooleanClause.Occur.MUST))
     query = bq.build()
     hits = sear.search(query, 9999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         doc_id = doc.get('id')
         detail = get_detail(doc)
         zhujie = detail['zhujie']
         # Flatten the nested detail['detail']['detail'] layer when present.
         if detail['detail'] and 'detail' in detail['detail']:
             detail['detail'] = detail['detail']['detail']
         detail.pop('zhujie')
         detail.pop('text')
         detail.pop('type')
         detail = json.dumps(detail)
         self._doc[doc_id] = res
         if doc_hit(res, self._words):
             f = key_filter(self._words, self._re, res)
             if f:
                 if 'section' in search_fields:
                     if not search_upper_title_filter(doc_id, sear, search_fields['section'], 0):
                         continue
                 if 'document' in search_fields:
                     if not search_upper_title_filter(doc_id, sear, search_fields['document'], 1):
                         continue
                 self._match.append(f)
                 self._resultSentencesList.append((doc_id, res, detail, zhujie))
                 # (leftover debug prints of res/self._match were removed)
     return self
Пример #22
0
def get_user_query(positive_rated_movies):
    """Build an OR query from the user's profile (tags, genres, descriptions).

    Each non-empty profile component is escaped and parsed with its own
    parser, then added as a SHOULD clause.
    """
    tags, genres, descriptions = get_user_profile(positive_rated_movies)

    query_builder = BooleanQuery.Builder()
    components = ((tags, tags_parser),
                  (genres, genres_parser),
                  (descriptions, descr_parser))
    for raw_text, parser in components:
        if raw_text == '':
            continue
        parsed = parser.parse(parser.escape(raw_text))
        query_builder.add(parsed, BooleanClause.Occur.SHOULD)

    return query_builder.build()
Пример #23
0
    def multiFieldsPairSearch(self, pair, sim):
        """
        Method that searches through documents using only content_section Field
        searchDir : the path to the folder that contains the index.
        """
        # pair = (title_with_underscores, content)
        title = pair[0].replace('_', ' ')
        content = pair[1]

        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        title_query = MultiFieldQueryParser.parse(parser,
                                                  QueryParser.escape(title))
        content_query = MultiFieldQueryParser.parse(parser,
                                                    QueryParser.escape(content))

        # Title must match but does not affect scoring (FILTER);
        # content contributes to the score (SHOULD).
        builder = BooleanQuery.Builder()
        builder.add(title_query, BooleanClause.Occur.FILTER)
        builder.add(content_query, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        return self.searcher.search(builder.build(), 6).scoreDocs
Пример #24
0
def find_results(query, reader):
    """
    For the given `query`, search the index against the 'content' field in the index.

    The abstract field is boosted by ABSTRACT_BOOST and combined (SHOULD)
    with the boosted content query (MUST). For each hit, highlighted text
    fragments are extracted from the content; hits with no non-empty
    fragments are skipped.

    Returns a list of Result objects.
    """
    searcher = IndexSearcher(reader)
    content_query = QueryParser('content', Analyzer()).parse(query)
    highlighter = build_highlighter(content_query)

    abstract_query = QueryParser('abstract', Analyzer()).parse(query)
    abstract_query = BoostQuery(abstract_query,
                                ABSTRACT_BOOST)  # boost the abstract
    content_query = BoostQuery(content_query, CONTENT_BOOST)

    # query on both the abstract and the content field
    query_builder = BooleanQuery.Builder()
    query_builder.add(abstract_query, BooleanClause.Occur.SHOULD)
    query_builder.add(content_query, BooleanClause.Occur.MUST)
    query = query_builder.build()

    hits = searcher.search(query, MAX_N_DOCS).scoreDocs
    results = []

    for hit in hits:
        doc = searcher.doc(hit.doc)

        content = doc.get('content')
        stream = TokenSources.getTokenStream('content', content, Analyzer())
        fragments = highlighter.getBestTextFragments(
            stream, content, MERGE_CONTIGUOUS_FRAGMENTS, MAX_N_FRAGMENTS)
        # BUG FIX: ``unicode`` is Python 2 only and raises NameError on
        # Python 3; ``str`` converts the Java TextFragment the same way.
        fragments = [str(f).strip() for f in fragments]
        fragments = [f for f in fragments if f != '']  # no empty fragments

        if not ''.join(fragments) == '':
            results.append(
                Result(doc.get('name'), doc.get('path'), fragments, hit.doc,
                       reader))

    return results
Пример #25
0
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms relative to the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number, not
            a specific item for which compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        # Raise the clause limit: the generated query can contain one clause
        # per term of every liked document.
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        # Seed the per-field text accumulators from the first liked document.
        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        # NOTE(review): this loop re-reads ``field_list`` (the first doc's
        # fields) for every remaining liked document instead of fetching
        # that document's own fields — confirm whether that is intended.
        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                # Boost each field query by the user's rating for the item.
                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            # Restrict results to the candidate ids via a mandatory clause.
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        # Over-fetch so liked documents can be skipped below.
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        # NOTE(review): pd.concat inside the loop is O(n^2); collecting rows
        # and concatenating once would be faster, but would change the
        # resulting index values.
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
Пример #26
0
    def testPhraseQueryInConjunctionScorer(self):
        """Phrase queries must score correctly inside boolean conjunctions.

        Phase 1: indexes two docs; checks the phrase "marketing info" matches
        both, and that AND-ing it with a term only one doc has matches one.
        Phase 2: adds three docs containing "map ... entry"; checks the
        phrase and term queries individually and AND-ed in both clause
        orders (order must not affect the hit count).
        """
        writer = self.getWriter()

        doc = Document()
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        # Phrase "marketing info" occurs in both documents.
        builder = PhraseQuery.Builder()
        builder.add(Term("source", "marketing"))
        builder.add(Term("source", "info"))
        phraseQuery = builder.build()
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # AND with "foobar" narrows to the single second document.
        termQuery = TermQuery(Term("contents", "foobar"))
        builder = BooleanQuery.Builder()
        builder.add(termQuery, BooleanClause.Occur.MUST)
        builder.add(phraseQuery, BooleanClause.Occur.MUST)
        booleanQuery = builder.build()
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(1, topDocs.totalHits)

        # Phase 2: add three more docs; only two contain the exact phrase
        # "map entry" (the third has a word in between).
        writer = self.getWriter()

        doc = Document()
        doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(
            Field("contents", "map foobarword entry woo",
                  TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        termQuery = TermQuery(Term("contents", "woo"))
        builder = PhraseQuery.Builder()
        builder.add(Term("contents", "map"))
        builder.add(Term("contents", "entry"))

        topDocs = searcher.search(termQuery, 50)
        self.assertEqual(3, topDocs.totalHits)
        phraseQuery = builder.build()
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # Conjunction of term and phrase, in both clause orders: the hit
        # count must be identical regardless of order.
        builder = BooleanQuery.Builder()
        builder.add(termQuery, BooleanClause.Occur.MUST)
        builder.add(phraseQuery, BooleanClause.Occur.MUST)
        booleanQuery = builder.build()
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        builder = BooleanQuery.Builder()
        builder.add(phraseQuery, BooleanClause.Occur.MUST)
        builder.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery = builder.build()
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
Пример #27
0
    def search(self, field):
        """Build a span query from the parsed user query, run it against
        *field*, then post-filter hits with a verification regex.

        Word flags become SpanTermQuery/SpanOrQuery (simplified Chinese
        characters are OR-ed with their traditional variants) chained by
        SpanNearQuery; operator flags contribute the allowed slop between
        adjacent words.  Field filters from the user query are AND-ed in
        as RegexpQuery clauses.  Matching doc ids are stored in
        ``self._res`` and the regex in ``self._reg``.

        Returns ``self`` so calls can be chained (fluent style).
        """
        s = self._search
        u = self._userQuery
        zh_to_hant_dict = self._zh_to_hant_dict  # simplified char -> list of traditional variants
        info = u.getFlagsInfo()
        flags_list = u.getFlagsList()
        # sq_list holds a SpanQuery for each word flag, or a {"op", "num"}
        # dict for each operator flag, in flag order.
        sq_list = []
        # Indices into sq_list that hold actual word queries (not operator dicts).
        word_index_list = []
        index_count = 0
        for flag in flags_list:
            if flag["type"] == "word":
                word_index_list.append(index_count)
                if len(flag["content"]) == 1:
                    # Single character: OR the char with its traditional variants.
                    if flag["content"][0] in zh_to_hant_dict:
                        stq_list = [
                            SpanTermQuery(Term(field, flag["content"][0]))
                        ]
                        for hant in zh_to_hant_dict[flag["content"][0]]:
                            stq_list.append(SpanTermQuery(Term(field, hant)))
                        sq_list.append(SpanOrQuery(stq_list))
                    else:
                        sq_list.append(
                            SpanTermQuery(Term(field, flag["content"][0])))
                else:
                    # Multi-character word: adjacent (slop 0), in-order span,
                    # each position again OR-ed with traditional variants.
                    snq_list = []
                    for w in flag["content"]:
                        if w in zh_to_hant_dict:
                            stq_list = [SpanTermQuery(Term(field, w))]
                            for hant in zh_to_hant_dict[w]:
                                stq_list.append(
                                    SpanTermQuery(Term(field, hant)))
                            snq_list.append(SpanOrQuery(stq_list))
                        else:
                            snq_list.append(SpanTermQuery(Term(field, w)))
                    sq_list.append(SpanNearQuery(snq_list, 0, True))
            else:
                # Operator flag: remember its op name and numeric arguments;
                # consumed below when joining the neighbouring word queries.
                sq_list.append({
                    "op": info[flag["content"]]["op"],
                    "num": info[flag["content"]]["num"]
                })
            index_count += 1
        # Fold the word queries left-to-right into one SpanNearQuery chain.
        # An operator dict sitting directly before a word supplies the slop
        # (last value of its "num" list); otherwise the words must be adjacent.
        q = None
        count = 0
        for index in word_index_list:
            if count == 0:
                q = sq_list[index]
                count += 1
            else:
                if not isinstance(sq_list[index - 1], dict):
                    q = SpanNearQuery([q, sq_list[index]], 0, True)
                else:
                    q = SpanNearQuery([q, sq_list[index]],
                                      sq_list[index - 1]["num"][-1], True)
        query = q
        # Filter terms: each metadata field restricts hits via an OR-regex
        # of its allowed values, AND-ed with the span query.
        filters = u.getFields()
        bq = BooleanQuery.Builder()
        bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
        for key in filters.keys():
            cur_reg = '('
            for ft in filters[key]:
                cur_reg += ft + '|'
            cur_reg = cur_reg[0:-1] + ')'
            rq = RegexpQuery(Term(key, cur_reg))
            bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
        query = bq.build()
        top_docs = s.search(query, 9999)
        self._cur_field = field

        # Second pass: re-verify every Lucene hit against a Python regex
        # built from the same flags, keeping only true matches.
        reg = get_test_reg(flags_list, info, zh_to_hant_dict)
        doc_id_list = []
        hits = top_docs.scoreDocs
        for hit in hits:
            doc = s.doc(hit.doc)
            text = doc.get("text")
            match_res = re.search(reg, text)
            if match_res:
                doc_id_list.append(hit.doc)
        self._res = doc_id_list
        self._reg = reg
        return self
    def testSimilarity(self):
        """Verify SimpleSimilarity scoring on two tiny documents: a plain
        term scores 1.0, an OR of two terms scores doc-id + 1, an exact
        phrase scores 1.0, and a slop-2 phrase scores 0.5."""

        writer = self.getWriter(analyzer=SimpleAnalyzer(
            Version.LUCENE_CURRENT),
                                similarity=SimpleSimilarity())

        # Index "a c" and "a c b" as docs 0 and 1.
        for text in ("a c", "a c b"):
            doc = Document()
            doc.add(Field("field", text, TextField.TYPE_STORED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        searcher.setSimilarity(SimpleSimilarity())

        a = Term("field", "a")
        b = Term("field", "b")
        c = Term("field", "c")

        class TermScoreCollector(PythonSimpleCollector):
            # Every hit of a plain TermQuery must score exactly 1.0.
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(TermQuery(b), TermScoreCollector())

        bool_builder = BooleanQuery.Builder()
        bool_builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
        bool_builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)

        class BooleanScoreCollector(PythonSimpleCollector):
            # Under SimpleSimilarity the OR query scores global-doc-id + 1.
            def collect(_self, doc, score):
                self.assertEqual(doc + _self.base + 1, score)

            def doSetNextReader(_self, context):
                _self.base = context.docBase

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(bool_builder.build(), BooleanScoreCollector())

        class ExactPhraseCollector(PythonSimpleCollector):
            # Exact phrase "a c": full score of 1.0.
            def collect(_self, doc, score):
                self.assertEqual(1.0, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(PhraseQuery(a.field(), [a.bytes(), c.bytes()]),
                        ExactPhraseCollector())

        class SloppyPhraseCollector(PythonSimpleCollector):
            # Phrase "a b" with slop 2 matches at distance 2 -> score 0.5.
            def collect(_self, doc, score):
                self.assertEqual(0.5, score)

            def doSetNextReader(_self, context):
                pass

            def scoreMode(_self):
                return ScoreMode.COMPLETE

        searcher.search(PhraseQuery(2, a.field(), [a.bytes(), b.bytes()]),
                        SloppyPhraseCollector())
Пример #29
0
    def searchQ(self, query):
        """Run a boolean search built from *query* and collect entity
        suggestions from the hits.

        ``query.mainq`` terms are routed by prefix: ``!term`` is excluded
        (MUST_NOT), ``$term`` is required (MUST), and plain terms are
        optional (SHOULD) and expanded with up to 5 WordNet synonyms.
        ``query.journal`` / ``query.author`` add optional field clauses.

        Returns ``{"res": [Document, ...], "sug": {entity: hit_count}
        (top 10), "query": query}``.

        Fixes vs. previous revision: the thesaurus file handle is now
        closed (was leaked), string accumulation uses list-joins instead
        of repeated concatenation, and the hand-rolled synonym counter is
        replaced with ``islice``.
        """
        stemmer = PorterStemmer()
        # NOTE(review): hard-coded absolute path — should come from
        # configuration; kept as-is to preserve behavior.
        with open("C:/Users/Tigmanshu/Documents/IRWeb/ssProject/Lucene/wn_s.pl") as fp:
            thesaurus = Thesaurus.from_file(fp)

        result = []
        # Stems of user-supplied terms; excluded from entity suggestions.
        check_list = []
        not_terms = []
        must_terms = []
        should_terms = []
        for token in query.mainq.lower().split():
            if token[0] == '!':
                not_terms.append(token[1:])
            elif token[0] == '$':
                must_terms.append(token[1:])
                check_list.append(stemmer.stem(token[1:]))
            else:
                should_terms.append(token)
                # Expand optional terms with at most 5 thesaurus synonyms.
                for synonym in islice(thesaurus.synonyms(token), 5):
                    should_terms.append(synonym)
                check_list.append(stemmer.stem(token))
        notQ = " ".join(not_terms)
        mustQ = " ".join(must_terms)
        shouldQ = " ".join(should_terms)

        self.stopList = "a an the of is zero".split()
        searcher = IndexSearcher(DirectoryReader.open(self.openStore()))
        analyzer = StandardAnalyzer()

        # Each term group is searched in both the abstract and the
        # extracted entities with the matching occurrence constraint.
        builder = BooleanQuery.Builder()
        if shouldQ:
            builder.add(QueryParser("abstract", analyzer).parse(shouldQ),
                        BooleanClause.Occur.SHOULD)
            builder.add(QueryParser("entities", analyzer).parse(shouldQ),
                        BooleanClause.Occur.SHOULD)
        if mustQ:
            builder.add(QueryParser("abstract", analyzer).parse(mustQ),
                        BooleanClause.Occur.MUST)
            builder.add(QueryParser("entities", analyzer).parse(mustQ),
                        BooleanClause.Occur.MUST)
        if notQ:
            builder.add(QueryParser("abstract", analyzer).parse(notQ),
                        BooleanClause.Occur.MUST_NOT)
            builder.add(QueryParser("entities", analyzer).parse(notQ),
                        BooleanClause.Occur.MUST_NOT)

        if len(query.journal) > 0:
            print("$$$$$")
            print(query.journal)
            builder.add(QueryParser("journalName", analyzer).parse(query.journal),
                        BooleanClause.Occur.SHOULD)
        if len(query.author) > 0:
            builder.add(QueryParser("authorName", analyzer).parse(query.author),
                        BooleanClause.Occur.SHOULD)

        topDocs = searcher.search(builder.build(), 100)
        entity_hits = {}
        for score_doc in topDocs.scoreDocs:
            my_doc = d.Document()
            doc = searcher.doc(score_doc.doc)
            # Tally co-occurring entities for "related" suggestions,
            # skipping terms the user already searched and stop words.
            for entity in doc.get("entities").split():
                entity = entity.lower()
                if stemmer.stem(entity) in check_list:
                    continue
                if entity in self.stopList:
                    continue
                entity_hits[entity] = entity_hits.get(entity, 0) + 1
            my_doc.setDocTitle(doc.get("title"))
            my_doc.setAbstract(doc.get("abstract"))
            my_doc.setJournal(doc.get("journalName"))
            my_doc.setAuthor(doc.get("authorName"))
            my_doc.setURL(doc.get("url"))
            result.append(my_doc)

        # Top 10 entities by hit count, most frequent first.
        top_entities = dict(islice(
            sorted(entity_hits.items(),
                   key=operator.itemgetter(1), reverse=True), 10))
        return {"res": result, "sug": top_entities, "query": query}
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
from org.apache.lucene.index import DirectoryReader, Term

# Interactive QA lookup: read a question, OR-search its words against the
# indexed question field, and print the top matching Q/A pairs.

# Normalize Farsi yeh (U+06CC) to Arabic yeh (U+064A) to match the index.
input_q = input().replace('ی', 'ي')
lucene.initVM()
index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

directory = SimpleFSDirectory(index_path)
searcher = IndexSearcher(DirectoryReader.open(directory))

# Build a SHOULD clause per query word.  split() (no argument) skips the
# empty tokens that split(' ') produced on repeated spaces, which used to
# create TermQuery objects for the empty string.
query_builder = BooleanQuery.Builder()
for q_word in input_q.split():
    query_builder.add(
        BooleanClause(TermQuery(Term(question_field, q_word)),
                      BooleanClause.Occur.SHOULD))
query = query_builder.build()

top_n = 5
scoreDocs = searcher.search(query, top_n).scoreDocs
print('found nums: ', len(scoreDocs))
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    # Fixed user-facing typo: was "Best Math".
    print('Best Match: ', doc.get(question_field), '\n')
    print('Answer: ', doc.get(answer_field))
    print('---------------------\n')