Example #1
def simple_term_to_query(field, word_list, zh_to_hant_dict=None):
    """Build a span query for `word_list`, expanding each simplified
    character to its traditional variants via `zh_to_hant_dict`."""
    if not zh_to_hant_dict:
        zh_to_hant_dict = {}
    soq_list = []
    for words in word_list:
        snq_list = []
        for word in words:
            if len(word) == 1:
                # Single character: one term, OR-ed with its variants
                if word[0] in zh_to_hant_dict:
                    stq_list = [SpanTermQuery(Term(field, word[0]))]
                    for hant in zh_to_hant_dict[word[0]]:
                        stq_list.append(SpanTermQuery(Term(field, hant)))
                    snq_list.append(SpanOrQuery(stq_list))
                else:
                    snq_list.append(SpanTermQuery(Term(field, word[0])))
            else:
                # Multi-character word: its characters must appear adjacently
                snq = SpanNearQuery.Builder(field, True)
                snq.setSlop(0)
                for w in word:
                    if w in zh_to_hant_dict:
                        stq_list = [SpanTermQuery(Term(field, w))]
                        for hant in zh_to_hant_dict[w]:
                            stq_list.append(SpanTermQuery(Term(field, hant)))
                        snq.addClause(SpanOrQuery(stq_list))
                    else:
                        snq.addClause(SpanTermQuery(Term(field, w)))
                snq_list.append(snq.build())
        soq_list.append(SpanOrQuery(snq_list))
    if len(soq_list) == 1:
        return soq_list[0]
    return SpanNearQuery(soq_list, 999, False)
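A hedged usage sketch (the inputs below are illustrative assumptions, not part of the original example): word_list is a list of word groups, each group a list of words whose characters must appear adjacently, and zh_to_hant_dict maps a simplified character to its traditional variants.

zh_to_hant = {'为': ['為', '爲']}  # hypothetical variant table
q = simple_term_to_query('text', [['为什么']], zh_to_hant)
# q matches 为/為/爲, 什, 么 as adjacent terms in the 'text' field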
Example #2
def getSpanNearQuery(analyzer, s, field="title", slop=100, inOrder=True):
    keywords = tokenize_string(analyzer, s)
    spanTermQueries = [
        SpanMultiTermQueryWrapper(FuzzyQuery(Term(field, keyword)))
        for keyword in keywords
    ]
    return SpanNearQuery(spanTermQueries, slop, inOrder)
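The tokenize_string helper is not shown in this example; a minimal sketch of what it might look like, assuming Lucene's standard TokenStream API, is:

from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokenize_string(analyzer, s):
    # Run s through the analyzer and collect the emitted token texts
    tokens = []
    stream = analyzer.tokenStream("", StringReader(s))
    term_attr = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    while stream.incrementToken():
        tokens.append(term_attr.toString())
    stream.end()
    stream.close()
    return tokens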
Example #3
    def spanRegexQueryNrHits(self, regex1, regex2, slop, ordered):
        # Wrap the regex queries so they can participate in a span query
        srq1 = SpanMultiTermQueryWrapper(RegexQuery(self.newTerm(regex1)))
        srq2 = SpanMultiTermQueryWrapper(RegexQuery(self.newTerm(regex2)))
        query = SpanNearQuery([srq1, srq2], slop, ordered)
        return self.searcher.search(query, 50).totalHits
Example #4
    def get_coll_bigram_freq(self, bigram, field, ordered, slop, title,
                             field_cache='title'):
        # Serve the frequencies from the cache when possible
        if self.is_bigram_cache_used:
            item_tf = self.conn_bigram_tf_cache.find_one(
                {'title': title, 'bigram': bigram, 'field': field,
                 'ordered': ordered, 'slop': slop})
            item_cf = self.conn_bigram_cf_cache.find_one(
                {'bigram': bigram, 'field': field, 'ordered': ordered,
                 'slop': slop})
            if item_cf is not None:
                cf = int(item_cf['value'])
                tf = int(item_tf['value']) if item_tf is not None else 0
                return (tf, cf)
        searcher = self.getSecondarySearcher()
        span_clauses = []
        for term in bigram.split(' '):
            span_clauses.append(SpanTermQuery(Term(field, term)))

        builder = SpanNearQuery.Builder(field, ordered)
        for clause in span_clauses:
            builder.addClause(clause)
        builder.setSlop(slop)
        q_lucene = builder.build()

        # Walk the spans leaf by leaf, counting phrase occurrences per document
        sw = q_lucene.createWeight(searcher, False)
        doc_phrase_freq = {}
        for leaf in self.reader.getContext().leaves():
            spans = sw.getSpans(leaf, SpanWeight.Postings.POSITIONS)
            if spans is None:
                continue
            while spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS:
                doc_id = leaf.reader().document(spans.docID()).get(field_cache)
                if self.is_bigram_cache_used:
                    item_tf = self.conn_bigram_tf_cache.find_one(
                        {'bigram': bigram, 'field': field, 'ordered': ordered,
                         'title': doc_id, 'slop': slop})
                    if item_tf is not None:
                        # reuse the cached per-document frequency so it still
                        # contributes to the collection frequency below
                        doc_phrase_freq[doc_id] = int(item_tf['value'])
                        continue

                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 0
                while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
                    doc_phrase_freq[doc_id] += 1

                if self.is_bigram_cache_used:
                    self.conn_bigram_tf_cache.insert(
                        {'title': doc_id, 'bigram': bigram, 'field': field,
                         'ordered': ordered, 'slop': slop,
                         'value': doc_phrase_freq[doc_id]})
        cf = sum(doc_phrase_freq.values())
        if self.is_bigram_cache_used:
            self.conn_bigram_cf_cache.insert(
                {'bigram': bigram, 'field': field, 'ordered': ordered,
                 'slop': slop, 'value': cf})
        tf = doc_phrase_freq.get(title, 0)
        return tf, cf
Example #5
    def get_span_query(self, terms, field, slop, ordered=True):
        """
        Creates near span query

        :param terms: list of terms
        :param field: field name
        :param slop: maximum number of positions allowed between the query terms
        :param ordered: If true, ordered search; otherwise unordered search
        :return: lucene span near query
        """
        span_queries = []
        for term in terms:
            span_queries.append(SpanTermQuery(Term(field, term)))
        span_near_query = SpanNearQuery(span_queries, slop, ordered)
        return span_near_query
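A hedged usage sketch (the searcher and field name are assumptions): match "machine" before "learning" with at most two positions in between.

query = self.get_span_query(['machine', 'learning'], 'contents', slop=2)
top_docs = searcher.search(query, 10)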
Example #6
    def get_coll_bigram_freq(self,
                             bigram,
                             field,
                             ordered,
                             slop,
                             title,
                             field_cache='title'):
        searcher = self.getSecondarySearcher()
        span_clauses = []
        for term in bigram.split(' '):
            span_clauses.append(SpanTermQuery(Term(field, term)))

        builder = SpanNearQuery.Builder(field, ordered)
        for clause in span_clauses:
            builder.addClause(clause)
        builder.setSlop(slop)
        q_lucene = builder.build()

        sw = q_lucene.createWeight(searcher, False)
        doc_phrase_freq = {}
        for leaf in self.reader.getContext().leaves():
            spans = sw.getSpans(leaf, SpanWeight.Postings.POSITIONS)
            if spans is None:
                continue
            while spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS:
                doc_id = leaf.reader().document(spans.docID()).get(field_cache)
                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 0
                while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
                    doc_phrase_freq[doc_id] += 1
        cf = sum(doc_phrase_freq.values())
        tf = doc_phrase_freq.get(title, 0)
        return tf, cf
Example #7
    def get_coll_bigram_freq(self, bigram, field, ordered, slop, title,
                             **kwargs):
        field_id = kwargs.get('field_id', 'title')
        is_cached = kwargs.get('is_cached', True)
        is_bigram_cache_used = (self.is_bigram_cache_used and is_cached)
        if is_bigram_cache_used:
            item_tf = self.conn_bigram_tf_cache.find_one({
                field_id: title,
                'bigram': bigram,
                'field': field,
                'ordered': ordered,
                'slop': slop
            })
            item_cf = self.conn_bigram_cf_cache.find_one({
                'bigram': bigram,
                'field': field,
                'ordered': ordered,
                'slop': slop
            })
            if item_cf is not None:
                cf = int(item_cf['value'])
                if item_tf is not None:
                    tf = int(item_tf['value'])
                else:
                    tf = 0
                return (tf, cf)
        searcher = self.getSecondarySearcher()
        span_clauses = []
        for term in bigram.split(' '):
            span_clauses.append(SpanTermQuery(Term(field, term)))
        builder = SpanNearQuery.Builder(field, ordered)
        for clause in span_clauses:
            builder.addClause(clause)
        builder.setSlop(slop)
        q_lucene = builder.build()

        sw = q_lucene.createWeight(searcher, False)
        cf = 0
        doc_phrase_freq = {}
        for leaf in self.reader.getContext().leaves():
            spans = sw.getSpans(leaf, SpanWeight.Postings.POSITIONS)
            if spans is None:
                continue
            while spans.nextDoc() != DocIdSetIterator.NO_MORE_DOCS:
                doc_id = leaf.reader().document(spans.docID()).get(field_id)
                if is_bigram_cache_used:
                    item_tf = self.conn_bigram_tf_cache.find_one({
                        field_id: doc_id,
                        'bigram': bigram,
                        'field': field,
                        'ordered': ordered,
                        'slop': slop
                    })
                    if item_tf is not None:
                        # reuse the cached per-document frequency
                        tf = int(item_tf['value'])
                        doc_phrase_freq[doc_id] = tf
                        cf += tf
                        continue

                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 0
                while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
                    doc_phrase_freq[doc_id] += 1
                    cf += 1

                if is_bigram_cache_used:
                    self.conn_bigram_tf_cache.insert({
                        field_id: doc_id,
                        'bigram': bigram,
                        'field': field,
                        'ordered': ordered,
                        'slop': slop,
                        'value': doc_phrase_freq[doc_id]
                    })

        if is_bigram_cache_used:
            self.conn_bigram_cf_cache.insert({
                'bigram': bigram,
                'field': field,
                'ordered': ordered,
                'slop': slop,
                'value': cf
            })
        tf = doc_phrase_freq.get(title, 0)
        return tf, cf
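The conn_bigram_tf_cache and conn_bigram_cf_cache objects are used like pymongo collections (find_one/insert); a minimal setup sketch under that assumption, with hypothetical database and collection names:

# e.g., inside the indexer's __init__:
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
self.conn_bigram_tf_cache = client['bigram_cache']['tf_cache']
self.conn_bigram_cf_cache = client['bigram_cache']['cf_cache']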
Example #8
    def search(self, field):
        s = self._search
        u = self._userQuery
        zh_to_hant_dict = self._zh_to_hant_dict
        info = u.getFlagsInfo()
        flags_list = u.getFlagsList()
        sq_list = []
        word_index_list = []
        index_count = 0
        for flag in flags_list:
            if flag["type"] == "word":
                word_index_list.append(index_count)
                if len(flag["content"]) == 1:
                    if flag["content"][0] in zh_to_hant_dict:
                        stq_list = [
                            SpanTermQuery(Term(field, flag["content"][0]))
                        ]
                        for hant in zh_to_hant_dict[flag["content"][0]]:
                            stq_list.append(SpanTermQuery(Term(field, hant)))
                        sq_list.append(SpanOrQuery(stq_list))
                    else:
                        sq_list.append(
                            SpanTermQuery(Term(field, flag["content"][0])))
                else:
                    snq_list = []
                    for w in flag["content"]:
                        if w in zh_to_hant_dict:
                            stq_list = [SpanTermQuery(Term(field, w))]
                            for hant in zh_to_hant_dict[w]:
                                stq_list.append(
                                    SpanTermQuery(Term(field, hant)))
                            snq_list.append(SpanOrQuery(stq_list))
                        else:
                            snq_list.append(SpanTermQuery(Term(field, w)))
                    sq_list.append(SpanNearQuery(snq_list, 0, True))
            else:
                sq_list.append({
                    "op": info[flag["content"]]["op"],
                    "num": info[flag["content"]]["num"]
                })
            index_count += 1
        q = None
        count = 0
        for index in word_index_list:
            if count == 0:
                q = sq_list[index]
                count += 1
            else:
                if not isinstance(sq_list[index - 1], dict):
                    q = SpanNearQuery([q, sq_list[index]], 0, True)
                else:
                    q = SpanNearQuery([q, sq_list[index]],
                                      sq_list[index - 1]["num"][-1], True)
        query = q
        # Filter clauses
        filters = u.getFields()
        bq = BooleanQuery.Builder()
        bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
        for key in filters.keys():
            cur_reg = '(' + '|'.join(filters[key]) + ')'
            rq = RegexpQuery(Term(key, cur_reg))
            bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
        query = bq.build()
        top_docs = s.search(query, 9999)
        self._cur_field = field

        reg = get_test_reg(flags_list, info, zh_to_hant_dict)
        doc_id_list = []
        hits = top_docs.scoreDocs
        for hit in hits:
            doc = s.doc(hit.doc)
            text = doc.get("text")
            match_res = re.search(reg, text)
            if match_res:
                doc_id_list.append(hit.doc)
        self._res = doc_id_list
        self._reg = reg
        return self
Example #9
    def testPayloadsPos0(self):

        writer = self.getWriter(analyzer=TestPayloadAnalyzer())

        doc = Document()
        doc.add(Field("content", "a a b c d e a f g h i j a b k k",
                      TextField.TYPE_STORED))
        writer.addDocument(doc)
        reader = self.getOnlyLeafReader(writer.getReader())
        writer.close()

        tp = reader.postings(Term("content", "a"), PostingsEnum.ALL)

        count = 0
        self.assert_(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)

        # "a" occurs 4 times
        self.assertEqual(4, tp.freq())
        self.assertEqual(0, tp.nextPosition())
        self.assertEqual(1, tp.nextPosition())
        self.assertEqual(3, tp.nextPosition())
        self.assertEqual(6, tp.nextPosition())

        # only one doc has "a"
        self.assertEqual(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc())

        searcher = self.getSearcher()

        stq1 = SpanTermQuery(Term("content", "a"))
        stq2 = SpanTermQuery(Term("content", "k"))
        sqs = [stq1, stq2 ]
        snq = SpanNearQuery(sqs, 30, False)

        count = 0
        collector = PayloadSpanCollector()
        pspans = snq.createWeight(searcher, False, 1.0).getSpans(
            searcher.getIndexReader().leaves().get(0),
            SpanWeight.Postings.PAYLOADS)

        sawZero = False
        while pspans.nextDoc() != Spans.NO_MORE_DOCS:
            while pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
                collector.reset()
                pspans.collect(collector)
                sawZero = sawZero or pspans.startPosition() == 0
                for payload in collector.payloads:
                    count += 1

        self.assert_(sawZero)
        self.assertEqual(8, count)

        spans = snq.createWeight(searcher, False, 1.0).getSpans(
            searcher.getIndexReader().leaves().get(0),
            SpanWeight.Postings.POSITIONS)
        count = 0

        sawZero = False
        while spans.nextDoc() != Spans.NO_MORE_DOCS:
            while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
                count += 1
                sawZero = sawZero or spans.startPosition() == 0

        self.assertEqual(4, count)
        self.assert_(sawZero)
Example #10
    def search(self, field):
        s = self._search
        u = self._userQuery
        z = self._zh_to_hant_dict
        keys = u.getKey()
        nums = u.getNum()
        word_list = u.getWordList()
        filters = u.getFields()
        # Only filter clauses, no search terms
        if len(word_list) == 0:
            query = None
        # A simple term
        elif len(keys) == 0:
            query = simple_term_to_query(field, word_list[0], z)
        # '#': unordered co-occurrence within nums[0] positions
        elif keys[0] == '#':
            query_left = simple_term_to_query(field, word_list[0], z)
            query_right = simple_term_to_query(field, word_list[1], z)
            query = SpanNearQuery([query_left, query_right], int(nums[0]),
                                  False)
        elif keys[0] == '+' or keys[0] == '$':
            prev_query = simple_term_to_query(field, word_list[0], z)
            for i in range(len(keys)):
                cur_query = simple_term_to_query(field, word_list[i + 1], z)
                if keys[i] == '+':
                    # '+': exactly nums[i] arbitrary characters in between
                    span_list = [prev_query]
                    for j in range(int(nums[i])):
                        span = SpanMultiTermQueryWrapper(
                            RegexpQuery(Term(field, '.')))
                        span_list.append(span)
                    span_list.append(cur_query)
                    prev_query = SpanNearQuery(span_list, 0, True)
                else:
                    # '$': ordered, at most nums[i] positions in between
                    span_list = [prev_query, cur_query]
                    prev_query = SpanNearQuery(span_list, int(nums[i]), True)
            query = prev_query
        elif keys[0] == '-' or keys[0] == '~':
            # Exclude matches where both terms fall within the window
            query_left = simple_term_to_query(field, word_list[0], z)
            query_right = simple_term_to_query(field, word_list[1], z)
            if keys[0] == '-':
                n_q_list = [query_left, query_right]
            else:
                n_q_list = [query_right, query_left]
            n_query = SpanNearQuery(n_q_list, int(nums[0]) - 1, True)
            bq = BooleanQuery.Builder()
            bc1 = BooleanClause(query_left, BooleanClause.Occur.MUST)
            bc2 = BooleanClause(n_query, BooleanClause.Occur.MUST_NOT)
            query = bq.add(bc1).add(bc2).build()
        else:
            raise ValueError("Invalid search expression!")
        # Filter clauses
        bq = BooleanQuery.Builder()
        if query:
            bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
        for key in filters.keys():
            cur_reg = '(' + '|'.join(filters[key]) + ')'
            rq = RegexpQuery(Term(key, cur_reg))
            bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
        query = bq.build()
        self._res = s.search(query, 100000)
        self._cur_field = field
        return self
Example #11
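This fragment runs at script level; a plausible import block for it, assuming PyLucene 6+ package paths, would be:

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher, PhraseQuery, RegexpQuery
from org.apache.lucene.search.spans import SpanMultiTermQueryWrapper, SpanNearQuery
from org.apache.lucene.store import FSDirectory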
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    directory = FSDirectory.open(file)
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)

    term = Term("contents", "tiger")
    print(f'Tiger frequency: {reader.totalTermFreq(term)}')

    q_regex = RegexpQuery(Term("contents", r"[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
        f'spanquery results: {searcher.search(spannearquery, 1000000).totalHits}'
    )

    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"')
    print(q)  # prints contents:"tiger leopard"
    print(searcher.search(q, 10000000).totalHits)

    phrase_query = PhraseQuery(10, 'contents', 'tiger', 'leopard')  # terms are passed separately
    print(phrase_query)
    print(searcher.search(phrase_query, 10000000).totalHits)

    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"~10')
    print(searcher.search(q, 10000000).totalHits)
Example #12
    def testPayloadsPos0(self):

        writer = self.getWriter(analyzer=TestPayloadAnalyzer())

        doc = Document()
        doc.add(
            Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
        writer.addDocument(doc)
        reader = writer.getReader()
        writer.close()

        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getLiveDocs(reader),
                                              "content", BytesRef("a"))

        count = 0
        self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)
        # "a" occurs 4 times
        self.assertEqual(4, tp.freq())

        self.assertEqual(0, tp.nextPosition())
        self.assertEqual(1, tp.nextPosition())
        self.assertEqual(3, tp.nextPosition())
        self.assertEqual(6, tp.nextPosition())

        # only one doc has "a"
        self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

        searcher = self.getSearcher(reader=reader)

        stq1 = SpanTermQuery(Term("content", "a"))
        stq2 = SpanTermQuery(Term("content", "k"))
        sqs = [stq1, stq2]
        snq = SpanNearQuery(sqs, 30, False)

        count = 0
        sawZero = False
        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        while pspans.next():
            payloads = pspans.getPayload()
            sawZero |= pspans.start() == 0

            it = payloads.iterator()
            while it.hasNext():
                count += 1
                it.next()

        self.assertEqual(5, count)
        self.assert_(sawZero)

        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        count = 0
        sawZero = False
        while spans.next():
            count += 1
            sawZero |= spans.start() == 0

        self.assertEqual(4, count)
        self.assert_(sawZero)

        sawZero = False
        psu = PayloadSpanUtil(searcher.getTopReaderContext())
        pls = psu.getPayloadsForQuery(snq)
        count = pls.size()
        it = pls.iterator()
        while it.hasNext():
            payload_bytes = JArray('byte').cast_(it.next())
            s = payload_bytes.string_
            sawZero |= s == "pos: 0"

        self.assertEqual(5, count)
        self.assert_(sawZero)