Example #1
 def search(self, field: str):
     sear = self._search
     if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
         query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
     elif self._commandInfo.getKey()[0] == '#':
         query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
         query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
         bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
         bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
         query = BooleanQuery.Builder().add(bc1).add(bc2).build()
     elif self._commandInfo.getKey()[0] in ['$', '+']:
         bq = BooleanQuery.Builder()
         for w in self._commandInfo.getWordList():
             queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
             bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
             bq.add(bc)
         query = bq.build()
     else:
         query = ''
     hits = sear.search(query, 999999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         id = doc.get(field+'_id')
         if doc_hit(res, self._commandInfo):
             sentences = re.split('[!?!?。]', res)
             sentences = [s for s in sentences if s != '']  # drop empty strings left by re.split
             for sentence in sentences:
                 if key_filter(self._commandInfo, sentence):
                     self._doc[id] = res
                     self._resultSentencesList.append((id, sentence))
     return self
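All of the examples in this listing follow the same pattern: wrap an arbitrary Query in a BooleanClause with an Occur flag (MUST behaves like AND, SHOULD like OR, MUST_NOT like NOT), add the clauses to a BooleanQuery.Builder, and run the built query through an IndexSearcher. A minimal sketch of the three Occur values, assuming an initialized PyLucene VM and a hypothetical "text" field:

from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery

# Occur semantics: MUST ~ AND, SHOULD ~ OR, MUST_NOT ~ NOT (field and terms are hypothetical)
b = BooleanQuery.Builder()
b.add(BooleanClause(TermQuery(Term("text", "alpha")), BooleanClause.Occur.MUST))
b.add(BooleanClause(TermQuery(Term("text", "beta")), BooleanClause.Occur.SHOULD))
b.add(BooleanClause(TermQuery(Term("text", "gamma")), BooleanClause.Occur.MUST_NOT))
query = b.build()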
Example #2
    def testFlat(self):

        q = BooleanQuery()
        q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        self.assertEqual(1, self.search(q))
Example #3
    def testFlat(self):

        b = BooleanQuery.Builder()
        b.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        b.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        b.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        b.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        q = b.build()
        self.assertEqual(1, self.search(q))
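Examples #2 and #3 build the same flat disjunction: #2 uses the older mutable BooleanQuery, which newer Lucene releases no longer provide, while #3 uses its BooleanQuery.Builder replacement. If at least one of the optional clauses has to match, the Builder can state that explicitly; a small sketch with hypothetical term queries tq1 and tq2:

from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery

tq1 = TermQuery(Term("contents", "lucene"))   # hypothetical field and terms
tq2 = TermQuery(Term("contents", "python"))
b = BooleanQuery.Builder()
b.add(BooleanClause(tq1, BooleanClause.Occur.SHOULD))
b.add(BooleanClause(tq2, BooleanClause.Occur.SHOULD))
b.setMinimumNumberShouldMatch(1)  # with only SHOULD clauses this matches the default behaviour
q = b.build()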
Example #4
    def testParenthesisMust2(self):

        q3 = BooleanQuery()
        q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q4 = BooleanQuery()
        q4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        q4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        q2 = BooleanQuery()
        q2.add(q3, BooleanClause.Occur.SHOULD)
        q2.add(q4, BooleanClause.Occur.MUST)
        self.assertEqual(1, self.search(q2))
Example #5
    def testParenthesisMust2(self):

        b3 = BooleanQuery.Builder()
        b3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
        b3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
        q3 = b3.build()
        b4 = BooleanQuery.Builder()
        b4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
        b4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
        q4 = b4.build()
        b2 = BooleanQuery.Builder()
        b2.add(q3, BooleanClause.Occur.SHOULD)
        b2.add(q4, BooleanClause.Occur.MUST)
        q2 = b2.build()
        self.assertEqual(1, self.search(q2))
Example #6
 def query_section(self, section):
     searcher = self._searcher
     query_doc = RegexpQuery(Term('id', self._id + '\\..+'))
     query_section = TermQuery(Term('section', section))
     query = BooleanQuery.Builder()
     bc1 = BooleanClause(query_doc, BooleanClause.Occur.MUST)
     bc2 = BooleanClause(query_section, BooleanClause.Occur.MUST)
     query = query.add(bc1).add(bc2).build()
     top_docs = searcher.search(query, 1000000)
     hits = top_docs.scoreDocs
     res_dict = {}
     for hit in hits:
         doc = searcher.doc(hit.doc)
         id = doc.get('id')
         document = doc.get('document')
         section = doc.get('section')
         author = doc.get('author')
         dynasty = doc.get('dynasty')
         type = doc.get('type')
         text = doc.get('text')
         color = doc.get('color')
         area = doc.get('area')
         zhujie = doc.get('zhujie')
         detail = doc.get('detail')
         res_dict[id] = {}
         if document:
             res_dict[id]['document'] = document
         if section:
             res_dict[id]['section'] = section
         if author:
             res_dict[id]['author'] = author
         if dynasty:
             res_dict[id]['dynasty'] = dynasty
         if type:
             res_dict[id]['type'] = type
         if text:
             res_dict[id]['text'] = text
         if color:
             res_dict[id]['color'] = color
         if area:
             res_dict[id]['area'] = area
         if zhujie:
             res_dict[id]['zhujie'] = zhujie
         if detail:
             res_dict[id]['detail'] = detail
     res_dict[self._id] = {'document': section}
     self._resDict = res_dict
     return self
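In Example #6 the RegexpQuery on the id field selects the child entries of self._id (ids of the form "<id>.<something>") and the TermQuery restricts them to one section; both are combined as MUST clauses. Note that a Lucene RegexpQuery matches the entire term value, as if the pattern were anchored at both ends; a small sketch with a hypothetical id value:

from org.apache.lucene.index import Term
from org.apache.lucene.search import RegexpQuery

# matches terms such as "12.3" or "12.4.1", but not "12" itself (hypothetical id)
q = RegexpQuery(Term('id', '12\\..+'))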
Example #7
    def getBooleanQuery(self, clauses):

        extra_query = TermQuery(Term("all", "extra_clause"))
        extra_clause = BooleanClause(extra_query, BooleanClause.Occur.SHOULD)
        clauses.add(extra_clause)

        return super(BooleanTestMixin, self).getBooleanQuery(clauses)
Example #8
    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
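Example #8 turns a sentence into a bag-of-words query: every non-stopword token (sw is an external stop-word set) becomes a SHOULD TermQuery, and scoring is switched to BM25 with project-specific Config.k1 / Config.b parameters. A minimal sketch of the similarity setup, assuming an already-open Directory object index_dir; Lucene's own defaults are k1=1.2 and b=0.75:

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity

reader = DirectoryReader.open(index_dir)           # index_dir: an open Directory (assumption)
searcher = IndexSearcher(reader)
searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # Lucene's default BM25 parameters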
Example #9
 def searchAncient(self, field):
     indexDir = SimpleFSDirectory(Paths.get(self._dir))
     sear = IndexSearcher(DirectoryReader.open(indexDir))
     bq = BooleanQuery.Builder()
     q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
     bc = BooleanClause(q, BooleanClause.Occur.MUST)
     bq.add(bc)
     search_fields = self._fields
     for i in search_fields:
         if i == 'section' or i == 'document':
             continue
         queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[i]))
         bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
         bq.add(bc)
     query = bq.build()
     hits = sear.search(query, 9999)
     for hit in hits.scoreDocs:
         doc = sear.doc(hit.doc)
         res = doc.get(field)
         id = doc.get('id')
         detail = get_detail(doc)
         zhujie = detail['zhujie']
         if detail['detail'] and 'detail' in detail['detail'].keys():
             detail['detail'] = detail['detail']['detail']
         detail.pop('zhujie')
         detail.pop('text')
         detail.pop('type')
         detail = json.dumps(detail)
         self._doc[id] = res
         if doc_hit(res, self._words):
             f = key_filter(self._words, self._re, res)
             if f:
                 if 'section' in search_fields.keys():
                     if not search_upper_title_filter(id, sear, search_fields['section'], 0):
                         continue
                 if 'document' in search_fields.keys():
                     if not search_upper_title_filter(id, sear, search_fields['document'], 1):
                         continue
                 self._match.append(f)
                 self._resultSentencesList.append((id, res, detail, zhujie))
                 print(res)
                 print(self._match)
     return self
Example #10
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    #As of now, deletion of documents is supported only based on indexed keys.
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 000
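Example #10 uses the legacy (Lucene 4.x-era) API: File-based directories, Version.LUCENE_CURRENT analyzers, and a mutable BooleanQuery handed to IndexWriter.deleteDocuments, which removes every document matching the query. A minimal sketch of the delete-by-query idiom under those same assumptions (writer is an open IndexWriter, the key and value are hypothetical):

from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, TermQuery

query = BooleanQuery()
query.add(BooleanClause(TermQuery(Term("id", "42")), BooleanClause.Occur.MUST))
writer.deleteDocuments(query)   # deletes all documents matching the query
writer.commit()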
Example #11
    def search(self, field):
        s = self._search
        u = self._userQuery
        zh_to_hant_dict = self._zh_to_hant_dict
        info = u.getFlagsInfo()
        flags_list = u.getFlagsList()
        sq_list = []
        word_index_list = []
        index_count = 0
        for flag in flags_list:
            if flag["type"] == "word":
                word_index_list.append(index_count)
                if len(flag["content"]) == 1:
                    if flag["content"][0] in zh_to_hant_dict:
                        stq_list = [
                            SpanTermQuery(Term(field, flag["content"][0]))
                        ]
                        for hant in zh_to_hant_dict[flag["content"][0]]:
                            stq_list.append(SpanTermQuery(Term(field, hant)))
                        sq_list.append(SpanOrQuery(stq_list))
                    else:
                        sq_list.append(
                            SpanTermQuery(Term(field, flag["content"][0])))
                else:
                    snq_list = []
                    for w in flag["content"]:
                        if w in zh_to_hant_dict:
                            stq_list = [SpanTermQuery(Term(field, w))]
                            for hant in zh_to_hant_dict[w]:
                                stq_list.append(
                                    SpanTermQuery(Term(field, hant)))
                            snq_list.append(SpanOrQuery(stq_list))
                        else:
                            snq_list.append(SpanTermQuery(Term(field, w)))
                    sq_list.append(SpanNearQuery(snq_list, 0, True))
            else:
                sq_list.append({
                    "op": info[flag["content"]]["op"],
                    "num": info[flag["content"]]["num"]
                })
            index_count += 1
        q = None
        count = 0
        for index in word_index_list:
            if count == 0:
                q = sq_list[index]
                count += 1
            else:
                if not isinstance(sq_list[index - 1], dict):
                    q = SpanNearQuery([q, sq_list[index]], 0, True)
                else:
                    q = SpanNearQuery([q, sq_list[index]],
                                      sq_list[index - 1]["num"][-1], True)
        query = q
        # filter conditions
        filters = u.getFields()
        bq = BooleanQuery.Builder()
        bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
        for key in filters.keys():
            cur_reg = '('
            for ft in filters[key]:
                cur_reg += ft + '|'
            cur_reg = cur_reg[0:-1] + ')'
            rq = RegexpQuery(Term(key, cur_reg))
            bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
        query = bq.build()
        top_docs = s.search(query, 9999)
        self._cur_field = field

        reg = get_test_reg(flags_list, info, zh_to_hant_dict)
        doc_id_list = []
        hits = top_docs.scoreDocs
        for hit in hits:
            doc = s.doc(hit.doc)
            text = doc.get("text")
            match_res = re.search(reg, text)
            if match_res:
                doc_id_list.append(hit.doc)
        self._res = doc_id_list
        self._reg = reg
        return self
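Examples #11 and #14 build positional queries: each word becomes a SpanTermQuery, simplified/traditional character variants (zh_to_hant_dict) are merged with SpanOrQuery, and adjacency or bounded distance is expressed with SpanNearQuery(clauses, slop, inOrder). A minimal sketch of that composition, with a hypothetical field and characters:

from org.apache.lucene.index import Term
from org.apache.lucene.search.spans import SpanNearQuery, SpanOrQuery, SpanTermQuery

# either character variant in the first position, immediately followed by the second character
first = SpanOrQuery([SpanTermQuery(Term("text", "国")), SpanTermQuery(Term("text", "國"))])
second = SpanTermQuery(Term("text", "家"))
query = SpanNearQuery([first, second], 0, True)   # slop=0, in order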
Example #12
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
from org.apache.lucene.index import DirectoryReader, Term

input_q = input().replace('ی', 'ي')
lucene.initVM()
index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

directory = SimpleFSDirectory(index_path)
searcher = IndexSearcher(DirectoryReader.open(directory))

query_builder = BooleanQuery.Builder()
for q_word in input_q.split(' '):
    qtq = TermQuery(Term(question_field, q_word))
    query_builder\
        .add(BooleanClause(qtq, BooleanClause.Occur.SHOULD))
query = query_builder.build()
top_n = 5
scoreDocs = searcher.search(query, top_n).scoreDocs
print('found nums: ', len(scoreDocs))
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print('Best Match: ', doc.get(question_field), '\n')
    print('Answer: ', doc.get(answer_field))
    print('---------------------\n')
Example #13
    def GET(self, query):
        data_input = web.input()
        page = 0
        if "page" in data_input:
            page = int(data_input["page"])
        render = web.template.render('templates/')
        anses = []
        num_pages = 0
        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            s = Search(es).index('book-index').doc_type('book').query(
                Q('match', title=query.strip())
                | Q('match', description=query.strip())
                | Q("match", userreviews_userReview=query.strip()))
            ## Note: this slicing must be applied before s.execute(); the documentation does not make that clear.
            s = s[page * 10:page * 10 + 10]
            response = s.execute()
            # print 'total number of hits: ', response.hits.total
            num_pages = (response.hits.total / 10) + 1
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                anses.append({
                    'title': res.title,
                    'description': res.description.encode('utf-8'),
                    'url': res.url,
                    'cover': res.cover,
                    'authors': authors
                })
        else:
            # importing libraries for Lucene
            import lucene
            from java.io import File
            from org.apache.lucene.index import DirectoryReader, Term
            from org.apache.lucene.queryparser.classic import QueryParser
            from org.apache.lucene.store import SimpleFSDirectory
            from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
            from org.apache.lucene.util import Version
            from org.apache.lucene.analysis.standard import StandardAnalyzer
            import os

            # fields
            title_field = 'title'
            description_field = 'description'
            cover_field = 'cover'
            authors_name_field = 'authors_name'
            authors_url_field = 'authors_url'
            url_field = 'url'

            index_folder = '.'
            index_name = 'lucene.index'
            index_path = os.path.join(index_folder, index_name)

            lucene.initVM()
            version = Version.LUCENE_CURRENT
            directory = SimpleFSDirectory(File(index_path))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = StandardAnalyzer(version)

            title_tq = TermQuery(Term(title_field, query))
            desc_tq = TermQuery(Term(description_field, query))
            query = BooleanQuery()
            query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
            query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
            scoreDocs = searcher.search(query, 1000).scoreDocs
            num_pages = (len(scoreDocs) / 10) + 1

            for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
                doc = searcher.doc(scoreDoc.doc)
                authors = zip([doc.get(authors_name_field)],
                              [doc.get(authors_url_field)])
                anses.append({
                    'title':
                    doc.get(title_field),
                    'description':
                    doc.get(description_field).encode('utf-8'),
                    'url':
                    doc.get(url_field),
                    'cover':
                    doc.get(cover_field),
                    'authors':
                    authors
                })

        return render.index(anses, query, num_pages)
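In the Lucene branch of Example #13 the raw query string is wrapped directly in TermQuery objects, so it is matched against the indexed terms verbatim: multi-word or differently-cased input will not match fields that were analyzed with StandardAnalyzer at index time. A hedged alternative under the same legacy API would run the input through QueryParser with the same analyzer (sketch; version, analyzer, and the field names are as defined in the example):

from org.apache.lucene.queryparser.classic import QueryParser

# analyze the user input instead of matching it as a single raw term
title_q = QueryParser(version, title_field, analyzer).parse(QueryParser.escape(query))
desc_q = QueryParser(version, description_field, analyzer).parse(QueryParser.escape(query))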
Example #14
 def search(self, field):
     s = self._search
     u = self._userQuery
     z = self._zh_to_hant_dict
     keys = u.getKey()
     nums = u.getNum()
     word_list = u.getWordList()
     filters = u.getFields()
     # only filter conditions, no search words
     if len(word_list) == 0:
         query = None
     # simple term
     elif len(keys) == 0:
         query = simple_term_to_query(field, word_list[0], z)
     elif keys[0] == '#':
         query_left = simple_term_to_query(field, word_list[0], z)
         query_right = simple_term_to_query(field, word_list[1], z)
         query = SpanNearQuery([query_left, query_right], int(nums[0]),
                               False)
     elif keys[0] == '+' or keys[0] == '$':
         prev_query = simple_term_to_query(field, word_list[0], z)
         for i in range(len(keys)):
             cur_query = simple_term_to_query(field, word_list[i + 1], z)
             if keys[i] == '+':
                 span_list = [prev_query]
                 for j in range(int(nums[i])):
                     span = SpanMultiTermQueryWrapper(
                         RegexpQuery(Term(field, '.')))
                     span_list.append(span)
                 span_list.append(cur_query)
                 prev_query = SpanNearQuery(span_list, 0, True)
             else:
                 span_list = [prev_query, cur_query]
                 prev_query = SpanNearQuery(span_list, int(nums[i]), True)
         query = prev_query
     elif keys[0] == '-' or keys[0] == '~':
         query_left = simple_term_to_query(field, word_list[0], z)
         query_right = simple_term_to_query(field, word_list[1], z)
         if keys[0] == '-':
             n_q_list = [query_left, query_right]
         else:
             n_q_list = [query_right, query_left]
         n_query = SpanNearQuery(n_q_list, int(nums[0]) - 1, True)
         bq = BooleanQuery.Builder()
         bc1 = BooleanClause(query_left, BooleanClause.Occur.MUST)
         bc2 = BooleanClause(n_query, BooleanClause.Occur.MUST_NOT)
         query = bq.add(bc1).add(bc2).build()
     else:
         raise ValueError("Invalid search expression!")
     # filter conditions
     bq = BooleanQuery.Builder()
     if query:
         bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
     for key in filters.keys():
         cur_reg = '('
         for ft in filters[key]:
             cur_reg += ft + '|'
         cur_reg = cur_reg[0:-1] + ')'
         rq = RegexpQuery(Term(key, cur_reg))
         bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
     query = bq.build()
     self._res = s.search(query, 100000)
     self._cur_field = field
     return self
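The filter loop near the end of Examples #11 and #14 concatenates the allowed field values into a regular-expression alternation for RegexpQuery. An equivalent, slightly more compact way to build the same pattern (a sketch; it assumes the values in filters[key] contain no regex metacharacters):

from org.apache.lucene.index import Term
from org.apache.lucene.search import RegexpQuery

cur_reg = '(' + '|'.join(filters[key]) + ')'
rq = RegexpQuery(Term(key, cur_reg))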
Example #15
    def ancientSearch(self, field):
        sear = self._search
        fieldOnly = False
        # search on fields only
        if len(self._commandInfo.getWordList()) == 0:
            fieldOnly = True
            bq = BooleanQuery.Builder()
            fields = self._commandInfo.getFields()
            for key in fields:
                queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()

        elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
            bq = BooleanQuery.Builder()
            q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            bc = BooleanClause(q, BooleanClause.Occur.MUST)
            bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] == '#':
            bq = BooleanQuery.Builder()
            query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
            query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
            bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
            bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
            bq.add(bc1).add(bc2)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        elif self._commandInfo.getKey()[0] in ['$', '+']:
            bq = BooleanQuery.Builder()
            for w in self._commandInfo.getWordList():
                queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            for i in self._commandInfo.getFields():
                if i == 'section' or i == 'document':
                    continue
                queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
                bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
                bq.add(bc)
            query = bq.build()
        else:
            query = ''
        hits = sear.search(query, 9999)
        for hit in hits.scoreDocs:
            doc = sear.doc(hit.doc)
            res = doc.get(field)
            id = doc.get('id')
            detail = get_detail(doc)
            zhujie = detail['zhujie']
            if detail['detail'] and 'detail' in detail['detail'].keys():
                detail['detail'] = detail['detail']['detail']
            detail.pop('zhujie')
            detail.pop('text')
            detail.pop('type')
            detail = json.dumps(detail)
            if fieldOnly:
                if not doc.get("text").strip():
                    continue
                if id.count(".") == 2:
                    self._doc[id] = doc.get("text")
                    self._resultSentencesList.append((id, doc.get("text")))
                elif id.count(".") == 1:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                    hits = searcher.search(query, 1)

                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if res:
                            self._doc[id+".1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1", doc.get('text')))
                else:
                    searcher = self._search
                    query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                    hits = searcher.search(query, 1)
                    for hit in hits.scoreDocs:
                        doc = searcher.doc(hit.doc)
                        res = doc.get("text")
                        if not doc.get("text").strip():
                            continue
                        if res:
                            self._doc[id+".1.1"] = doc.get('text')
                            self._resultSentencesList.append((id + ".1.1", doc.get('text')))
            elif doc_hit(res, self._commandInfo):
                if key_filter(self._commandInfo, res):
                    if 'section' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                            continue
                    if 'document' in self._commandInfo.getFields().keys():
                        if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                            continue
                    self._doc[id] = res
                    self._resultSentencesList.append((id, res, detail, zhujie))
        return self
Example #16
def update(collection_name,
           tofind,
           update,
           commit=False,
           add_field_if_not_exists=True):
    #As of now, update is implemented as: search, modify the data in the JSON document, delete and re-write
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        #setting writer configurations
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105
    no_of_documents_modified = 0

    #finding the document to update
    #Scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        #primary_key_modified=False

        #delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map[collection_name]:
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

        #print query
        #modify the values
        for key, value in toupdate.items():
            #if such a key is not present, we either add that key to data and update it, or just ignore it (by default it is added)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        #this deletion check is intentionally placed here:
        #the update only proceeds if the modified data's primary keys do not already exist
        primary_key_update = False
        for key in toupdate.keys():
            if key in primary_keys_map[INDEX_DIR]:
                primary_key_update = True
                break
        if primary_key_update == True:
            query_search = BooleanQuery()
            for key in primary_keys_map[INDEX_DIR]:
                temp = QueryParser(Version.LUCENE_CURRENT, key,
                                   analyzer).parse(data[key])
                query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
            hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
            if len(hits) > 0:
                return 106
        writer.deleteDocuments(query)

        #add the newly modified document
        doc = Document()
        #index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO,
                              Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        #compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            temp = json.dumps(data)
            data_string = base64.b64encode(snappy.compress(temp))
        else:
            temp = json.dumps(data)
            data_string = base64.b64encode(temp)

        field = Field("$DATA$", data_string, Field.Store.YES,
                      Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs

        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106

    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106

    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
Example #17
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    #initializing return list
    return_list = []
    #check_list=[]
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    #separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    #filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            #non primary key filtering(without having to load all the primary key filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()

    if len(return_list) == 0:
        return None
    else:
        return return_list
Example #18
def store(collection_name, data, commit=False):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    print "started indexing input data......"

    #extracting values
    try:
        contents = json.loads(data)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    #checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map[INDEX_DIR]:
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass

    #setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    #fix this later.....FieldType not defined
    #field_type=FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    try:
        doc = Document()
        #index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, contents[primary_key],
                              Field.Store.NO, Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        #compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            # print "here"
            #data=data.encode('utf-8')
            data = base64.b64encode(snappy.compress(data))
            # print data
        else:
            data = base64.b64encode(data)

        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 000
    except:
        return 102
Example #19
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        #primary_key_modified=False

        #delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map[collection_name]:
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

        #print query
        #modify the values
        for key, value in toupdate.items():
            #if such a key is not present, we either add that key to data and update it, or just ignore it (by default it is added)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        #this deletion check is intentionally placed here:
        #the update only proceeds if the modified data's primary keys do not already exist
        primary_key_update = False
        for key in toupdate.keys():
            if key in primary_keys_map[INDEX_DIR]:
                primary_key_update = True
                break
        if primary_key_update == True:
            query_search = BooleanQuery()
            for key in primary_keys_map[INDEX_DIR]:
                temp = QueryParser(Version.LUCENE_CURRENT, key,
                                   analyzer).parse(data[key])
                query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
            hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
            if len(hits) > 0:
                return 106
        writer.deleteDocuments(query)

        #add the newly modified document
        doc = Document()
        #index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO,
                              Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        #compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            temp = json.dumps(data)
            data_string = base64.b64encode(snappy.compress(temp))
        else:
            temp = json.dumps(data)
            data_string = base64.b64encode(temp)

        field = Field("$DATA$", data_string, Field.Store.YES,
                      Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)