def get_or_query(self, queries):
    """Creates an OR Boolean query from multiple Lucene queries."""
    # empty boolean query with Similarity.coord() disabled
    bq = BooleanQuery(False)
    for q in queries:
        bq.add(q, BooleanClause.Occur.SHOULD)
    return bq
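For comparison, on newer Lucene versions (roughly 6.x and later) BooleanQuery is immutable and the coord factor is gone, so clauses are collected with BooleanQuery.Builder (the API already used by ancientSearch further down). A minimal sketch of the same OR-combination under that assumption; the function name is illustrative:

def get_or_query_builder(queries):
    # build an OR query with the immutable BooleanQuery API (Lucene >= 5.3/6.x)
    builder = BooleanQuery.Builder()
    for q in queries:
        builder.add(q, BooleanClause.Occur.SHOULD)
    return builder.build()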
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return
        print
        print "Searching for:", command

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return
        print
        print "Searching for:", command  # sample query: 朱莉与茱莉娅 (Julie & Julia)

        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if k == 'site':
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            # print 'path:', doc.get("path"), 'name:', doc.get("name"), 'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
def more_like_this(self, film, count=4):
    """
    Use query-by-document techniques to find related documents.

    :param film: film
    :param count: number of results
    :return: a list of related films
    """
    # Retrieve doc id of the given film
    film_query = TermQuery(Term('id', str(film.film_id)))
    results = self.searcher.search(film_query, 1)
    if results.totalHits != 1:
        return []

    # Use a MoreLikeThis query-by-document.
    # `reader` is assumed to be an open IndexReader over the same index
    # as self.searcher (it is not defined in this snippet).
    mlt = MoreLikeThis(reader)
    mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
    mlt.setMinTermFreq(0)
    mlt.setMinDocFreq(0)
    mlt.setAnalyzer(self.analyzer)
    mlt_query = mlt.like(results.scoreDocs[0].doc)

    # Filter out the original film
    filtered_query = BooleanQuery()
    filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
    filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)

    score_docs = self.searcher.search(filtered_query, count).scoreDocs
    return self._retrieve_in_order(score_docs)
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))
    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)
    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)
        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs
    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
def testFlat(self):
    q = BooleanQuery()
    q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    self.assertEqual(1, self.search(q))
def visitSCOPED_CLAUSE(self, node):
    clause = CqlVisitor.visitSCOPED_CLAUSE(self, node)
    if len(clause) == 1:
        return clause[0]
    lhs, operator, rhs = clause
    query = BooleanQuery()
    query.add(lhs, LHS_OCCUR[operator])
    query.add(rhs, RHS_OCCUR[operator])
    return query
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield search: a different query string for each field,
    not the same words across different fields.
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        Return sorted document+score by score.
        :param hists:
        """
        def doc_score(hists):
            """
            Return doc_name & score.
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur:
    #   MUST means the clause must occur,
    #   SHOULD means the clause should (but need not) occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def testCollectScoresWithNoResultAndBooleanQueryDoesntFailOnFakeScorerInAggregateScoreCollector(self):
    q = BooleanQuery()
    q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
    q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
    q = ComposedQuery('coreA', query=q)
    q.start = 0
    q.stop = 0
    q.setRankQuery(core='coreC', query=luceneQueryFromCql('S=true'))
    q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'), dict(core='coreC', key=KEY_PREFIX + 'C'))
    result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
    self.assertEquals(4, result.total)
    self.assertEquals([], result.hits)
def do_query(property, qstring, limit=10):
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    stream.reset()
    attr = stream.getAttribute(CharTermAttribute)
    while stream.incrementToken():
        term = attr.toString()
        termQuery = TermQuery(Term(property, term))
        query.add(termQuery, Occur.SHOULD)
    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]
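A brief usage sketch under the same assumptions as do_query (a module-level analyzer and searcher, and an indexed field; the field name 'title' is only an example):

docs = do_query('title', 'boolean queries in lucene', limit=5)
# With an analyzer such as StandardAnalyzer, the loop above ORs one TermQuery
# per analyzed token, roughly: title:boolean title:queries title:lucene
for d in docs:
    print d.get('title')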
def testOutOfOrderDocsScoringSort(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.
    """
    sorts = [Sort(SortField.FIELD_DOC), Sort()]
    tfcOptions = [[False, False, False], [False, False, True],
                  [False, True, False], [False, True, True],
                  [True, False, False], [True, False, True],
                  [True, True, False], [True, True, True]]
    actualTFCClasses = [
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector"
    ]

    bq = BooleanQuery()
    # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
    # which delegates to BS if there are no mandatory clauses.
    bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
    # return the clause instead of BQ.
    bq.setMinimumNumberShouldMatch(1)

    for sort in sorts:
        for tfcOption, actualTFCClass in izip(tfcOptions, actualTFCClasses):
            tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
                                           tfcOption[1], tfcOption[2], False)
            self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
            self.full.search(bq, tdc)
            tds = tdc.topDocs()
            sds = tds.scoreDocs
            self.assertEqual(10, len(sds))
def _create_query(self, fields):
    """
    Build query with Term, Phrase and Fuzzy clauses.
    :param fields: sequence of (field, text) tuples
    :return: query
    """
    query = BooleanQuery()
    for (field, text) in fields:
        if field.startswith("year"):
            start, end = text.split(",")
            numeric_query = NumericRangeQuery.newIntRange(
                'year', int(start), int(end), True, True)
            query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
        if field == 'title':
            spans = []
            for word in text.lower().split():
                spans.append(SpanTermQuery(Term(field, word)))
            query.add(BooleanClause(SpanNearQuery(spans, 2, True),
                                    BooleanClause.Occur.SHOULD))

    field_names, field_texts = zip(*fields)
    flags = [BooleanClause.Occur.MUST] * len(field_names)
    query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))

    fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
    fuzzy_field_texts = map(fuzzify, field_texts)
    fuzzy_query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, fuzzy_field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))

    boostQuery = FunctionQuery(
        LinearFloatFunction(
            PowFloatFunction(
                DoubleConstValueSource(0.0001),
                ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)
            ),
            -1.0, 1.0))
    query = CustomScoreQuery(query, boostQuery)
    return query
def testUnqualifiedTermFields(self):
    composer = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
                                   luceneSettings=LuceneSettings())
    ast = parseCql("value")
    result = composer.compose(ast)

    query = BooleanQuery()
    left = TermQuery(Term("field0", "value"))
    left.setBoost(0.2)
    query.add(left, BooleanClause.Occur.SHOULD)
    right = TermQuery(Term("field1", "value"))
    right.setBoost(2.0)
    query.add(right, BooleanClause.Occur.SHOULD)

    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def testParenthesisMust(self):
    q3 = BooleanQuery()
    q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q4 = BooleanQuery()
    q4.add(BooleanClause(self.c1, BooleanClause.Occur.MUST))
    q4.add(BooleanClause(self.c2, BooleanClause.Occur.MUST))
    q2 = BooleanQuery()
    q2.add(q3, BooleanClause.Occur.SHOULD)
    q2.add(q4, BooleanClause.Occur.SHOULD)
    self.assertEqual(1, self.search(q2))
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # setting writer configuration
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    # as of now, deletion of documents is supported only on indexed (primary) keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
def testBraces(self):
    self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')
    innerQuery = BooleanQuery()
    innerQuery.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
    innerQuery.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST)
    outerQuery = BooleanQuery()
    outerQuery.add(innerQuery, BooleanClause.Occur.SHOULD)
    outerQuery.add(TermQuery(Term('unqualified', 'mice')), BooleanClause.Occur.SHOULD)
    self.assertConversion(outerQuery, '(cats AND dogs) OR mice')
def perform_search(self, searchterm, results_per_page, page):
    # if there is a field in the searchterm
    """if ":" in searchterm:
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)
        query = parser.parse(searchterm)
    else:
        query = BooleanQuery()
        query_title = TermQuery(Term("title", searchterm))
        query_description = TermQuery(Term("description", searchterm))
        query_content = TermQuery(Term("content", searchterm))
        # BooleanClause.Occur.MUST for AND queries
        query.add(query_title, BooleanClause.Occur.SHOULD)
        query.add(query_description, BooleanClause.Occur.SHOULD)
        query.add(query_content, BooleanClause.Occur.SHOULD)"""

    # create a QueryParser for each field to be searched
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

    # put the fields together
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))

    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start

    # results to return
    results = []
    count = 0
    for scoreDoc in score_docs:
        # skip offset
        if count < results_per_page * page:
            count += 1
            continue
        count += 1

        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        results.append(table)

    return results, duration, count_results
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    # primary_key_modified = False

    # delete the appropriate document
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    # modify the values
    for key, value in toupdate.items():
        # if the key is not already present, we either add it to data or ignore it
        # (controlled by add_field_if_not_exists, which defaults to True)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value

    # this deletion statement has been intentionally placed here:
    # the update proceeds only if the modified data's primary keys do not already exist
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106

    writer.deleteDocuments(query)

    # add the newly modified document
    doc = Document()
    # index fields w.r.t. primary keys
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101

    # compress data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)

    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
def lucene_range_query_parse(query_string):
    '''Parse the user's range query string into something PyLucene can understand.'''
    query = BooleanQuery()
    queries_ = query_string.split(snapconf.RANGE_QUERY_DELIMITER)
    start = None
    end = None
    start_inclusive = True
    end_inclusive = True
    for query_tuple in queries_:
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        (col, op_, val) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        if not m or not col or col not in snapconf.TABIX_DBS or col not in snapconf.LUCENE_TYPES:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        (ltype, ptype, qtype) = snapconf.LUCENE_TYPES[col]
        rquery = None
        if ptype == str:
            rquery = TermQuery(qtype(col, str(val)))
        else:
            # assume operator == '='
            (start, end) = (ptype(val), ptype(val))
            if op == '>=':
                end = None
            if op == '<=':
                start = None
            if op == '<':
                start = None
                end_inclusive = False
            if op == '>':
                end = None
                start_inclusive = False
            rquery = qtype(col, start, end, start_inclusive, end_inclusive)
        query.add(rquery, BooleanClause.Occur.MUST)
        # sys.stderr.write("query + fields: %s %s\n" % (query, field))
    return query
def extract_phrase_query(self, q, field, slop=0, boost=5):
    phrases = re.findall(r'"([^"]*)"', q)
    if len(phrases) == 0:
        return None, q

    q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
    if self.verbose:
        print "Detected phrases: ", phrases

    bq = BooleanQuery()
    for phrase in phrases:
        # pq = PhraseQuery()
        # for term in filter(None, phrase.split(' ')):
        #     pq.add(Term(field, term))
        qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
        # parse phrase - this may or may not be desired
        # pq = qparser.parse(field + ':"' + phrase + '"')
        pq = qparser.parse('%s "%s"~%d^%.1f' % (phrase, phrase, slop, boost))
        # phrase queries have high priority
        bq.add(pq, BooleanClause.Occur.MUST)
        # bq.add(pq, BooleanClause.Occur.SHOULD)
    return bq, q
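As a worked example of the format string above (the values are only illustrative), each detected phrase contributes both its loose terms and the exact phrase with the given slop and boost:

phrase, slop, boost = "machine learning", 0, 5
print '%s "%s"~%d^%.1f' % (phrase, phrase, slop, boost)
# prints: machine learning "machine learning"~0^5.0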
def visitSEARCH_CLAUSE(self, node):
    # possible children:
    # CQL_QUERY
    # SEARCH_TERM
    # INDEX, RELATION, SEARCH_TERM
    firstChild = node.children[0].name
    results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
    if firstChild == 'SEARCH_TERM':
        (unqualifiedRhs,) = results
        if unqualifiedRhs == '*':
            return MatchAllDocsQuery()
        subQueries = []
        for fieldname, boost in self._unqualifiedTermFields:
            subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
            if isinstance(subQuery, PhraseQuery) and not self._fieldRegistry.phraseQueryPossible(fieldname):
                continue
            subQuery.setBoost(boost)
            subQueries.append(subQuery)
        if len(subQueries) == 1:
            query = subQueries[0]
        else:
            query = BooleanQuery()
            for subQuery in subQueries:
                query.add(subQuery, BooleanClause.Occur.SHOULD)
        return query
    elif firstChild == 'INDEX':
        (left, (relation, boost), right) = results
        if relation in ['==', 'exact'] or (relation == '=' and self._fieldRegistry.isUntokenized(left)):
            query = TermQuery(self._createTerm(left, right))
        elif relation == '=':
            query = self._termOrPhraseQuery(left, right)
        elif relation in ['<', '<=', '>=', '>']:
            query = self._termRangeQuery(left, relation, right)
        else:
            raise UnsupportedCQL("'%s' not supported for the field '%s'" % (relation, left))
        query.setBoost(boost)
        return query
    else:
        ((query,),) = results
        return query
def run(searcher_good, searcher_bad, analyzer):
    while True:
        # NOTE: the original snippet uses `command` without defining it; reading it
        # from user input, as the other run() helpers above do, is assumed here.
        command = unicode(raw_input("Query:"), 'UTF-8')
        if command == '':
            return
        command_dict = parseCommand(command)
        total_num = 20

        # These alternative SortFields decide the ranking order: by price (low to high),
        # by popularity (total number of comments), by positive-review rate, or by overall score.
        # s = SortField("price", SortField.Type.FLOAT, False)
        # s = SortField("total_comment", SortField.Type.FLOAT, True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        # s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # these two lines would restrict the price range
        # q = NumericRangeQuery.newFloatRange("price", 100.0, 200.0, True, True)
        # querys.add(q, BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num, so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total

        # "url" is the page URL, "img_url" is the image URL, "brand" is the brand
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            # explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print

        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                # explanation = searcher.explain(query, scoreDoc.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'score', doc.get("score")
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
def addDuplicatesQuery(self, query):
    not_duplicate = TermQuery(Term('duplicate', 'false'))
    booleanQuery = BooleanQuery()
    booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
    booleanQuery.add(query, BooleanClause.Occur.MUST)
    return booleanQuery
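A brief usage sketch of addDuplicatesQuery; the 'title' term and the searcher are illustrative assumptions, not part of the original:

user_query = TermQuery(Term('title', 'lucene'))  # hypothetical caller query
query = self.addDuplicatesQuery(user_query)      # both clauses MUST: not-duplicate AND user query
hits = searcher.search(query, 10).scoreDocs      # assumes an open IndexSearcher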
def search(primary_keys_map, to_be_compressed_input, collection_name, tofind, MAX_RESULTS=1000):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        print "********" + tofind
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    # initializing return list
    return_list = []
    # check_list=[]
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs

        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_input == True:
                data = snappy.uncompress(str(doc.get("$DATA$")))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()
    if len(return_list) == 0:
        return None
    else:
        return return_list
def ancientSearch(self, field):
    sear = self._search
    fieldOnly = False  # search on index fields only
    if len(self._commandInfo.getWordList()) == 0:
        fieldOnly = True
        bq = BooleanQuery.Builder()
        fields = self._commandInfo.getFields()
        for key in fields:
            queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        bq = BooleanQuery.Builder()
        q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        bc = BooleanClause(q, BooleanClause.Occur.MUST)
        bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] == '#':
        bq = BooleanQuery.Builder()
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        bq.add(bc1).add(bc2)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        query = ''

    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        if fieldOnly:
            if not doc.get("text").strip():
                continue
            if id.count(".") == 2:
                self._doc[id] = doc.get("text")
                self._resultSentencesList.append((id, doc.get("text")))
            elif id.count(".") == 1:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if res:
                        self._doc[id + ".1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1", doc.get('text')))
            else:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if not doc.get("text").strip():
                        continue
                    if res:
                        self._doc[id + ".1.1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1.1", doc.get('text')))
        elif doc_hit(res, self._commandInfo):
            if key_filter(self._commandInfo, res):
                if 'section' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                        continue
                if 'document' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                        continue
                self._doc[id] = res
                self._resultSentencesList.append((id, res, detail, zhujie))
    return self
def update(primary_keys_map, to_be_compressed_input, collection_name, tofind, update, commit=False, add_field_if_not_exists=True):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    # As of now the update is implemented as: search, modify the data in the JSON
    # document, delete the old document and re-write the new one.
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        # setting writer configuration
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105

    no_of_documents_modified = 0

    # finding the document to update
    # Scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        # primary_key_modified = False

        # delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

        # modify the values
        for key, value in toupdate.items():
            # if the key is not already present, we either add it to data or ignore it
            # (controlled by add_field_if_not_exists, which defaults to True)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        # this deletion statement has been intentionally placed here:
        # the update proceeds only if the modified data's primary keys do not already exist
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106

        writer.deleteDocuments(query)

        # add the newly modified document
        doc = Document()
        # index fields w.r.t. primary keys
        for primary_key in primary_keys_map:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
                doc.add(field)
            except:
                # primary_keys_map.pop(collection_name)
                return 101

        # compress data using snappy if compression is on
        if to_be_compressed_input == True:
            data_string = snappy.compress(str(json.dumps(data)))
        else:
            data_string = json.dumps(data)

        field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs

        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        writer.rollback()
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    writer.rollback()
                    return 106
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        writer.rollback()
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    writer.rollback()
                    return 106

    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
def store(primary_keys_map, to_be_compressed_input, collection_name, data, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    print "started indexing input data......"

    # extracting values
    try:
        contents = json.loads(data)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass

    # setting writer configuration
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)

    # fix this later..... FieldType not defined
    # field_type = FieldType()
    # field_type.setIndexed(True)
    # field_type.setStored(False)
    # field_type.setTokenized(False)

    try:
        doc = Document()
        # index fields w.r.t. primary keys
        for primary_key in primary_keys_map:
            try:
                field = Field(primary_key, contents[primary_key], Field.Store.NO, Field.Index.ANALYZED)
                doc.add(field)
            except:
                # primary_keys_map.pop(collection_name)
                return 101

        # compress data using snappy if compression is on
        if to_be_compressed_input == True:
            data = snappy.compress(data)

        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 0
    except:
        return 102
def runstext(command, cpage, meth):
    global vm_env, searcher, analyzer
    text = []
    print(command)
    if command == '':
        return
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    maxnum = len(scoreDocs)
    keywords = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents'])

    reslist = []
    maxnum = min(maxnum, 100)
    for i, scoreDoc in enumerate(scoreDocs[:maxnum]):
        doc = searcher.doc(scoreDoc.doc)
        date = doc.get("date")
        score = float(scoreDoc.score)
        reslist.append([doc, date, score])

    style = highlight.SimpleHTMLFormatter("<b><font color=\'red\'>", "</font></b>")
    high_seg = highlight.Highlighter(style, highlight.QueryScorer(keywords))
    high_seg.setTextFragmenter(highlight.SimpleFragmenter(50))

    if meth == "rel":
        reslist = sorted(reslist, key=lambda res: res[2], reverse=True)
    elif meth == "td":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=True)
    elif meth == "tu":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=False)
    print keywords

    start = (cpage - 1) * 10
    end = min(start + 10, maxnum)
    print start, end
    for i in reslist[start:end]:
        doc = i[0]
        score = i[2]
        date = str(getdate(i[1]))
        text_dic = {}
        # the strip() calls are intended to remove the site-name suffixes
        # "-直播吧zhibo8.cc" and "_新浪竞技风暴_新浪网" from the title
        text_dic['title'] = doc.get("title").strip('-直播吧zhibo8.cc').strip('_新浪竞技风暴_新浪网')
        text_dic['url'] = doc.get("url")
        tmpcontent = cleantxt(doc.get("contents"))
        keyword = high_seg.getBestFragment(analyzer, "contents", tmpcontent)
        text_dic['keyword'] = keyword
        text_dic['score'] = score
        text_dic['date'] = date
        text.append(text_dic)

    '''for i, scoreDoc in enumerate(scoreDocs):
        text_dic = {}
        doc = searcher.doc(scoreDoc.doc)
        text_dic['title'] = doc.get("title")
        text_dic['url'] = doc.get("url")
        keyword = high_seg.getBestFragment(analyzer, "contents", cleantxt(doc.get('contents')))
        text_dic['keyword'] = keyword
        text.append(text_dic)'''
    return text, maxnum
def testBooleanNotTermOutput(self):
    query = BooleanQuery()
    query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST_NOT)
    self.assertConversion(query, 'cats NOT dogs')
print("Index contains %d documents." % n_docs) def get_query_results(reader,query,n,field): searcher = IndexSearcher(reader) hits = searcher.search(query, n).scoreDocs print("Found %d hits:" % len(hits)) for i, hit in enumerate(hits): doc = searcher.doc(hit.doc) print("%d. %s" % (i + 1, doc.get(field))) #### part(a) query1a = TermQuery(Term("capital_html","greek")) query2a = TermQuery(Term("capital_html","roman")) query3a = TermQuery(Term("capital_html","persian")) boolean_query_a = BooleanQuery() boolean_query_a.add(query1a, BooleanClause.Occur.MUST) boolean_query_a.add(query2a, BooleanClause.Occur.MUST) boolean_query_a.add(query3a, BooleanClause.Occur.MUST_NOT) get_query_results(reader,boolean_query_a,n_docs,"capital") #Found 32 hits: #1. https://en.wikipedia.org/wiki/Sukhumi #2. https://en.wikipedia.org/wiki/Nicosia #3. https://en.wikipedia.org/wiki/Nicosia #4. https://en.wikipedia.org/wiki/Tiraspol #5. https://en.wikipedia.org/wiki/Tripoli #6. https://en.wikipedia.org/wiki/Tunis #7. https://en.wikipedia.org/wiki/Lisbon #8. https://en.wikipedia.org/wiki/Podgorica
def testEquality(self):
    bq1 = BooleanQuery()
    bq1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    bq1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    nested1 = BooleanQuery()
    nested1.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    nested1.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    bq1.add(nested1, BooleanClause.Occur.SHOULD)

    bq2 = BooleanQuery()
    bq2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    bq2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    nested2 = BooleanQuery()
    nested2.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    nested2.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    bq2.add(nested2, BooleanClause.Occur.SHOULD)

    self.assert_(bq1.equals(bq2))
def __init__(self):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    self.boolean_query = BooleanQuery()
    self.similarityOfSynopsis()
    self.similarityOfStoryLine()
class DocSimilarity(object):

    def __init__(self):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.boolean_query = BooleanQuery()
        self.similarityOfSynopsis()
        self.similarityOfStoryLine()

    def similarityOfSynopsis(self):
        directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                content = re.sub('[^A-Za-z0-9 ]+', '', content)
                while True:
                    try:
                        query = queryParser.parse(QueryParser.escape(content))
                    except Exception as e:
                        self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                        print self.boolean_query.maxClauseCount
                        continue
                    break
                topDocs = searcher.search(query, len(filenames))
                scoreDocs = topDocs.scoreDocs
                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    movie_id = int(doc.get(FIELD_PATH))
                    if movie_id <= major_movie.id:
                        continue
                    minor_movie = models.Movie.objects.get(pk=movie_id)
                    try:
                        similarity = models.Similarities.objects.filter(first_movie=major_movie,
                                                                        second_movie=minor_movie).first()
                        if not similarity:
                            similarity = models.Similarities.objects.filter(first_movie=minor_movie,
                                                                            second_movie=major_movie).first()
                        similarity.synopsis = scoreDoc.score
                        similarity.save()
                    except Exception as e:
                        print major_movie.id, minor_movie.id
                        raise e
                print u"{0} completed.".format(major_movie.id)

    def similarityOfStoryLine(self):
        directory = SimpleFSDirectory(File(settings.STORYLINE_INDEX))
        ireader = DirectoryReader.open(directory)
        searcher = IndexSearcher(ireader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        for root, dirnames, filenames in os.walk(settings.STORYLINE):
            filenames = [int(item) for item in filenames]
            filenames.sort()
            filenames = [str(item) for item in filenames]
            for filename in filenames:
                path = os.path.join(root, filename)
                major_movie = models.Movie.objects.get(pk=filename)
                with open(path, 'r') as moviedoc:
                    content = moviedoc.read().replace('\n', ' ')
                content = re.sub('[^A-Za-z0-9 ]+', '', content)
                query = queryParser.parse(QueryParser.escape(content))
                topDocs = searcher.search(query, len(filenames))
                scoreDocs = topDocs.scoreDocs
                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    movie_id = int(doc.get(FIELD_PATH))
                    if movie_id <= major_movie.id:
                        continue
                    minor_movie = models.Movie.objects.get(pk=movie_id)
                    try:
                        similarity = models.Similarities.objects.filter(first_movie=major_movie,
                                                                        second_movie=minor_movie).first()
                        if not similarity:
                            similarity = models.Similarities.objects.filter(first_movie=minor_movie,
                                                                            second_movie=major_movie).first()
                        similarity.storyline = scoreDoc.score
                        similarity.save()
                    except Exception as e:
                        print major_movie.id, minor_movie.id
                        raise e
                print u"{0} completed.".format(major_movie.id)