for fileName in os.listdir(corpus):
    # print fileName
    document = Document()
    article = os.path.join(corpus, fileName)
    content = open(article, 'r').read()
    document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)

print writer.numDocs()
writer.close()

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)

'''
answers = ['A', 'B', 'C', 'D']
submissionFile = open("luceneModel.csv", "w")
writer = csv.writer(submissionFile, delimiter=',')
writer.writerow(['id', 'correctAnswer'])
# 10 - 0.3844
# 9 - 0.386
# 5 - 0.3742
with open(trainingFilePath) as trainData:
    reader = csv.reader(trainData, delimiter="\t")
    header = 0
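A hedged sketch of how the commented-out scoring loop above might continue: score each multiple-choice answer by appending it to the question and keep the best-scoring option. The column layout of the training TSV (id, question, then one column per answer) is an assumption, not part of the original snippet.

with open(trainingFilePath) as trainData:
    reader = csv.reader(trainData, delimiter="\t")
    next(reader)  # skip the header row
    for row in reader:
        qid, question = row[0], row[1]
        scores = []
        for candidate in row[2:6]:  # assumed: answers A-D in columns 2-5
            q = queryParser.parse(QueryParser.escape(question + ' ' + candidate))
            hits = searcher.search(q, 10).scoreDocs
            scores.append(sum(hit.score for hit in hits))
        writer.writerow([qid, answers[scores.index(max(scores))]])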
    return ''

if not (TAGS_AND_GENRES or DESCR):
    raise Exception(
        'At least one between TAGS_AND_GENRES and DESCR should be True')

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))
if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())
analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)
tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)
# prevents the 1024-clause limit error for very long queries
BooleanQuery.setMaxClauseCount(2000000)

############################## Build user queries ##########################
ratings = ML1M('../datasets/ml-1m').ratings
movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
print("lm docs not in qrels: %s" % (len(lm_docs))) f = codecs.open('/home/fernando/MatchZoo/data/robust04/corpus_n_stem2.txt', 'w', encoding='utf8') for did in lm_docs: f.write("%s %s\n" % (did, lm_docs[did])) f.close() if __name__ == "__main__": lucene.initVM() index = DirectoryReader.open( SimpleFSDirectory(Paths.get(INDEX_BASE_DIR + INDEX_DIR))) searcher = IndexSearcher(index) analyzer = EnglishAnalyzer() qparser = QueryParser("contents", analyzer) qid_doc_list = {} qrel_dict = {} qrel_docs = set() rel_file = '/home/fernando/MatchZoo/data/robust04/cv_splits/test.5.txt' rel = read_relation(filename=rel_file) #rel.extend(read_relation(filename='/home/fernando/MatchZoo/data/robust04/relation_train.txt')) #rel.extend(read_relation(filename='/home/fernando/MatchZoo/data/robust04/relation_valid.txt')) print('Instance size: %s' % (len(rel)), end='\n') word_dict, _ = read_word_dict( "/home/fernando/MatchZoo/data/robust04/word_dict_new_n_stem_filtered_rob04_embed.txt" ) for label, d1, d2 in rel:
def perfume_search(command):
    query = QueryParser(Version.LUCENE_CURRENT, "name",
                        analyzer).parse(command)
    return query
print("Loading Lucene Index ...") lucene.initVM(vmargs=['-Djava.aws.headless=true']) analyzer = StandardAnalyzer() searchDir = NIOFSDirectory(Paths.get(args.index_path)) searcher = IndexSearcher(DirectoryReader.open(searchDir)) # try tuning the hyperparameters of bm25 for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]: for b in [0.5, 0.6, 0.7, 0.8, 0.9]: print(f"Grid search.... k1: {k1}; b: {b}") searcher.setSimilarity(BM25Similarity(k1, b)) parser = QueryParser('Context', analyzer) retrieved = [] print("Searching ...") for q in tqdm(questions): query = parser.parse(QueryParser.escape(q)) # print(q, "|", QueryParser.escape(q), "|", query) # import pdb; pdb.set_trace() scoreDocs = searcher.search(query, args.topk).scoreDocs topkDocs = [] for hit in scoreDocs: doc = searcher.doc(hit.doc) topkDocs.append({ "title": doc.get("Title"), "text": doc.get("Context") })
def find_documents(self, search_text):
    self.query = QueryParser("contents", self.analyzer).parse(search_text)
    self.hits = self.searcher.search(self.query, 50)
    return self.hits
def mid_search(mid):
    query = QueryParser(Version.LUCENE_CURRENT, "mid",
                        analyzer).parse(mid)
    return query
def run(searcher_good, searcher_bad, analyzer):
    while True:
        command = raw_input("Query: ")  # assumed prompt; the original references `command` without defining it
        command_dict = parseCommand(command)
        total_num = 20
        # These SortFields pick the ranking order: by price (low to high),
        # popularity (total comment count), positive-review rate, or overall score.
        # s = SortField("price", SortField.Type.FLOAT, False)
        # s = SortField("total_comment", SortField.Type.FLOAT, True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        # s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        # These two lines restrict the price range:
        # q = NumericRangeQuery.newFloatRange("price", 100.0, 200.0, True, True)
        # querys.add(q, BooleanClause.Occur.MUST)
        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num, so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total
        # "url" is the page URL, "img_url" is the image URL, "brand" is the brand
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            # explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                # explanation = searcher.explain(query, scoreDoc.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'socre', doc.get("socre")  # the index field is spelled "socre"
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
def runstext(command, cpage, meth):
    global vm_env, searcher, analyzer
    text = []
    print(command)
    if command == '':
        return
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    maxnum = len(scoreDocs)
    keywords = QueryParser(Version.LUCENE_CURRENT, "contents",
                           analyzer).parse(command_dict['contents'])
    reslist = []
    maxnum = min(maxnum, 100)
    for i, scoreDoc in enumerate(scoreDocs[:maxnum]):
        doc = searcher.doc(scoreDoc.doc)
        date = doc.get("date")
        score = float(scoreDoc.score)
        reslist.append([doc, date, score])
    style = highlight.SimpleHTMLFormatter("<b><font color=\'red\'>",
                                          "</font></b>")
    high_seg = highlight.Highlighter(style, highlight.QueryScorer(keywords))
    high_seg.setTextFragmenter(highlight.SimpleFragmenter(50))
    # sort by relevance, date descending, or date ascending
    if meth == "rel":
        reslist = sorted(reslist, key=lambda res: res[2], reverse=True)
    elif meth == "td":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=True)
    elif meth == "tu":
        reslist = sorted(reslist, key=lambda res: res[1], reverse=False)
    print keywords
    start = (cpage - 1) * 10
    end = min(start + 10, maxnum)
    print start, end
    for i in reslist[start:end]:
        doc = i[0]
        score = i[2]
        date = str(getdate(i[1]))
        text_dic = {}
        # strip site-name suffixes from the title
        text_dic['title'] = doc.get("title").strip('-直播吧zhibo8.cc').strip(
            '_新浪竞技风暴_新浪网')
        text_dic['url'] = doc.get("url")
        tmpcontent = cleantxt(doc.get("contents"))
        keyword = high_seg.getBestFragment(analyzer, "contents", tmpcontent)
        text_dic['keyword'] = keyword
        text_dic['score'] = score
        text_dic['date'] = date
        text.append(text_dic)
    '''
    for i, scoreDoc in enumerate(scoreDocs):
        text_dic = {}
        doc = searcher.doc(scoreDoc.doc)
        text_dic['title'] = doc.get("title")
        text_dic['url'] = doc.get("url")
        keyword = high_seg.getBestFragment(analyzer, "contents",
                                           cleantxt(doc.get('contents')))
        text_dic['keyword'] = keyword
        text.append(text_dic)
    '''
    return text, maxnum
def ancientSearch(self, field):
    sear = self._search
    fieldOnly = False  # search on fields only
    if len(self._commandInfo.getWordList()) == 0:
        fieldOnly = True
        bq = BooleanQuery.Builder()
        fields = self._commandInfo.getFields()
        for key in fields:
            queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        bq = BooleanQuery.Builder()
        q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        bc = BooleanClause(q, BooleanClause.Occur.MUST)
        bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] == '#':
        bq = BooleanQuery.Builder()
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        bq.add(bc1).add(bc2)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        query = ''
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        if fieldOnly:
            if not doc.get("text").strip():
                continue
            if id.count(".") == 2:
                self._doc[id] = doc.get("text")
                self._resultSentencesList.append((id, doc.get("text")))
            elif id.count(".") == 1:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if res:
                        self._doc[id + ".1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1", doc.get('text')))
            else:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if not doc.get("text").strip():
                        continue
                    if res:
                        self._doc[id + ".1.1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1.1", doc.get('text')))
        elif doc_hit(res, self._commandInfo):
            if key_filter(self._commandInfo, res):
                if 'section' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                        continue
                if 'document' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                        continue
                self._doc[id] = res
                self._resultSentencesList.append((id, res, detail, zhujie))
    return self
def parseQuery(myQuery):
    parser = QueryParser("", StopAnalyzer())
    parsedQuery = parser.parse(myQuery)
    myQueryTerms = parsedQuery.toString().split(" ")
    return myQueryTerms
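QueryParser treats characters such as +, -, !, parentheses, and quotes as syntax, so raw user text can raise a ParseException. A hedged usage sketch that escapes the input first, using the same QueryParser.escape pattern that appears in other snippets here (the input string is illustrative):

# escape Lucene special characters before parsing free text
terms = parseQuery(QueryParser.escape('C++ -fun (beta)'))
print(terms)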
from org.apache.lucene.search import PhraseQuery
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.index import Term
from rlqa.retriever.lucene_analyzer import MySimpleAnalyzer as MySimpleAnalyzerPython

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# analyzer = MySimpleAnalyzer()
analyzer = MySimpleAnalyzerPython()
shingle_analyzer = ShingleAnalyzerWrapper(analyzer)

sentence = "Stearn received many honours for his work."
print(sentence)
query = QueryParser("text", analyzer).parse(QueryParser.escape(sentence))
print(query)


def parse_query(analyzer, query):
    ts = analyzer.tokenStream("dummy", StringReader(sentence))
    termAtt = ts.getAttribute(CharTermAttribute.class_)
    ts.reset()
    tokens = []
    while ts.incrementToken():
        tokens.append(termAtt.toString())
    ts.end()
    ts.close()
    booleanQuery = BooleanQuery.Builder()
    for token in tokens:
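The original snippet is truncated at the loop header. A hedged completion sketch, assuming each analyzed token becomes an OR'd TermQuery on the "text" field (TermQuery would need importing from org.apache.lucene.search alongside the other search classes):

        # OR the analyzed tokens together as term clauses
        booleanQuery.add(TermQuery(Term("text", token)),
                         BooleanClause.Occur.SHOULD)
    return booleanQuery.build()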
def __init__(self, index_dir):
    self.index_dir = index_dir
    self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
    self.q_parser = QueryParser("", WhitespaceAnalyzer())
    self.commit_max = 500000
    self.__get_writer_searcher()
def button_search_clicked(self):
    t1 = time.time()
    title_substring = self.ui.lineEdit_title.text()
    mode = self.ui.comboBox.currentText()

    # SQLite/PostgreSQL
    if not self.ui.radioButton_Lucene.isChecked():
        if mode == 'Полное совпадение':  # exact match
            params = {'title': title_substring}
            query_string = """select * from movies where name ilike %(title)s"""
        if mode == 'Частичное совпадение':  # partial match
            params = {'title': title_substring}
            query_string = """select * from movies where name ilike '%%'||%(title)s||'%%'"""
        if mode == 'Частичное совпадение по словам':  # partial match by words
            params = {
                'title' + str(i): v
                for i, v in enumerate(title_substring.split())
            }
            query_string = """select * from movies where """ + ' or '.join([
                """name ilike '%%'||%({})s||'%%'""".format(t)
                for t in params.keys()
            ])
        if mode == 'Полное совпадение + Год':  # exact match + year
            year_substring = self.ui.lineEdit_year.text()
            year_substring = year_substring if year_substring != '' else None
            params = {'title': title_substring, 'year': year_substring}
            query_string = """select * from movies where name ilike %(title)s and year = %(year)s"""
        if mode == 'Частичное совпадение + Год':  # partial match + year
            year_substring = self.ui.lineEdit_year.text()
            year_substring = year_substring if year_substring != '' else None
            params = {'title': title_substring, 'year': year_substring}
            query_string = """select * from movies where name ilike '%%'||%(title)s||'%%' and year = %(year)s"""
        if mode == 'Частичное совпадение по словам + Год':  # partial match by words + year
            year_substring = self.ui.lineEdit_year.text()
            year_substring = year_substring if year_substring != '' else None
            params = {
                'title' + str(i): v
                for i, v in enumerate(title_substring.split())
            }
            query_string = ' or '.join([
                """name ilike '%%'||%({})s||'%%'""".format(t)
                for t in params.keys()
            ])
            params.update({'year': year_substring})
            query_string = 'select * from movies where ({}) and year = %(year)s'.format(
                query_string)

        # PostgreSQL connection
        if self.ui.radioButton_PostgreSQL.isChecked():
            con = psycopg2.connect(user='******',
                                   password=self.db_password,
                                   host='db.mirvoda.com',
                                   port='5454',
                                   dbname='information_retrieval')
            se = 'PostgreSQL'
        # SQLite connection
        else:
            con = sqlite3.connect(PATH + '/imdb.db')
            se = 'SQLite'
        df = pd.read_sql(query_string, con, params=params).head(LIMIT)
        con.close()
        df = df.fillna('').astype(str)
        df['year'] = df['year'].apply(lambda x: x.replace('.0', ''))
    # Lucene
    else:
        lucene.initVM()
        index_dir = SimpleFSDirectory(Paths.get('index'))
        reader = DirectoryReader.open(index_dir)
        searcher = IndexSearcher(reader)
        query_string = ''
        se = 'Lucene'
        if mode == 'Полное совпадение':
            query_string = 'name:"{}"'.format(title_substring)
        if mode == 'Частичное совпадение':
            query_string = 'name:{}'.format(title_substring)
        if mode == 'Частичное совпадение по словам':
            query_string = ' or '.join(
                ["""name:{}""".format(ss) for ss in title_substring.split()])
        if mode == 'Полное совпадение + Год':
            year_substring = self.ui.lineEdit_year.text()
            query_string = 'name:"{}" AND year:"{}"'.format(
                title_substring, year_substring)
        if mode == 'Частичное совпадение + Год':
            year_substring = self.ui.lineEdit_year.text()
            query_string = 'name:{} AND year:"{}"'.format(
                title_substring, year_substring)
        if mode == 'Частичное совпадение по словам + Год':
            year_substring = self.ui.lineEdit_year.text()
            query_string = ' or '.join(
                ["""name:{}""".format(ss) for ss in title_substring.split()])
            query_string = '({}) and year:"{}"'.format(query_string,
                                                       year_substring)
        query = QueryParser("defaultField",
                            StandardAnalyzer()).parse(query_string)
        hits = searcher.search(query, LIMIT)
        df = pd.DataFrame()
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            df = df.append(
                [[doc.get('id'), doc.get('name'), doc.get('year')]],
                ignore_index=True)

    if not df.empty:
        df.columns = ['id', 'name', 'year']
        pandas_model = PandasModel(df)
        self.tableView.setModel(pandas_model)
        self.tableView.horizontalHeader().setSectionResizeMode(1)
    t2 = time.time()
    self.statusBar().showMessage('Searched [{}] with {} for {} s'.format(
        query_string, se, str(t2 - t1)))
    logging.info('Searched [{}] with {} for {} s'.format(
        query_string, se, str(t2 - t1)))
    logging.info(df)
    logging.info('---------------------------------------------------------')
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    # parser = MultiFieldQueryParser(fields, analyzer)
    # parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')
    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        # q_lucene = MultiFieldQueryParser.parse(parser, querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        queryObj = Query_Object(query, mongoObj, w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in range(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d.get('title').strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            score = computeScore(queryObj, entityObj, mongoObj, w2vmodel)
            # score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue, larger score first
        rank = 0
        while not candidates.empty() and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
def superSearch(command, command_dict, urlclick):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File('index2.3'))
    print "run super search..."
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    command = ' '.join(jieba.cut_for_search(command))
    querys = BooleanQuery()
    if command:
        query = QueryParser(Version.LUCENE_CURRENT, 'nameforsearch',
                            analyzer).parse(command)
        querys.add(query, BooleanClause.Occur.SHOULD)
    for k, v in (command_dict[0]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        query.setBoost(0.1)
        querys.add(query, BooleanClause.Occur.MUST)
    for k, v in (command_dict[1]).items():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST_NOT)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command + ' ' + command_dict[0].get("ingredient", ''),
                       scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    # the comma stands in for a space here; it is turned back into a space below
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    if command:
        scorer = QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'name',
                        analyzer).parse(command))
        highlighters = [Highlighter(formatter_name, scorer)]
    else:
        highlighters = ['']
    if command_dict[0].get('ingredient'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                                analyzer).parse(
                                    command_dict[0]['ingredient']))))
    else:
        highlighters.append('')
    if command_dict[0].get('taste'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'taste',
                                analyzer).parse(command_dict[0]['taste']))))
    else:
        highlighters.append('')
    if command_dict[0].get('tech'):
        highlighters.append(
            Highlighter(
                formatter,
                QueryScorer(
                    QueryParser(Version.LUCENE_CURRENT, 'tech',
                                analyzer).parse(command_dict[0]['tech']))))
    else:
        highlighters.append('')
    fragmenter = SimpleFragmenter(1000)
    for h in highlighters:
        if h:
            h.setTextFragmenter(fragmenter)
    results = []
    for scoreDoc in scoreDocs:
        if (scoreDoc.score * len(scoreDocs) < 200
                and len(scoreDocs) > 200) or scoreDoc.score < 0.1:
            continue
        doc = searcher.doc(scoreDoc.doc)
        if command:
            highlighterContent = highlighters[0].getBestFragment(
                analyzer, 'name', doc.get('name'))
        else:
            highlighterContent = ''
        if highlighters[1]:
            highlighterContent2 = highlighters[1].getBestFragment(
                analyzer, 'ingredient', doc.get('ingredient'))
        else:
            highlighterContent2 = ''
        if highlighters[2]:
            highlighterContent3 = highlighters[2].getBestFragment(
                analyzer, 'taste', doc.get('taste'))
        else:
            highlighterContent3 = ''
        if highlighters[3]:
            highlighterContent4 = highlighters[3].getBestFragment(
                analyzer, 'tech', doc.get('tech'))
        else:
            highlighterContent4 = ''
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(',', '')
        else:
            highlighterContent2 = (doc.get('ingredient')).replace(',', '')
        if not highlighterContent3:
            highlighterContent3 = doc.get('taste')
        if not highlighterContent4:
            highlighterContent4 = doc.get('tech')
        results.append(
            (highlighterContent, doc.get('img'),
             doc.get('content').replace(' ', ''), highlighterContent2,
             highlighterContent3, highlighterContent4,
             doc.get('others').replace(',', ''), doc.get('url'),
             scoreDoc.score))
    # bubble the top-20 results so that, on near-equal scores, the more
    # frequently clicked URL ranks first
    for i in range(0, min(20, len(results)) - 1):
        flag = True
        for j in range(0, min(20, len(results)) - i - 1):
            if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                    results[j][7]] < urlclick[results[j + 1][7]]:
                flag = False
                results[j], results[j + 1] = results[j + 1], results[j]
        if flag:
            break
    return results, swxc_res
new_query = ''
for i in xrange(len(new_qt)):
    new_query += new_qt[i]

number_of_relevants = 0
for k in relevance[nq].keys():
    # print relevance[nq][k]
    number_of_relevants += relevance[nq][k]

# new_query = ''
# for i in xrange(len(important_words)):
#     new_query += important_words[i] + ' '

print "New query: ", new_query
query = QueryParser("contents", analyzer).parse(new_query)
scoreDocs = searcher.search(query, 10).scoreDocs
total_rel = 0
ap = 0.
nn = 0
for d in scoreDocs:
    nn += 1
    doc = searcher.doc(d.doc)
    docname = doc.get("name")
    rel = 0
    if docname.strip() in relevance[nq]:
        rel = relevance[nq][docname.strip()]
    total_rel += rel
    # accumulate precision@rank at each relevant hit (average-precision numerator)
    if rel == 1:
        ap += float(total_rel) / float(nn)
    print docname + " " + str(rel)
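Dividing the accumulated precision-at-rank sum by the number of relevant documents turns `ap` into average precision for this query; a minimal continuation sketch in the same Python 2 style:

# finish average precision for this query
if number_of_relevants > 0:
    ap = ap / float(number_of_relevants)
print "AP for query %s: %s" % (nq, str(ap))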
def run(searcher, analyzer, command, urlclick):
    if command == '':
        return []
    res = firstsearch(searcher, analyzer, command)
    command = ''.join(my_jieba.cut(command))
    command = " ".join(jieba.cut(command, cut_all=True))
    if len(res) > 0:
        scoreDocs = res
    else:
        querys = BooleanQuery()
        for k in tag:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(command)
            if k == 'taste' or k == 'tech':
                query.setBoost(0.5)
            querys.add(query, BooleanClause.Occur.SHOULD)
        scoreDocs = searcher.search(querys, 10000).scoreDocs
    swxc_res = findres(command, scoreDocs, searcher)
    formatter = SimpleHTMLFormatter("<span style='color:red'>", "</span>")
    formatter_name = SimpleHTMLFormatter("<span,style='color:red'>", "</span>")
    scorer = QueryScorer(
        QueryParser(Version.LUCENE_CURRENT, 'name', analyzer).parse(command))
    highlighter1 = Highlighter(formatter_name, scorer)
    highlighter2 = Highlighter(
        formatter_name,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'content',
                        analyzer).parse(command)))
    highlighter3 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'ingredient',
                        analyzer).parse(command)))
    highlighter4 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'taste',
                        analyzer).parse(command)))
    highlighter5 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'tech',
                        analyzer).parse(command)))
    highlighter6 = Highlighter(
        formatter,
        QueryScorer(
            QueryParser(Version.LUCENE_CURRENT, 'others',
                        analyzer).parse(command)))
    fragmenter = SimpleFragmenter(1000)
    highlighter1.setTextFragmenter(fragmenter)
    highlighter2.setTextFragmenter(fragmenter)
    highlighter3.setTextFragmenter(fragmenter)
    highlighter4.setTextFragmenter(fragmenter)
    highlighter5.setTextFragmenter(fragmenter)
    highlighter6.setTextFragmenter(fragmenter)
    results = []
    for scoreDoc in scoreDocs:
        if (len(scoreDocs) > 200 and
                len(scoreDocs) * scoreDoc.score < 2) or scoreDoc.score < 0.002:
            continue
        doc = searcher.doc(scoreDoc.doc)
        highlighterContent = highlighter1.getBestFragment(
            analyzer, 'name', doc.get('name'))
        highlighterContent2 = highlighter2.getBestFragment(
            analyzer, 'content', doc.get('content'))
        highlighterContent3 = highlighter3.getBestFragment(
            analyzer, 'ingredient', doc.get('ingredient'))
        highlighterContent4 = highlighter4.getBestFragment(
            analyzer, 'taste', doc.get('taste'))
        highlighterContent5 = highlighter5.getBestFragment(
            analyzer, 'tech', doc.get('tech'))
        highlighterContent6 = highlighter6.getBestFragment(
            analyzer, 'others', doc.get('others'))
        if highlighterContent:
            highlighterContent = highlighterContent.replace(' ', '')
            highlighterContent = highlighterContent.replace(',', ' ')
        else:
            highlighterContent = doc.get('name').replace(' ', '')
        if highlighterContent2:
            highlighterContent2 = highlighterContent2.replace(' ', '')
            highlighterContent2 = highlighterContent2.replace(',', ' ')
        else:
            highlighterContent2 = doc.get('content').replace(' ', '')
        if highlighterContent3:
            highlighterContent3 = highlighterContent3.replace(',', '')
        else:
            highlighterContent3 = (doc.get('ingredient')).replace(',', '')
        if not highlighterContent4:
            highlighterContent4 = doc.get('taste')
        if not highlighterContent5:
            highlighterContent5 = doc.get('tech')
        if highlighterContent6:
            highlighterContent6 = highlighterContent6.replace(',', '')
        else:
            highlighterContent6 = (doc.get('others')).replace(',', '')
        results.append(
            (highlighterContent, doc.get('img'), highlighterContent2,
             highlighterContent3, highlighterContent4, highlighterContent5,
             highlighterContent6, doc.get('url'), scoreDoc.score))
    # bubble the top-20 results so that, on near-equal scores, the more
    # frequently clicked URL ranks first
    for i in range(0, min(20, len(results)) - 1):
        flag = True
        for j in range(0, min(20, len(results)) - i - 1):
            if abs(results[j][8] - results[j + 1][8]) < 0.1 and urlclick[
                    results[j][7]] < urlclick[results[j + 1][7]]:
                flag = False
                results[j], results[j + 1] = results[j + 1], results[j]
        if flag:
            break
    return results, swxc_res
def former_search(former):
    query = QueryParser(Version.LUCENE_CURRENT, "former",
                        analyzer).parse(former)
    return query
def sentenceCountForQuery(self, query, field='text'):
    qp = QueryParser(Version.LUCENE_CURRENT, field,
                     self.analyzer).parse(query)
    collector = TotalHitCountCollector()
    self.searcher.search(qp, collector)
    return collector.getTotalHits()
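TotalHitCountCollector only counts matches and never materializes hits, so this is cheaper than asking for scoreDocs when only the count matters. A hedged usage sketch; the `corpus` instance name and the query string are hypothetical:

# count indexed sentences matching a phrase on the default 'text' field
n = corpus.sentenceCountForQuery('"climate change"')
print n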
def last_search(last):
    query = QueryParser(Version.LUCENE_CURRENT, "last",
                        analyzer).parse(last)
    return query
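Each of these one-field helpers (perfume_search, mid_search, former_search, last_search) returns a single parsed Query; a hedged sketch of AND-ing two of them with the same pre-5.x BooleanQuery API the other snippets here use. The argument values and the module-level `searcher` are assumptions:

# combine per-field helper queries with Occur.MUST, mirroring the
# BooleanQuery pattern used by the other search functions in this file
querys = BooleanQuery()
querys.add(former_search("IBM"), BooleanClause.Occur.MUST)      # hypothetical value
querys.add(last_search("Watson"), BooleanClause.Occur.MUST)     # hypothetical value
scoreDocs = searcher.search(querys, 50).scoreDocs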
def search(self, qstring):
    query = QueryParser("web", self.analyzer).parse(qstring)
    scoreDocs = self.searcher.search(query, 50).scoreDocs
    return [self.searcher.doc(score_doc.doc) for score_doc in scoreDocs]
def parse_query(query, fieldname):
    query_parser_obj = QueryParser(fieldname, StandardAnalyzer())
    query_parser = query_parser_obj.parse(query)
    return query_parser
def Run_Price(searcher_good, searcher_bad, analyzer, command, brand):
    while True:
        command_dict, low, high = parseCommand(command, brand)
        total_num = 20
        s = SortField("price", SortField.Type.FLOAT, False)
        # s = SortField("total_comment", SortField.Type.FLOAT, True)
        # s = SortField("good_rate", SortField.Type.FLOAT, True)
        # s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        # the price's range
        q = NumericRangeQuery.newFloatRange("price", low, high, True, True)
        querys.add(q, BooleanClause.Occur.MUST)
        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num, so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        # total is the number of matched documents
        res = []
        for scoreDoc_good in scoreDocs_good:
            unit = []
            doc = searcher_good.doc(scoreDoc_good.doc)
            title = doc.get('title').replace(' ', '')
            title = title[:18]
            total_comment = doc.get("total_comment")
            price = doc.get("price")
            socre = doc.get("socre")
            brand = doc.get("brand")
            good_rate = doc.get("good_rate")
            url = doc.get("url")
            img_url = doc.get("img_url")
            comment = doc.get("comment").split()
            unit.append(title)          # 0
            unit.append(total_comment)  # 1
            unit.append(price)          # 2
            unit.append(socre)          # 3
            unit.append(brand)          # 4
            unit.append(good_rate)      # 5
            unit.append(url)            # 6
            unit.append(img_url)        # 7
            unit.append(comment)        # 8
            res.append(unit)
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                unit = []
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                # explanation = searcher.explain(query, scoreDoc.doc)
                title = doc.get('title').replace(' ', '')
                title = title[:18]
                total_comment = doc.get("total_comment")
                price = doc.get("price")
                socre = doc.get("socre")
                brand = doc.get("brand")
                good_rate = doc.get("good_rate")
                url = doc.get("url")
                img_url = doc.get("img_url")
                comment = doc.get("comment").split()
                unit.append(title)
                unit.append(total_comment)
                unit.append(price)
                unit.append(socre)
                unit.append(brand)
                unit.append(good_rate)
                unit.append(url)
                unit.append(img_url)
                unit.append(comment)
                res.append(unit)
                if t > total_num - 1 - len(scoreDocs_good):
                    break
        res.append(brand)
        return res
def __init__(self, *args):
    super(HighlighterTestCase, self).__init__(*args)
    self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())
elif o == "--index": indexDir = a elif o == "--stats": stats = True class CustomTemplate(Template): delimiter = '#' template = CustomTemplate(format) fsDir = SimpleFSDirectory(File(indexDir)) searcher = IndexSearcher(DirectoryReader.open(fsDir)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer) parser.setDefaultOperator(QueryParser.Operator.AND) query = parser.parse(' '.join(args)) start = datetime.now() scoreDocs = searcher.search(query, 51).scoreDocs duration = datetime.now() - start if stats: print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) print template.substitute(table)
def more_like_this2(self, limit, item_doc, user_query):
    github_result = []
    if not item_doc:
        item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))
    query = ""
    if item_doc.doc:
        query += self.document_to_query(item_doc.doc)
    query += user_query
    query = remove_unified_stop_lists(query)
    print '................................................................................................'
    print "Project Searcher Unified Query :", query
    print '................................................................................................'
    write_search_log(
        "................................................................................................\n"
        + "Project Searcher Unified Query : " + str(query.encode('utf-8')) +
        "\n" +
        "................................................................................................\n")
    queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                              self.analyzer)
    if query:
        try:
            like_query = queryparser.parse(query)
            # 10 hits per answer
            hits = self.searcher.search(like_query, limit).scoreDocs
            temp = 1
            for i, hit in enumerate(hits):
                doc = self.searcher.doc(hit.doc)
                matched_terms = self.get_matched_keywords2(like_query, hit.doc)
                # print "Matched Terms : ", matched_terms
                print("File %s" % temp, doc.get("file"), "//",
                      doc.get("file_content"))  # , "line_numbers", doc.get("line_numbers"))
                write_search_log("File " + str(temp) + str(doc.get("file")) +
                                 "//" + str(doc.get("file_content")) + "\n")
                temp += 1
                file_path = doc.get("file")
                print 'file_path = ', file_path
                content = None
                try:
                    with open(file_path) as f:
                        content = f.read()
                except:
                    print "CAN'T OPEN THE FILE"
                    pass
                if content:
                    item = GithubResultItem(doc.get("file"), content,
                                            matched_terms, hit.score,
                                            item_doc,
                                            doc.get("line_numbers"), hit.doc)
                    # print item.score
                    github_result.append(item)
        except Exception as e:
            print "GitSearcher Error: %s" % e
            print(traceback.format_exc())
    # sorted(github_result, key=attrgetter())
    print 'github_result : ', github_result
    return github_result
lucene.initVM(maxheap='8192m')

q = sys.argv[1]
q = '\"adolf hitler\" \"national museum\"'  # hard-coded debug query overrides argv
field = 'content'
index_dir = os.path.join(os.path.expanduser('~'),
                         'github/entityqa/data/index_180629')
print(index_dir)
hitsPerPage = int(sys.argv[2])

reader = DirectoryReader.open(FSDirectory.open(Paths.get(index_dir)))
searcher = IndexSearcherE(reader)
analyzer = StandardAnalyzer()
qparser = QueryParser(field, analyzer)

query = qparser.parse(q)
print("Searching for:", query.toString(field))

topdocs = searcher.searchE(query, 5 * hitsPerPage, 'ent')
topdocs = TopDocsE.cast_(topdocs)
hitEntities = topdocs.scoreDocs
hitDocs = topdocs.entityWeightedDocs
numTotalHits = topdocs.totalHits
numTotalDocs = topdocs.totalDocs
print("{} total matching entities ({} docs)".format(numTotalHits, numTotalDocs))
# retriever.searcher.doc^(hitDocs[0]