def findLiteral(self, instanceUri, propertyURI):
    labels = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        labelOrTitleUris = "\"" + propertyURI + "\""
        queries = ["\"" + QueryParser.escape(instanceUri) + "\"",
                   QueryParser.escape(labelOrTitleUris)]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return labels
def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):
    # if prm.n_threads > 1:
    #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
    #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
    #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
    # else:
    #     out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
    #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
    #         terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
    out = []
    try:
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    except:
        print('Unexpected error when processing query:', str(q))
        print('Using query "dummy".')
        q = 'dummy'
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    c = OrderedDict()
    exp = self.searcher.explain(query, doc_int)
    c[1] = exp
    out.append(c)
    return out
def searchForClass(self, inst, pred):
    classUris = list()
    fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
    flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
    queries = ["\"" + QueryParser.escape(inst) + "\"",
               "\"" + QueryParser.escape(pred) + "\""]
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.info("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # ParseException(e):
        print e.message
        logging.error("Error")
    return classUris
def findSubClasses(self, classUri):
    # RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!!
    propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    subClasses = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
        queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return subClasses
def search_multithread_part(self, q):
    if not self.env.isCurrentThreadAttached():
        self.env.attachCurrentThread()
    if q in self.cache:
        return self.cache[q]
    else:
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        c = OrderedDict()
        hits = self.curr_searcher.search(query, self.max_cand)
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)
            if i < self.max_full_cand:
                word_idx = list(map(int, doc['word_idx'].split(' ')))
                word = doc['word'].split('<&>')
            else:
                word_idx = []
                word = []
            # c[int(doc['id'])] = [word_idx, word]
            c[int(doc['id'])] = [word_idx, word, hit.score]
        # print(c)
        return c
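# Note on search_multithread_part (added commentary, not from the source): in
# PyLucene, any Python thread that was not created by the JVM must call
# attachCurrentThread() before touching Lucene objects, which is why the
# method checks env.isCurrentThreadAttached() before doing anything else.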
def search_pair_score_multithread_part(self, q_doc_int):
    # print(q_doc_int)
    spl = q_doc_int.split('<|endoftext|>')
    q = spl[0]
    print(q)
    doc_int = int(spl[1])
    print(doc_int)
    if not self.env.isCurrentThreadAttached():
        self.env.attachCurrentThread()
    try:
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
    except:
        print('Unexpected error when processing query:', str(q))
        print('Using query "dummy".')
        q = 'dummy'
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
    c = OrderedDict()
    exp = self.curr_searcher.explain(query, doc_int)
    c[1] = exp
    return c
def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
    out = []
    for q in qs:
        if q in self.cache:
            out.append(self.cache[q])
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))
            c = OrderedDict()
            hits = curr_searcher.search(query, max_cand)
            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)
                if i < max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]
            out.append(c)
    return out
def document_to_query(self, doc):
    """ Given a document it transforms the source code related fields to a lucene query string """
    query = ""
    for field in ["description"]:  # the field here is fixed to "description"; appending field:term pairs at the end like this produces many duplicates..
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                # tokenize
                term = self.tokenize_string(StandardAnalyzer(), term)
                # CamelCase
                temp = []
                for t in term:
                    temp += self.camel_case_split(t)
                # stopwords
                temp_2 = []
                for t in temp:
                    if t not in english_stop_words:
                        temp_2.append(t)
                # stemming
                temp_3 = []
                for t in temp_2:
                    temp_3.append(stem(t))
                # stopwords
                temp_4 = []
                for t in temp_3:
                    if t not in english_stop_words:
                        temp_4.append(t)
                # query generation
                for term in temp_4:
                    query += "%s:%s " % (field, term)
    for field in ["typed_method_call", "methods", "used_classes",
                  "class_instance_creation", "methods_called"]:  # "extends", "annotations", "literals"
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())  # check here whether the unified query gets cleaned up...
                stoplist = ["java.lang.Object"]
                if term not in stoplist:
                    query += "%s:%s " % (field, term)
    if len(doc.getFields("code_hints")) > 0:
        hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
        hints_str = " ".join(hints)
        for term in hints:
            if term:
                term = QueryParser.escape(term)
                if term not in english_stop_words:
                    # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                    query += "code_hints:%s " % term
    return query
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
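# --- A minimal setup sketch (an assumption, not part of the original sources). ---
# The retrieval helpers above and below rely on module-level globals such as
# `version`, `analyzer`, `set_lucene_index`, and `hitsPerPage`. Under PyLucene
# 4.x (the version the get_candidates snippets below use) they would be created
# roughly like this; the index path is hypothetical.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
version = Version.LUCENE_4_10_1
analyzer = StandardAnalyzer(version)
set_lucene_index = {'ind': SimpleFSDirectory(File("lucene_index/"))}  # hypothetical path
hitsPerPage = 10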
def document_to_query(self, doc):
    """ Given a document it transforms the source code related fields to a lucene query string """
    query = ""
    for field in ["description"]:
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                # tokenize
                term = self.tokenize_string(StandardAnalyzer(), term)
                # CamelCase
                temp = []
                for t in term:
                    temp += self.camel_case_split(t)
                # stopwords
                temp_2 = []
                for t in temp:
                    if t not in english_stop_words:
                        temp_2.append(t)
                # stemming
                temp_3 = []
                for t in temp_2:
                    temp_3.append(stem(t))
                # stopwords
                temp_4 = []
                for t in temp_3:
                    if t not in english_stop_words:
                        temp_4.append(t)
                # query generation
                for term in temp_4:
                    query += "%s:%s " % (field, term)
    for field in ["typed_method_call", "methods", "used_classes",
                  "class_instance_creation", "methods_called",
                  "annotations", "literals"]:  # "used_classes", "literals", "extends"
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                java_stoplist = ["java.lang.Object", 'void', 'Global', 'boolean', 'String',
                                 'int', 'char', 'float', 'double', 'write', 'close', 'from',
                                 'println', 'StringBuilder', 'write', 'toString', 'close',
                                 'mkdir', 'exists']
                if term not in java_stoplist:
                    query += "%s:%s " % (field, term)
    if len(doc.getFields("code_hints")) > 0:
        hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
        hints_str = " ".join(hints)
        for term in hints:
            if term:
                term = QueryParser.escape(term)
                if term not in english_stop_words:
                    # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                    query += "code_hints:%s " % term
    return query
def document_to_query(self, doc):
    """ Given a document it transforms the source code related fields to a lucene query string """
    query = ""
    for field in ["typed_method_call", "methods", "extends", "used_classes",
                  "class_instance_creation", "methods_called", "annotations",
                  "literals"]:  # "used_classes", "literals"
        for val in doc.getFields(field):
            if val.stringValue().strip():
                term = QueryParser.escape(val.stringValue())
                # Filter out noisy terms
                stoplist = ["java.lang.Object"]
                if term not in stoplist:
                    # idf = self.get_IDF(field, term)
                    # print self.get_DF(field, term), term, field
                    # query += "%s:%s^%s " % (field, term, idf)
                    query += "%s:%s " % (field, term)
                    # print "term: %s idf: %s" % (term, self.get_minimum_IDF())
                    # print "%s:%s^%s" % (field, term, self.getIDF(field, term))
    # for hint in doc.getFields("code_hints"):
    #     tokens = utils.tokenize(hint.stringValue())
    #     for token in tokens:
    #         # print token
    #         token = QueryParser.escape(token)
    #         if token.strip():
    #             print "HINTS", token
    #             query += "code:%s^5.0 " % (token)
    if len(doc.getFields("code_hints")) > 0:
        hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
        hints_str = " ".join(hints)
        for term in hints:
            if term:
                term = QueryParser.escape(term)
                print "TERM", term
                # if term[0].isupper():
                #     query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                # elif "(" in term or "." in term or "#" in term:  # Heuristic to boost only code identifiers
                #     query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)
                # query += "code:%s^5.0 " % (term)
    return query
def findPropertyURIs(self, propertyType, max):
    uris = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
        query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if max != None:
            freq = max.intValue()
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
            print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return uris
def searchDocs(self, command, topK=30):
    if command == '':
        return
    # print("Searching for:", command)
    parser = PythonMultiFieldQueryParser(['name', 'contents'], self.analyzer)
    query = parser.parse(
        QueryParser.escape(command),
        ['name', 'contents'],
        [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
        self.analyzer)
    scoreDocs = self.searcher.search(query, topK).scoreDocs
    # print("%s total matching documents." % len(scoreDocs))
    docName = []
    docContents = []
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        docName.append(doc.get("docname"))
        docContents.append(doc.get("contents"))
        # print('docname:', doc.get("docname"), 'name:', doc.get("name"), 'content:', doc.get("contents"))
    return docName, docContents
def findDirectTypes(self, instanceUri, max):
    dTypes = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
        query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = 0
        if max != None:
            freq = max
        else:
            freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    logging.debug("there are " + str(len(dTypes)) + " unique direct types")
    return dTypes
def get_evidence(searcher, analyzer, claim):
    escaped_string = QueryParser.escape(claim)
    query = QueryParser("text", analyzer).parse(escaped_string)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    claim = nlp(claim)
    claim_evid = []
    line_no = []
    sim_score = []
    final_evidence = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        norm_doc = doc.get("text")
        norm_doc = nlp(norm_doc)
        val = claim.similarity(norm_doc)
        try:
            int(doc.get("Sno"))
            claim_evid.append(doc.get("keyterm"))
            line_no.append(int(doc.get("Sno")))
            sim_score.append(val)
        except ValueError:
            pass  # or whatever
    if len(sim_score) > 5:
        # keep the five most similar hits
        for val in range(0, 5):
            index = sim_score.index(max(sim_score))
            evid = claim_evid.pop(index)  # renamed from `claim`, which shadowed the spaCy doc above
            line = line_no.pop(index)
            final_evidence.append([evid, line])
            del sim_score[index]
    else:
        for i in range(0, len(sim_score)):  # was len(sim_score) - 1, which dropped the last hit
            final_evidence.append([claim_evid[i], int(line_no[i])])
    return final_evidence
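# Assumed context for get_evidence (not shown in the original snippet): `nlp`
# is a spaCy model with word vectors, since Doc.similarity() is only
# meaningful when vectors are available, e.g.:
#
#     import spacy
#     nlp = spacy.load("en_core_web_md")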
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))
    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)
    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)
        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs
    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
def get_candidates(qatp):
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def _search_singlethread(
        self, queries: List[str],
        doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
    queries_result = []
    for query in queries:
        try:
            query = QueryParser('text', self.analyzer).parse(
                QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))  # 'yellow' belongs inside colored()
            query = QueryParser('text', self.analyzer).parse('dummy')
        query_results = []
        hits = self.searcher.search(query, doc_max)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })
        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))
        queries_result.append(query_results)
    return queries_result
def code_as_text(self, query):
    new_query = " "
    for term in self.tokenize_string(self.porter_analyzer, query):
        if term:
            term = QueryParser.escape(term)
            new_query += "description:%s " % (term)
    return new_query
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src", 'r') as fso, \
            open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir + "/test/test.ast.src") as ft, \
            open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())  # raw string avoids invalid-escape warnings
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
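# Note on retriever (added commentary, not from the source):
# BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE) lifts Lucene's default
# 1024-clause limit so that very long whitespace-tokenized code queries do not
# raise TooManyClauses. The same limit is what the similarityOfSynopsis
# snippet further below works around by doubling maxClauseCount in a retry loop.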
def getDoc(self, doc, sentenseid):
    query = QueryParser.escape(doc + ' ' + str(sentenseid))
    query = QueryParser('docname', self.analyzer).parse(query)
    score = self.searcher.search(query, 1).scoreDocs
    doc = self.searcher.doc(score[0].doc)
    return doc.get('docname'), doc.get('contents')
def run(self, searcher, analyzer, rawQuery):
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(QueryParser.escape(rawQuery))  # escape special characters
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get("name")
def retrieve(self, term, sid):
    query = term + ' ' + str(sid)
    query = self.repalcer(query)
    query = QueryParser.escape(query)
    query = QueryParser('name-sid', self.analyzer).parse(query)
    score = self.searcher.search(query, 1).scoreDocs
    doc = self.searcher.doc(score[0].doc)
    return doc.get('name-sid'), doc.get('contents')
def code_as_text(self):
    """ Extends a query by matching query keywords in source code as text """
    query = " "
    for term in tokenize_string(self.porter_analyzer, self.query):
        if term:
            term = QueryParser.escape(term)
            query += "code:%s " % (term)
    return query
def pairSearch(self, pair, sim):
    """ Method that searches through documents using only content_section Field
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    title = pair[0].replace('_', ' ')
    content = pair[1]
    parser = QueryParser("content_section", self.analyzer)
    query1 = parser.parse(QueryParser.escape(title))
    query2 = parser.parse(QueryParser.escape(content))
    bq = BooleanQuery.Builder()
    bq.add(query1, BooleanClause.Occur.FILTER)
    bq.add(query2, BooleanClause.Occur.SHOULD)
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(bq.build(), 6).scoreDocs
    return hits
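# Design note on pairSearch (added commentary, not from the source):
# BooleanClause.Occur.FILTER requires the title query to match but excludes it
# from scoring, so the ranking of the six returned hits is driven by the
# SHOULD content clause alone. A hedged usage sketch, where `engine` and the
# pair values are illustrative:
#
#     from org.apache.lucene.search.similarities import BM25Similarity
#     hits = engine.pairSearch(("Barack_Obama", "44th president of the US"),
#                              BM25Similarity())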
def perfume_search(command, tb_data_line, f):
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(QueryParser.escape(command))
    scoreDocs = searcher.search(query, 1).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    contents = tb_data_line.split('\t')
    name = contents[0]
    url = contents[1]
    price = str(contents[2])
    post = str(contents[3])
    sales = str(contents[4][:-3])
    comments = str(contents[5])
    place = contents[6]
    shop = contents[7]
    img = contents[-1]
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)  # was searcher.doc(scoreDocs[0].doc); use the loop variable
        data = {}
        data.setdefault('name', name)
        data.setdefault('url', url)
        data.setdefault('price', price)
        data.setdefault('post', post)
        data.setdefault('sales', sales)
        data.setdefault('comments', comments)
        data.setdefault('place', place)
        data.setdefault('shop', shop)
        data.setdefault('img', img)
        data.setdefault('xssd_name', doc.get('name'))
        data.setdefault('perfumer', doc.get('perfumer'))
        data.setdefault('tune', doc.get('tune'))
        data.setdefault('xssd_url', doc.get('url'))
        data.setdefault('brand', doc.get('brand'))
        data.setdefault('rate', float(doc.get('rate')))  # key was 'rate:' with a stray colon
        data.setdefault('xssd_comments', doc.get('comment'))
        if doc.get('former_scents') != None:
            former = doc.get('former_scents')
            mid = doc.get('mid_scents')
            last = doc.get('last_scents')
            data.setdefault('former', former)
            data.setdefault('mid', mid)
            data.setdefault('last', last)
            scents = former + ' ' + mid + ' ' + last
            data.setdefault('scents', scents)
        else:
            data.setdefault('scents', doc.get('scents'))
        for k, v in data.items():
            if v == None:
                f.write('None' + '\t')
            else:
                f.write(str(v) + '\t')
        f.write('\n')
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
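# Note on the BM25 settings above (added commentary, not from the source):
# k1=1.5 controls term-frequency saturation and b=0.75 controls document-length
# normalization; these are the common Lucene defaults rather than values tuned
# for this corpus, which is what the inline "todo" flags.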
def simpleSearchID(self, query, sim):
    """ Method that searches through documents using only id_section Field
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    parser = QueryParser("id_section", self.analyzer)
    query = parser.parse(QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def explain(self, query, fields, doc):
    if not self.searcher:
        self.open_searcher()
    query = QueryParser.escape(query)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer)
    query = MultiFieldQueryParser.parse(parser, query)
    return self.searcher.explain(query, doc)
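# Hedged usage sketch for explain() above (the object name, field list, and
# doc id are illustrative, not from the source): Lucene returns an Explanation
# tree describing how the given document's score was computed.
#
#     exp = store.explain("json parser", ["title", "contents"], doc=42)
#     print(exp.toString())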
def document_to_query(self):
    """ Given a document it transforms the source code related fields to a lucene query string """
    query = ""
    for field in ["typed_method_call", "methods", "extends", "used_classes",
                  "class_instance_creation", "methods_called", "annotations",
                  "literals"]:  # "used_classes", "literals"
        for val in self.ast[field]:
            term = QueryParser.escape(val)
            query += "%s:%s " % (field, term)
    return query
def multiFieldsSearch(self, query, sim):
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(
        ["content_section", "title_section", "title_article"], self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def multiFieldsPairSearch(self, pair, sim):
    """ Method that searches through documents using content_section and title_article Fields
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    title = pair[0].replace('_', ' ')
    content = pair[1]
    parser = MultiFieldQueryParser(["content_section", "title_article"], self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title))
    query2 = MultiFieldQueryParser.parse(parser, QueryParser.escape(content))
    bq = BooleanQuery.Builder()
    bq.add(query1, BooleanClause.Occur.FILTER)
    bq.add(query2, BooleanClause.Occur.SHOULD)
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(bq.build(), 6).scoreDocs
    return hits
def retrieve_wiki(text_query, searcher, analyzer):
    txt = text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    except:
        # fall back to escaping special characters when the raw text fails to parse
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')
def similarityOfSynopsis(self):
    directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
    ireader = DirectoryReader.open(directory)
    searcher = IndexSearcher(ireader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
        filenames = [int(item) for item in filenames]
        filenames.sort()
        filenames = [str(item) for item in filenames]
        for filename in filenames:
            path = os.path.join(root, filename)
            major_movie = models.Movie.objects.get(pk=filename)
            with open(path, 'r') as moviedoc:
                content = moviedoc.read().replace('\n', ' ')
                content = re.sub('[^A-Za-z0-9 ]+', '', content)
            # retry, doubling the clause limit, until the long query parses
            while True:
                try:
                    query = queryParser.parse(QueryParser.escape(content))
                except Exception as e:
                    self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                    print self.boolean_query.maxClauseCount
                    continue
                break
            topDocs = searcher.search(query, len(filenames))
            scoreDocs = topDocs.scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                movie_id = int(doc.get(FIELD_PATH))
                if movie_id <= major_movie.id:
                    continue
                minor_movie = models.Movie.objects.get(pk=movie_id)
                try:
                    similarity = models.Similarities.objects.filter(
                        first_movie=major_movie, second_movie=minor_movie).first()
                    if not similarity:
                        similarity = models.Similarities.objects.filter(
                            first_movie=minor_movie, second_movie=major_movie).first()
                    similarity.synopsis = scoreDoc.score
                    similarity.save()
                except Exception as e:
                    print major_movie.id, minor_movie.id
                    raise e
            print u"{0} completed.".format(major_movie.id)
def search_scores(self, query, topk=10):
    query = self.repalcer(query)
    query = QueryParser.escape(query)
    query1 = QueryParser('name', self.analyzer).parse(query)
    query2 = QueryParser('name-contents', self.analyzer).parse(query)
    # print(query2)
    scores1 = self.searcher.search(query1, 30).scoreDocs
    scores2 = self.searcher.search(query2, 30).scoreDocs
    name1 = []
    name2 = []
    for score1 in scores1:
        doc1 = self.searcher.doc(score1.doc)
        t = doc1.get('name')
        if t not in name1:
            name1.append(t)
        if len(name1) > 1:
            break
    # print(name1)
    name2.append(name1[0])
    docnames = []
    doccontents = []
    s = []
    maxscore = scores2[0].score
    t_doc = self.searcher.doc(scores2[0].doc)
    for score2 in scores2:
        doc2 = self.searcher.doc(score2.doc)
        tname = doc2.get('name')
        # print(tname)
        # print(tname, name1)
        if score2.score == maxscore:
            docnames.append(doc2.get('name-sid'))
            doccontents.append(doc2.get('contents'))
            s.append(score2.score)
            # print(docnames)
        elif tname in name1 and score2.score > maxscore - 5:
            docnames.append(doc2.get('name-sid'))
            doccontents.append(doc2.get('contents'))
            s.append(score2.score)
            # print(docnames)
            # print(score2.score)
            # print(maxscore)
        if len(docnames) > 2:
            break
    if len(docnames) == 1 and scores1[0].score > maxscore - 10:
        docnames.append(self.searcher.doc(scores1[0].doc).get('name-sid'))
        doccontents.append(self.searcher.doc(scores1[0].doc).get('contents'))
        s.append(scores1[0].score)
    assert len(docnames) != 0
    return docnames, doccontents, s
def __BM25(self, searcher, rawQuery):
    '''retrieve documents with a single query'''
    if 'Standard' in self.indexFile:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # a standard analyzer with default stop words
    if 'Porter' in self.indexFile:
        analyzer = PorterStemmerAnalyzer()
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(QueryParser.escape(rawQuery))  # escape special characters
    scoreDocs = searcher.search(query, 100).scoreDocs
    docList = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        docList.append(doc.get("name"))
    return docList
def search_pair_score_singlethread(self, q, doc_int, searcher):
    out = []
    try:
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    except:
        print('Unexpected error when processing query:', str(q))
        print('Using query "dummy".')
        q = 'dummy'
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    c = OrderedDict()
    exp = searcher.explain(query, doc_int)
    c[1] = exp
    out.append(c)
    return out
def multiFieldsSearch(self, query, sim):
    """ Method that searches through documents using content_section and title_article Fields
    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(["content_section", "title_article"], self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def annotate_all_questions(analyzer, searcher):
    df = pd.concat(map(dm.ALLEN_AI_OBQA, list(OBQAType)))
    annotations = {}
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        for answer in row.answers:
            sent = row.question + " " + answer
            query_string = QueryParser.escape(sent)
            query = QueryParser("contents", analyzer).parse(query_string)
            hits = searcher.search(query, 75).scoreDocs
            closest = [
                searcher.doc(score_doc.doc).get("contents")
                for score_doc in hits
            ]  # noqa: E501
            annotations[sent] = closest
    pickle.dump(annotations, open("annotations.pkl", "wb"))
    print("Annotations written to annotations.pkl")
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            # score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via escape function
    if q_string and q_string.strip():  # when pre-processing answers, `none of the above` -> '' causes an error here
        # print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)
        # reader.close()
    return result  # text: also nodes
def findTopClasses(self):
    propertyURI = RDFS.SUBCLASSOF
    allClasses = list()
    topClasses = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
        query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
        # for (String classUri : allClasses) {
        indexus = 0
        while indexus < len(allClasses):
            classUri = allClasses[indexus]
            logging.info("Checking whether " + classUri + " is a top class.")
            # search inst and pred, retrieve class; if a class exists that means
            # it is not a top class, otherwise add it to topClasses
            classes = self.searchForClass(classUri, propertyURI)
            logging.info("top classes:" + str(len(classes)))
            if classes is not None and len(classes) > 0:  # was `!= None or len(...) > 0`, which was always true
                logging.info("This is not a top class...")
            else:
                topClasses.append(classUri)
                logging.info("Adding " + classUri + " to top classes.")
            indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return topClasses
def searchStemFirst(self, annotation):
    annotations = list()
    pocString = QueryParser.escape(annotation.getText())
    preparePocStringOriginal = "\"" + pocString + "\""
    preparePocStringLowercase = "\"" + pocString.lower() + "\""
    try:
        maxSynonyms = 0
        # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
        #     .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
        #     synonymMap, maxSynonyms);
        stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
        analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
        # search stemmed
        stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser)
        query = stemParser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        logging.info("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        allHits = list()  # keep allHits defined even when there are no stemmed hits (the original left it unbound)
        if freq > 0:
            result = self._searcher.search(query, freq)
            allHits = pyJava.JArray2List(result.scoreDocs)
        # if(stemHits.length == 0) {
        # search lowercased exact
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
        query = parser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            allHits = allHits + pyJava.JArray2List(result.scoreDocs)  # ArrayUtils.addAll(allHits, lowHits)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
        # }
        # if(allHits.length == 0) {
        # search exact
        exactParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
        query = exactParser.parse(preparePocStringLowercase)
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            allHits = allHits + pyJava.JArray2List(result.scoreDocs)  # ArrayUtils.addAll(allHits, result.scoreDocs)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
        # }
        # for (ScoreDoc hit : allHits) {
        indexus = 0
        while indexus < len(allHits):
            hit = allHits[indexus]
            doc = self._searcher.doc(hit.doc)
            self._searcher.explain(query, hit.doc)
            ann = Annotation()
            features = dict()
            features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
            features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
            features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
            features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
            features["score"] = hit.score
            ann.setFeatures(features)
            ann.setEndOffset(annotation.getEndOffset())
            ann.setStartOffset(annotation.getStartOffset())
            ann.setSyntaxTree(annotation.getSyntaxTree())
            ann.setText(annotation.getText())
            annotations.append(ann)
            indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return annotations
def testSearcher(self):
    query = QueryParser(Version.LUCENE_CURRENT, "class",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(
                            QueryParser.escape('http\\://www.mooney.net/geo#River'))
    print query
    hits = self._searcher.search(query, 50)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = self._searcher.doc(hit.doc)
        print doc.get("class").encode("utf-8")
def searchIndex(self, annotation, specialTreatment):
    if specialTreatment:
        return self.searchStemFirst(annotation)
    annotations = list()  # ArrayList[Annotation]()
    try:
        maxSynonyms = 0
        stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT)
        # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil
        #     .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)),
        #     synonymMap, maxSynonyms);
        analyser = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser)
        pocString = QueryParser.escape(annotation.getText())
        preparePocString = "\"" + pocString + "\""
        preparePocStringLowercase = "\"" + pocString.lower() + "\""
        query = parser.parse(preparePocString)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        hits = list()  # keep hits defined even when nothing matches (the original left it unbound)
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
        if freq <= 0:
            # search lowercased exact
            lowerCasedParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser)
            query = lowerCasedParser.parse(preparePocStringLowercase)
            # logging.info("Searching for: " + query.toString());
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.debug("For " + str(query) + " : " + str(result.totalHits))
        if len(hits) == 0 and preparePocStringLowercase.find(" ") < 0:  # find(), not index(): mirrors Java indexOf, which returns -1 instead of raising
            # search stemmed
            stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser)
            query = stemParser.parse(preparePocStringLowercase)
            # logging.info("Searching for: " + query.toString());
            result = self._searcher.search(query, 1)
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                hits = pyJava.JArray2List(result.scoreDocs)
                logging.info("For " + str(query) + " : " + str(result.totalHits))
        # for (ScoreDoc hit : hits) {
        indexus = 0
        while indexus < len(hits):
            hit = hits[indexus]
            doc = self._searcher.doc(hit.doc)
            self._searcher.explain(query, hit.doc)
            ann = Annotation()
            features = dict()
            features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB)
            features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB)
            features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB)
            features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT)
            features[FreyaConstants.SCORE] = hit.score
            ann.setFeatures(features)
            ann.setEndOffset(annotation.getEndOffset())
            ann.setStartOffset(annotation.getStartOffset())
            ann.setSyntaxTree(annotation.getSyntaxTree())
            ann.setText(annotation.getText())
            annotations.append(ann)
            indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return annotations
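# The FREyA-derived snippets above depend on pyJava.JArray2List to turn a Java
# ScoreDoc[] into a Python list. A minimal stand-in under that assumption (the
# real helper may differ; PyLucene's JArray wrappers already support len() and
# indexing) would be:
#
#     def JArray2List(java_array):
#         return [java_array[i] for i in range(len(java_array))]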