class SSQA_S_Searcher:

    def __init__(self, indexDir, analyzer):
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug("Search similarity func: {}".format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))  # already stripped above
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        count = 0
        out_list = []
        for scoreDoc in tqdm(scoreDocs):
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append(doc['content'])
            count += 1
        logger.info("Added {} sentences".format(count))
        return out_list

    def close(self):
        self.reader.close()
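# Several classes here install a custom similarity via setSimilarity(mySimilarity()),
# but mySimilarity itself is not shown. A minimal sketch of one plausible
# implementation, assuming it simply wraps BM25 with tuned parameters; the
# k1/b values below are illustrative, not the project's actual choice:
from org.apache.lucene.search.similarities import BM25Similarity

def mySimilarity():
    # BM25 with assumed parameters; swap in whatever similarity the project used
    return BM25Similarity(1.2, 0.75)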
def search_loop(index_dir, field="contents", explain=False):
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir))))
    analyzer = StandardAnalyzer()
    print("Hit enter with no input to quit.")
    while True:
        command = input("Query:")
        if command == '':
            return
        print("Searching for: %s" % command)
        query = QueryParser(field, analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            if field == 'web':
                print(
                    f'{doc.get("web")} | {doc.get("raw")} | {scoreDoc.score}')
            else:
                print('path:', doc.get("path"), 'name:', doc.get("name"))
            if explain:
                explanation = searcher.explain(query, scoreDoc.doc)
                print(explanation)
                print('------------')
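# Hypothetical entry point for the interactive loop above; the "index"
# directory name is an assumption, and the JVM must be started first.
if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    search_loop("index", field="contents", explain=True)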
class ParagSearcher:

    def __init__(self, Lid, db_path=config.DB_SSQA):
        lucene.initVM()
        self.db = SSQA_DB(db_path)
        lesson_str = self.db.get_lesson_str(Lid)
        parags = str_lesson2parags(lesson_str)

        # Index a Lesson
        myIndexer = _ChineseRamIndexer()
        myIndexer.index_lesson(parags)
        myIndexer.close()

        self.reader = DirectoryReader.open(myIndexer.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        logger.debug('search similarity:{}'.format(
            self.searcher.getSimilarity()))

    def __enter__(self):
        # added so the existing __exit__ makes this a usable context manager
        return self

    def __exit__(self, *args):
        self.close()

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        # query = QueryParser("content", self.analyzer).parse(
        #     QueryParser.escape(query_text))
        query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append((doc['pid'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.db.close()
        self.reader.close()
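# Hedged usage sketch for ParagSearcher: with the __enter__ added above it can
# be used as a context manager, so the DB and index reader are always closed.
# The lesson id and question below are placeholders.
def demo_parag_search(lid, question):
    with ParagSearcher(lid) as searcher:
        return searcher.search(question, top_n=3)  # [(pid, content, score), ...]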
class CosQASearcher:

    def __init__(self, lang):
        lucene.initVM()
        if lang == 'zh':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))  # already stripped above
        # query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append(
                (doc['did'], doc['title_en'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.reader.close()
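# Hedged usage sketch for CosQASearcher; the query string is a placeholder.
def demo_cosqa_search(question, lang='en'):
    searcher = CosQASearcher(lang)
    try:
        # each hit is (did, title_en, content, score)
        return searcher.search(question, top_n=5)
    finally:
        searcher.close()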
class SimpleSearcher(Searcher):

    def __init__(self, index_dir):
        self.searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir))))
        self.analyzer = StandardAnalyzer()

    def search(self, qstring):
        query = QueryParser("web", self.analyzer).parse(qstring)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        return [self.searcher.doc(score_doc.doc) for score_doc in scoreDocs]

    def explain(self, qstring):
        query = QueryParser("web", self.analyzer).parse(qstring)
        score_docs = self.searcher.search(query, 50).scoreDocs
        print(qstring)
        for score_doc in score_docs:
            doc = self.searcher.doc(score_doc.doc)
            print(f'{doc.get("web")} | {doc.get("raw")} | {score_doc.score}')
            explanation = self.searcher.explain(query, score_doc.doc)
            print(explanation)
            print('------------')
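# Hedged usage sketch for SimpleSearcher: the index is assumed to store the
# "web" and "raw" fields used above; the directory path is a placeholder.
def demo_simple_search(index_dir, qstring):
    s = SimpleSearcher(index_dir)
    for doc in s.search(qstring):
        print(doc.get("web"), '|', doc.get("raw"))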
class LuceneRetrieval(BaseRetrieval):
    """
        Encapsulates the Lucene retrieval engine
    """

    def __init__(self, index_path, method, logger=None,
                 use_default_similarity=False):
        self.index_path = index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser,
        # query and weight used
        if use_default_similarity:
            self.query_parser = QueryParser
        else:
            self.query_parser = FieldAgnosticQueryParser

        if use_default_similarity:
            similarity = DefaultSimilarity()
            self.useExplainQuery = False
        else:
            similarity = FieldAgnosticSimilarity()
            self.useExplainQuery = True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
        ## similarity.useCoord = False

        self.searcher.setSimilarity(similarity)
        self.method = method  # never used?
        self.logger = logger

    def runQueryViaExplain(self, query, max_results):
        """
            Really crappy solution to make sure that explanations and searches
            are the same while I fix Lucene
        """
        results = []
        # define the namedtuple type once, then instantiate per document
        # (the original re-created the type and set class attributes each loop)
        Hit = namedtuple("Hit", ["doc", "score"])
        for index in range(self.reader.numDocs()):
            explanation = self.searcher.explain(query, index)
            score = explanation.getValue()
            ## match = re.search(r"(.*?)\s=", explanation.toString(),
            ##                   re.IGNORECASE | re.DOTALL)
            ## if match:
            ##     score = float(match.group(1))
            ## heapq.heappush(results, hit)
            results.append(Hit(doc=index, score=score))

        results.sort(key=lambda x: x.score, reverse=True)
        if max_results < self.reader.numDocs():
            results = results[:max_results]
        return results

    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
            LOTS OF SWEET LUCENE
        """
        original_query = structured_query

        if not structured_query or len(structured_query) == 0:
            return []

        self.last_query = structured_query
        query_text = self.rewriteQuery(
            structured_query["structured_query"], ["text"])

        try:
            query = self.query_parser(
                lucene.Version.LUCENE_CURRENT, "text",
                self.analyzer).parse(query_text)
        except:
            print("Lucene exception:", sys.exc_info()[:2])
            return None

        structured_query["lucene_query"] = query_text

        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer
            # to give the same results
            hits = self.runQueryViaExplain(query, max_results)
        else:
            collector = TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs
        ## print("Found %d document(s) that matched query '%s':"
        ##       % (hits.totalHits, query))

        res = []
        ## if len(hits.scoreDocs) == 0:
        ##     print("Original query:", original_query)
        ##     print("Query:", query)
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata = json.loads(doc.get("metadata"))
            res.append((hit.score, metadata))
        return res

    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a
            StoredFormula instance from it

            :param query: Elastic DSL Query
            :param doc_id: id of document to run .explain() for
            :returns:
        """
        explanation = self.searcher.explain(query, doc_id)

        formula = StoredFormula()
        formula.fromLuceneExplanation(explanation)
        return formula
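# The commented-out heapq lines in runQueryViaExplain hint at a top-k
# alternative; a sketch using heapq.nlargest over the same Hit tuples instead
# of sorting every scored document (an optimization sketch, not project code):
import heapq

def top_k_hits(hits, k):
    # keep only the k best-scoring hits without a full sort
    return heapq.nlargest(k, hits, key=lambda h: h.score)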
class GitHubSearcher:

    def __init__(self, index_path, query=None):
        self.index_path = index_path
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {"code": self.porter_analyzer}
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        index = SimpleFSDirectory(indexDir)
        # the IndexReader used to be opened but never closed
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def get_DF(self, field, term):
        return self.reader.docFreq(Term(field, term))

    def get_IDF(self, field, term):
        from math import log10
        docF = self.reader.docFreq(Term(field, term))
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def get_minimum_IDF(self, docF=2):
        from math import log10
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a
        lucene query string """
        query = ""
        for field in ["typed_method_call", "methods", "used_classes",
                      "class_instance_creation", "methods_called",
                      "annotations", "literals"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # Filter out noisy terms
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        # idf = self.get_IDF(field, term)
                        # query += "%s:%s^%s " % (field, term, idf)
                        query += "%s:%s " % (field, term)

        # for hint in doc.getFields("code_hints"):
        #     tokens = utils.tokenize(hint.stringValue())
        #     for token in tokens:
        #         token = QueryParser.escape(token)
        #         if token.strip():
        #             print("HINTS", token)
        #             query += "code:%s^5.0 " % (token)

        if len(doc.getFields("code_hints")) > 0:
            hints = [hint.stringValue()
                     for hint in doc.getFields("code_hints")]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    print("TERM", term)
                    # if term[0].isupper():
                    #     query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                    # elif "(" in term or "." in term or "#" in term:
                    #     # Heuristic to boost only code identifiers
                    #     query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)
                    # query += "code:%s^5.0 " % (term)
        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []
        if isinstance(query, TermQuery):
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        elif isinstance(query, BooleanQuery):
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(),
                                         docid).isMatch():
                    matched_terms.append(
                        query_term.getQuery().getTerm().text())
        # print("Matched Terms: %s" % matched_terms)
        return matched_terms

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(
            query, doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self):
        """ Extends a query by matching query keywords in source code as text """
        query = " "
        for term in tokenize_string(self.porter_analyzer, self.query):
            if term:
                term = QueryParser.escape(term)
                query += "code:%s " % (term)
        return query

    def lexical_search(self):
        """ In case no term is matching with stackoverflow we perform a simple
        lexical search on GitHub """
        github_result = []
        # no StackOverflow item is involved in a purely lexical search;
        # the original referenced an undefined so_item here
        so_item = None
        query = self.code_as_text().strip()
        query = QueryParser(Version.LUCENE_CURRENT, "code",
                            self.analyzer).parse(query)
        hits = self.searcher.search(query, 10).scoreDocs
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            matched_terms = self.get_matched_keywords(query, hit.doc)
            item = GithubResultItem(doc.get("file"),
                                    decompress(doc.get("file_content")),
                                    matched_terms, hit.score, so_item,
                                    doc.get("line_numbers"), hit.doc)
            github_result.append(item)
        return github_result

    def more_like_this(self, so_items):
        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT,
                                      "typed_method_call", self.analyzer)
            query = ""
            if so_item.doc:
                query = self.document_to_query(so_item.doc)
            query += self.code_as_text()

            if query:
                print("-" * 30)
                print("Query: %s" % query)
                print("-" * 30)
                try:
                    like_query = queryparser.parse(query)
                    hits = self.searcher.search(like_query, 10).scoreDocs
                    for i, hit in enumerate(hits):
                        doc = self.searcher.doc(hit.doc)
                        matched_terms = self.get_matched_keywords2(
                            like_query, hit.doc)
                        item = GithubResultItem(
                            doc.get("file"),
                            decompress(doc.get("file_content")),
                            matched_terms, hit.score, so_item,
                            doc.get("line_numbers"), hit.doc)
                        github_result.append(item)
                        # print("%d. File: %s, Matched: %s, Score: %s"
                        #       % (i + 1, doc.get("file"), matched_terms, hit.score))
                except Exception as e:
                    print("Error: %s" % e)
        # print(Counter(files).most_common(5))
        return github_result

    def more_like_this2(self, so_items):
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        query = ""
        queryparser = QueryParser(Version.LUCENE_CURRENT,
                                  "typed_method_call", self.analyzer)
        # the loop below builds the augmented query
        for so_item in so_items:
            if so_item.doc:
                query += self.document_to_query(so_item.doc)
        query += self.code_as_text()

        github_result = []
        if query:
            print("-" * 50)
            print("Unified Query: %s" % query)
            print("-" * 50)
            try:
                # final conversion into a Lucene query
                like_query = queryparser.parse(query)
                # search the GitHub indices and keep the top 5 hits for like_query
                hits = self.searcher.search(like_query, 5).scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    print("Matched Terms : ", matched_terms)
                    print("file", doc.get("file"),
                          "file_content", doc.get("file_content"),
                          "line_numbers", doc.get("line_numbers"))
                    file_path = doc.get("file")
                    content = None
                    try:
                        # follow the project path and read the file into content
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    # if the file exists and has content, wrap it in a result item
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms, hit.score,
                                                so_item,
                                                doc.get("line_numbers"),
                                                hit.doc)
                        github_result.append(item)
            except Exception as e:
                print("GitSearcher: Error: %s" % e)
                print(traceback.format_exc())
        return github_result
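# Worked example of the smoothed IDF used by get_IDF() and get_minimum_IDF():
# with a 10,000-document index and a term occurring in 99 documents,
# idf = log10(10000 / (99 + 1)) + 1 = log10(100) + 1 = 3.0
from math import log10

def idf(n_docs, doc_freq):
    # same formula as GitHubSearcher.get_IDF
    return log10(n_docs / (doc_freq + 1)) + 1

assert abs(idf(10000, 99) - 3.0) < 1e-9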
class Searcher:

    def __init__(self, source, index_path):
        self.index_path = index_path
        self.source = source
        ast, source = parse(self.source, resolve=True, source=True)
        self.source = source
        self.ast = ast
        self.queryparser = QueryParser(Version.LUCENE_CURRENT,
                                       "typed_method_call", KeywordAnalyzer())
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def document_to_query(self):
        """ Given a document it transforms the source code related fields to a
        lucene query string """
        query = ""
        for field in ["typed_method_call", "methods", "extends",
                      "used_classes", "class_instance_creation",
                      "methods_called", "annotations", "literals"]:
            for val in self.ast[field]:
                term = QueryParser.escape(val)
                query += "%s:%s " % (field, term)
        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []
        if isinstance(query, TermQuery):
            # print(self.searcher.explain(query, docid))
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        else:
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(),
                                         docid).isMatch():
                    # print(self.searcher.explain(query_term.getQuery(), docid))
                    matched_terms.append(
                        query_term.getQuery().getTerm().text())
        # print("Matched Terms: %s" % matched_terms)
        return matched_terms

    def get_AST_from_Doc(self, doc):
        tree = {}
        tree["typed_method_call"] = [
            f.stringValue() for f in doc.getFields("typed_method_call")]
        tree["methods_called"] = [
            f.stringValue() for f in doc.getFields("methods_called")]
        tree["imports"] = [f.stringValue() for f in doc.getFields("imports")]
        tree["used_classes"] = [
            f.stringValue() for f in doc.getFields("used_classes")]
        # tree["var_type_map"] = eval(doc.getField("var_type_map").stringValue())
        # tree["unresolved_method_calls"] = [f.stringValue() for f in doc.getFields("unresolved_method_calls")]
        return tree

    def more_like_this(self):
        trees = []
        file_hash_process = set()
        query = self.document_to_query()
        if query:
            print("-" * 30)
            print("Query: %s" % query)
            print("-" * 30)
            try:
                like_query = self.queryparser.parse(query)
                hits = self.searcher.search(like_query, 10).scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords(
                        like_query, hit.doc)
                    file_path = doc.getField("file").stringValue()
                    # print("Matched Terms", matched_terms)
                    print("Path: ", file_path)
                    with open(file_path, "r") as f:
                        file_content = f.read()
                    file_hash = doc.getField("hash").stringValue()
                    # print("FILE", file_content)
                    # print("PARSE", parse(file_content, resolve=False))
                    if file_hash not in file_hash_process:
                        trees.append(parse(file_content, resolve=False))
                        file_hash_process.add(file_hash)
                    else:
                        print("Duplicate: ", file_path)
                    # trees.append(self.get_AST_from_Doc(doc))
            except Exception as e:
                print("Error: %s" % e)
        return trees
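# Hedged usage sketch for Searcher: feed it Java source text and an index path
# (both placeholders) and collect ASTs of similar indexed files.
def demo_more_like_this(source_path, index_path):
    with open(source_path) as f:
        source = f.read()
    searcher = Searcher(source, index_path)
    return searcher.more_like_this()  # list of parsed ASTs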
class QuestionLuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='6g', maxheap='6g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to',
                  prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder,
                      'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        # open the (possibly local) copy chosen above; the original opened
        # prm.index_folder even after copying to the local folder
        fsDir = MMapDirectory(Paths.get(self.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to',
                      prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term,
                          'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text
        return text_id, id_text

    # def add_doc(self, doc_id, title, txt, add_terms):
    def add_doc(self, doc_id, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab,
                                               prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")

        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

        c = OrderedDict()
        hits = self.curr_searcher.search(query, self.max_cand)

        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)
            if i < self.max_full_cand:
                word_idx = list(map(int, doc['word_idx'].split(' ')))
                word = doc['word'].split('<&>')
            else:
                word_idx = []
                word = []
            # c[int(doc['id'])] = [word_idx, word]
            c[int(doc['id'])] = [word_idx, word, hit.score]
        return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
                continue
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

            c = OrderedDict()
            hits = curr_searcher.search(query, max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)
                if i < max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            out.append(c)
        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None,
                       save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):
                for cand_id, term in zip(list(outt.keys())[:max_full_cand],
                                         list(termss.values())):
                    outt[cand_id] = term

        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out

    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):
        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        #     out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)
        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):
        out = []
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)
        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):
        self.curr_searcher = searcher
        # out = self.pool.map(self.search_pair_score_multithread_part, product(qs, doc_int))
        out = self.pool.map(self.search_pair_score_multithread_part,
                            qs_trailing_doc)
        return out

    def search_pair_score_multithread_part(self, q_doc_int):
        # each work item is "<query><|endoftext|><doc id>"
        spl = q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp
        return c
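# The operator-escaping dance above is repeated in several methods; a hedged
# helper capturing the same pattern (the name sanitize_query is an assumption):
def sanitize_query(q):
    # neutralize Lucene boolean operators, then escape remaining specials
    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
    return QueryParser.escape(q)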
class BenchSearcher:

    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        # collapse double spaces left by the substitutions
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a
        lucene query string """
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in ["typed_method_call", "methods", "used_classes",
                      "class_instance_creation", "methods_called",
                      "annotations", "literals"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder', 'write',
                        'toString', 'close', 'mkdir', 'exists'
                    ]
                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [hint.stringValue()
                     for hint in doc.getFields("code_hints")]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print("Including 'code_hints' from Doc_To_Query TERMs... //", term)
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(
            query, doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, score_logs_for_each, user_query, flag):
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT,
                                  "typed_method_call", self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(parsed_query,
                                                               hit.doc)
                    temp += 1
                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)
            except Exception as e:
                print("BenchSearcher Error: %s" % e)
                print(traceback.format_exc())

        return bench_result, score_logs_for_each
class LuceneAnnotator(object): #@Value("${org.freya.lucene.index.dir.search}") Resource luceneIndexDir; def __init__(self): logging.basicConfig(filename='../../freya/index/annotator.log', filemode='w', level=logging.DEBUG) def close(self): if self._reader != None: try: print "closing index Reader" except Exception as e:#IOException(e): print e.message logging.error("Error") finally: pass #private static final Log logger = LogFactory.getLog(LuceneAnnotator.class); def getIndex(self): return self._index def setIndex(self, index): self._index = index def testSearcher(self): query=QueryParser(Version.LUCENE_CURRENT, "class", StandardAnalyzer(Version.LUCENE_CURRENT)).parse(QueryParser.escape('http\://www.mooney.net/geo#River')) print query hits = self._searcher.search(query, 50) for hit in hits.scoreDocs: print hit.score, hit.doc, hit.toString() doc = self._searcher.doc(hit.doc) print doc.get("class").encode("utf-8") #public SynonymMap synonymMap; def init(self): try: print 'lucene', lucene.VERSION lucene.initVM(vmargs=['-Djava.awt.headless=true']) if not hasattr(self,'_index'): indexDir = "../../freya/index/actual" self._index = File(indexDir) if not hasattr(self,'_reader'): self._reader = "Not needed" if not hasattr(self,'_searcher'): try: # lazily instantiate searcher print "Setting searcher to " + str(self._index) self._searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(self._index))) except Exception as e:#Exception(e): print e.message print "Searcher Initialisation Error" except Exception as e:#CorruptIndexException(e): print e.message logging.error("Lucene Error") def getSpecificityScores(self): # map = Hashtable[str, Nullable]() logging.info("Need to implement....") return map # * # * find lucene annotations for this poc specialTreatment is for common nouns so that they are searched with stem not # * exact match # * # * @param annotation # * @return # def searchIndex(self, annotation, specialTreatment): if specialTreatment: return self.searchStemFirst(annotation) annotations = list() #ArrayList[Annotation]() try: maxSynonyms = 0 stemAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT) # Analyzer stemmedAnalyser = AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)), # synonymMap, maxSynonyms); analyser = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser) pocString = QueryParser.escape(annotation.getText()) preparePocString = "\"" + pocString + "\"" preparePocStringLowercase = "\"" + pocString.lower() + "\"" query = parser.parse(preparePocString) result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) logging.debug("For " + str(query) + " : " + str(result.totalHits)) if freq <= 0: # search lowercased exact lowerCasedParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser) query = lowerCasedParser.parse(preparePocStringLowercase) # logging.info("Searching for: " + query.toString()); result = self._searcher.search(query, 1) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) logging.debug("For " + str(query) + " : " + str(result.totalHits)) if len(hits) == 0 and preparePocStringLowercase.index(" ") < 0: # search stemmed stemParser = 
QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemAnalyser) query = stemParser.parse(preparePocStringLowercase) # logging.info("Searching for: " + query.toString()); result = self._searcher.search(query, 1) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) logging.info("For " + str(query) + " : " + str(result.totalHits)) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) self._searcher.explain(query, hit.doc) ann = Annotation() features = dict() features[FreyaConstants.CLASS_FEATURE_LKB]=doc.get(FreyaConstants.CLASS_FEATURE_LKB) features[FreyaConstants.INST_FEATURE_LKB]=doc.get(FreyaConstants.INST_FEATURE_LKB) features[FreyaConstants.PROPERTY_FEATURE_LKB]=doc.get(FreyaConstants.PROPERTY_FEATURE_LKB) features["string"]=doc.get(FreyaConstants.FIELD_EXACT_CONTENT) features[FreyaConstants.SCORE]=hit.score ann.setFeatures(features) ann.setEndOffset(annotation.getEndOffset()) ann.setStartOffset(annotation.getStartOffset()) ann.setSyntaxTree(annotation.getSyntaxTree()) ann.setText(annotation.getText()) annotations.append(ann) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return annotations # * # * this method now search both stem and lowercase # * # * @param annotation # * @return # def searchStemFirst(self, annotation): annotations = list() pocString = QueryParser.escape(annotation.getText()) preparePocStringOriginal = "\"" + pocString + "\"" preparePocStringLowercase = "\"" + pocString.lower() + "\"" try: maxSynonyms = 0 # Analyzer stemmedAnalyser = # AnalyzerUtil.getSynonymAnalyzer(AnalyzerUtil # .getPorterStemmerAnalyzer(new StandardAnalyzer(Version.LUCENE_CURRENT)), # synonymMap, maxSynonyms); stemmedAnalyser = EnglishAnalyzer(Version.LUCENE_CURRENT) analyser = StandardAnalyzer(Version.LUCENE_CURRENT) stemParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_STEMMED_CONTENT, stemmedAnalyser) query = stemParser.parse(preparePocStringLowercase) result = self._searcher.search(query, 1) logging.info("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) stemHits = result.scoreDocs allHits = stemHits # if(stemHits.length == 0) { # search lowercased exact parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_LOWERCASED_CONTENT, analyser) query = parser.parse(preparePocStringLowercase) result = self._searcher.search(query, 1) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) lowHits = result.scoreDocs allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(lowHits) # ArrayUtils.addAll(allHits, lowHits) logging.info("For " + str(query) + " : " + str(result.totalHits)) # } # if(allHits.length == 0) { # search exact exactParser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.FIELD_EXACT_CONTENT, analyser) query = exactParser.parse(preparePocStringLowercase) result = self._searcher.search(query, 1) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) allHits = pyJava.JArray2List(allHits) + pyJava.JArray2List(result.scoreDocs) #ArrayUtils.addAll(allHits, result.scoreDocs) logging.info("For " + str(query) + " : " + str(result.totalHits)) # } # for (ScoreDoc hit : allHits) { indexus = 0 while indexus < len(allHits): hit = allHits[indexus] doc = self._searcher.doc(hit.doc) self._searcher.explain(query, 
hit.doc) ann = Annotation() features = dict() features[FreyaConstants.CLASS_FEATURE_LKB] = doc.get(FreyaConstants.CLASS_FEATURE_LKB) features[FreyaConstants.INST_FEATURE_LKB] = doc.get(FreyaConstants.INST_FEATURE_LKB) features[FreyaConstants.PROPERTY_FEATURE_LKB] = doc.get(FreyaConstants.PROPERTY_FEATURE_LKB) features["string"] = doc.get(FreyaConstants.FIELD_EXACT_CONTENT) features["score"] = hit.score ann.setFeatures(features) ann.setEndOffset(annotation.getEndOffset()) ann.setStartOffset(annotation.getStartOffset()) ann.setSyntaxTree(annotation.getSyntaxTree()) ann.setText(annotation.getText()) annotations.append(ann) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return annotations # * # * @return # def findPropertyURIs(self): uris = list() uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, None) uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, None) uris = uris + self.findRDFPropertyURIs(None) return uris # * # * @param max # * @return # def findPropertyURIs(self, max): uris = list() uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, max) uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, max) uris = uris + self.findRDFPropertyURIs(max) return uris # * # * @return # def findDatatypePropertyURIs(self): uris = list() uris = uris + self.findPropertyURIs(OWL.DATATYPEPROPERTY, None) return uris # * # * @return # def findObjectPropertyURIs(self): uris = list() uris = uris + self.findPropertyURIs(OWL.OBJECTPROPERTY, None) return uris # * # * @param max # * @return # def findRDFPropertyURIs(self, max): uris = list() owl = "http://www.w3.org/2002/07/owl" rdfProps = self.findPropertyURIs(RDF.PROPERTY, max) # for (String prop : rdfProps) { indexus = 0 while indexus < len(rdfProps): prop = rdfProps[indexus] if prop != None and not prop.startswith(owl): uris.append(prop) indexus += 1 return uris # * # * @return # def findClassURIs(self): uris = list() uris = uris + self.findPropertyURIs(OWL.CLASS, None) uris = uris + self.findPropertyURIs(RDFS.CLASS, None) return uris # * # * find lucene annotations for this poc # * # * @param annotation # * @return # def findPropertyURIs(self, propertyType, max): uris = list() # list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer) query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"") result = self._searcher.search(query, 1) freq = result.totalHits if max != None: freq = max.intValue() if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)) print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) self._searcher.explain(query, hit.doc) uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB)) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return uris # * # * @param propertyUri # * @return # def findPropertyRange(self, propertyUri): rangeUri = "http://www.w3.org/2000/01/rdf-schema#range" return self.searchForClass(propertyUri, rangeUri) # * # * @param propertyUri # * @return # def findPropertyDomain(self, propertyUri): rangeUri = "http://www.w3.org/2000/01/rdf-schema#domain" return self.searchForClass(propertyUri, rangeUri) # * # * given classUri search 
for field class so that pred=subClassOf # * # * @param classUri # * @return # def findSubClasses(self, classUri): #RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!! propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf" subClasses = list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB] flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST] subClassUri = "\"" + QueryParser.escape(propertyURI) + "\"" queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri] query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,queries, fields,flags,analyzer) result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB)) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return subClasses # * # * check whether this is datatype property or not # * # * @param propertyUri # * @return # def isItDatatypeProperty(self, propertyUri): result = self.checkIfItIsDatatypeProperty(propertyUri) exists = False if result != None and len(result) > 0: exists = True # logging.info("isItDatatypeProperty for " + propertyUri + " is " + exists); return exists # * # * @param classUri # * @return # def getDefinedPropertiesWhereClassIsADomain(self, classUri): properties = self.searchForInstance(classUri, RDFS.DOMAIN) return properties # Apparently there is no overloading in Python! 
MUST CHANGE FLOW def getDefinedPropertiesWhereClassIsADomain(self, classUri, forceSuperClasses): properties = list() if forceSuperClasses: superClasses = self.findSuperClasses(classUri) superClasses.append(classUri) # for (String uri : superClasses) { indexus = 0 while indexus < len(superClasses): uri = superClasses[indexus] for each in self.getDefinedPropertiesWhereClassIsADomain(uri,False): properties.append(each) indexus += 1 else: properties = self.searchForInstance(classUri, RDFS.DOMAIN) return properties def getDefinedPropertiesWhereClassIsARange(self, classUri, forceSuperClasses): properties = list() if forceSuperClasses: superClasses = self.findSuperClasses(classUri) superClasses.append(classUri) # for (String uri : superClasses) { indexus = 0 while indexus < len(superClasses): uri = superClasses[indexus] for each in self.getDefinedPropertiesWhereClassIsARange(uri,False): properties.append(each) indexus += 1 else: properties = self.searchForInstance(classUri, RDFS.RANGE) return properties # * # * @param classUri # * @return # def getNeighbouringClassesWhereGivenClassIsADomain(self, classUri, forceSuperClasses): classes = list() if forceSuperClasses: # here recursively go and first find all super classes feedClasses = self.findSuperClasses(classUri) feedClasses.append(classUri) # then for each superclass do the same as above # for (String uri : feedClasses) { indexus = 0 while indexus < len(feedClasses): uri = feedClasses[indexus] for each in self.getNeighbouringClassesWhereGivenClassIsADomain(uri, False): classes.append(each) indexus += 1 else: properties = self.searchForInstance(classUri, RDFS.DOMAIN) # for (String property : properties) { indexus = 0 while indexus < len(properties): property = properties[indexus] for each in self.searchForClass(property, RDFS.RANGE): classes.append(each) indexus += 1 return classes def getNeighbouringClassesWhereGivenClassIsARange(self, classUri, forceSuperClasses): classes = list() if forceSuperClasses: # here recursively go and first find all super classes feedClasses = self.findSuperClasses(classUri) feedClasses.append(classUri) logging.info("found " + str(len(feedClasses)) + " super classes for " + classUri) # then for each superclass do the same as above # for (String uri : feedClasses) { indexus = 0 while indexus < len(feedClasses): uri = feedClasses[indexus] for each in self.getNeighbouringClassesWhereGivenClassIsARange(uri,False): classes.append(each) logging.info("found " + str(len(classes)) + " elements for " + uri) indexus += 1 else: properties = self.searchForInstance(classUri, RDFS.RANGE) # for (String property : properties) { indexus = 0 while indexus < len(properties): property = properties[indexus] for each in self.searchForClass(property, RDFS.DOMAIN): classes.append(each) indexus += 1 return classes # * # * @param classUri # * @return # def findSuperClasses(self, classUri): searchFinished = False directSuperClasses = list() superClassesToSave = list() while not searchFinished: directSuperClasses = self.searchForClass(classUri, RDFS.SUBCLASSOF) # print str(directSuperClasses) + " list" if len(directSuperClasses) == 0 or (len(directSuperClasses) != 0 and pyJava.contains(directSuperClasses,superClassesToSave)): searchFinished = True else: # logging.info("searchFinished for SuperClasses"); # System.out.println("size:"+directSuperClasses.size()); for each in directSuperClasses: superClassesToSave.append(each) # for (String cUri : directSuperClasses) { indexus = 0 while indexus < len(directSuperClasses): cUri = 
directSuperClasses[indexus]
            for each in self.findSuperClasses(cUri):
                superClassesToSave.append(each)
            indexus += 1
        searchFinished = True
        logging.info("For " + str(classUri) + " found "
                     + str(len(superClassesToSave)) + " super-classes.")
        return superClassesToSave

    def searchForInstance(self, classUri, pred):
        """Return the URIs of instances related to classUri through pred."""
        uris = list()
        fields = [FreyaConstants.CLASS_FEATURE_LKB,
                  FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + QueryParser.escape(classUri) + "\"",
                   "\"" + QueryParser.escape(pred) + "\""]
        try:
            query = MultiFieldQueryParser.parse(
                Version.LUCENE_CURRENT, queries, fields, flags,
                StandardAnalyzer(Version.LUCENE_CURRENT))
            result = self._searcher.search(query, 1)
            logging.debug("For " + query.toString() + " : "
                          + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
        except Exception as e:  # ParseException
            print(e)
            logging.error("Error")
        return uris

    def checkIfItIsDatatypeProperty(self, inst):
        """Check whether inst is declared an owl:DatatypeProperty; returns
        the matching instance URIs (an empty list if it is not)."""
        classUris = list()
        fields = [FreyaConstants.INST_FEATURE_LKB,
                  FreyaConstants.CLASS_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + inst + "\"",
                   "\"" + OWL.DATATYPEPROPERTY + "\""]
        try:
            query = MultiFieldQueryParser.parse(
                Version.LUCENE_CURRENT, queries, fields, flags,
                StandardAnalyzer(Version.LUCENE_CURRENT))
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    classUris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
        except Exception as e:  # ParseException
            print(e)
            logging.error("Error")
        return classUris

    def searchForClass(self, inst, pred):
        """Return the class URIs related to inst through pred."""
        classUris = list()
        fields = [FreyaConstants.INST_FEATURE_LKB,
                  FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        queries = ["\"" + QueryParser.escape(inst) + "\"",
                   "\"" + QueryParser.escape(pred) + "\""]
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            query = MultiFieldQueryParser.parse(
                Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.info("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
        except Exception as e:  # ParseException
            print(e)
            logging.error("Error")
        return classUris

    def findTopClasses(self):
        """Return the classes that are not rdfs:subClassOf any other class."""
        propertyURI = RDFS.SUBCLASSOF
        allClasses = list()
        topClasses = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT,
                                 FreyaConstants.PROPERTY_FEATURE_LKB, analyzer)
            query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"")
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
            for classUri in allClasses:
                logging.info("Checking whether " + classUri + " is a top class.")
                # Search for (inst, pred) and retrieve the class: if a
                # super-class exists this is not a top class, otherwise add
                # it to topClasses.
                classes = self.searchForClass(classUri, propertyURI)
                logging.info("super-classes found: " + str(len(classes)))
                if classes:  # was `classes != None or len(classes) > 0`, a bug
                    logging.info("This is not a top class...")
                else:
                    topClasses.append(classUri)
                    logging.info("Adding " + classUri + " to top classes.")
        except Exception as e:  # CorruptIndexException
            print(e)
            logging.error("Error")
        return topClasses

    def findOneDirectType(self, instanceUri):
        """Randomly get one direct type of instanceUri."""
        return self.findDirectTypes(instanceUri, 1)[0]

    def findDirectTypes(self, instanceUri, max_hits=None):
        """Find the direct types of instanceUri, up to max_hits of them (all
        of them when max_hits is None). The original code declared two
        overloads of this method; Python has no method overloading, so they
        are merged here into one method with an optional argument."""
        dTypes = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
            query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = max_hits if max_hits is not None else result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    self._searcher.explain(query, hit.doc)
                    dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
        except Exception as e:  # CorruptIndexException
            print(e)
            logging.error("Error")
        logging.debug("there are " + str(len(dTypes)) + " unique direct types")
        return dTypes

    def findLabels(self, instanceUri):
        """Find the rdfs:label annotations indexed for instanceUri."""
        labels = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.INST_FEATURE_LKB,
                      FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            labelOrTitleUris = "\"http://www.w3.org/2000/01/rdf-schema#label\""
            # + " OR http://purl.org/dc/elements/1.1/title"
            queries = ["\"" + QueryParser.escape(instanceUri) + "\"",
                       QueryParser.escape(labelOrTitleUris)]
            query = MultiFieldQueryParser.parse(
                Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
        except Exception as e:  # CorruptIndexException
            print(e)
            logging.error("Error")
        return labels

    def findLiteral(self, instanceUri, propertyURI):
        """Find literal values attached to instanceUri through propertyURI."""
        labels = list()
        try:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            fields = [FreyaConstants.INST_FEATURE_LKB,
                      FreyaConstants.PROPERTY_FEATURE_LKB]
            flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
            labelOrTitleUris = "\"" + propertyURI + "\""
            queries = ["\"" + QueryParser.escape(instanceUri) + "\"",
                       QueryParser.escape(labelOrTitleUris)]
            query = MultiFieldQueryParser.parse(
                Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
            result = self._searcher.search(query, 1)
            logging.debug("For " + str(query) + " : " + str(result.totalHits))
            freq = result.totalHits
            if freq > 0:
                result = self._searcher.search(query, freq)
                for hit in pyJava.JArray2List(result.scoreDocs):
                    doc = self._searcher.doc(hit.doc)
                    labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
        except Exception as e:  # CorruptIndexException
            print(e)
            logging.error("Error")
        return labels
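
# Hedged usage sketch for the lookup methods above. The enclosing class and
# its constructor appear earlier in this file, so a ready-made `searcher` is
# passed in here; `instance_uri` is a placeholder, not a value from the
# original code.
def _demo_lkb_lookup(searcher, instance_uri):
    """Log the labels, direct types, and top classes found in the index."""
    for label in searcher.findLabels(instance_uri):
        logging.info("label: %s", label)
    for class_uri in searcher.findDirectTypes(instance_uri):
        logging.info("direct type: %s", class_uri)
    return searcher.findTopClasses()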
class GitSearcher:
    def __init__(self, index_path):
        self.index_path = index_path
        self.reader = None
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." % n_docs)

    def tokenize_string(self, analyzer, string):
        """Run `string` through `analyzer` and collect the emitted tokens."""
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        """Split a camelCase or snake_case identifier into lowercase words."""
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        return re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1).lower().split()

    def document_to_query(self, doc):
        """Transform a document's source-code-related fields into a Lucene
        query string."""
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize, then split camelCase identifiers
                    terms = self.tokenize_string(
                        StandardAnalyzer(Version.LUCENE_CURRENT), term)
                    split_terms = []
                    for t in terms:
                        split_terms += self.camel_case_split(t)
                    # drop stopwords, stem, then drop stopwords again (some
                    # stems are themselves stopwords)
                    split_terms = [t for t in split_terms
                                   if t not in english_stop_words]
                    stemmed = [stem(t) for t in split_terms]
                    stemmed = [t for t in stemmed
                               if t not in english_stop_words]
                    # query generation
                    for term in stemmed:
                        query += "%s:%s " % (field, term)
        java_stoplist = [
            "java.lang.Object", 'void', 'Global', 'boolean', 'String', 'int',
            'char', 'float', 'double', 'write', 'close', 'from', 'println',
            'StringBuilder', 'toString', 'mkdir', 'exists'
        ]
        for field in ["typed_method_call", "methods", "used_classes",
                      "class_instance_creation", "methods_called",
                      "annotations", "literals"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)
        if len(doc.getFields("code_hints")) > 0:
            hints = [hint.stringValue()
                     for hint in doc.getFields("code_hints")]
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        """Recover which terms matched in `doc` by parsing explain() output."""
        matched_terms = []
        weight_expl = self.searcher.explain(
            query, doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, item_doc, user_query):
        github_result = []
        if not item_doc:
            # Placeholder result item; the original code appended to the
            # falsy object itself, which could never work, so assign instead.
            item_doc = ResultItem(None, 1.0, "No Title", 0, 0)
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)
        query += user_query
        query = remove_unified_stop_lists(query)
        separator = '.' * 96
        print(separator)
        print("Project Searcher Unified Query :", query)
        print(separator)
        write_search_log(separator + "\n"
                         + "Project Searcher Unified Query : " + str(query)
                         + "\n" + separator + "\n")
        queryparser = QueryParser(Version.LUCENE_CURRENT,
                                  "typed_method_call", self.analyzer)
        if query:
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query, limit).scoreDocs
                # ten hits per answer
                for temp, hit in enumerate(hits, start=1):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        like_query, hit.doc)
                    print("File %s" % temp, doc.get("file"), "//",
                          doc.get("file_content"))
                    write_search_log("File " + str(temp)
                                     + str(doc.get("file")) + "//"
                                     + str(doc.get("file_content")) + "\n")
                    file_path = doc.get("file")
                    print('file_path = ', file_path)
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except IOError:
                        print("CAN'T OPEN THE FILE")
                    if content:
                        item = GithubResultItem(doc.get("file"), content,
                                                matched_terms, hit.score,
                                                item_doc,
                                                doc.get("line_numbers"),
                                                hit.doc)
                        github_result.append(item)
            except Exception as e:
                print("GitSearcher Error: %s" % e)
                print(traceback.format_exc())
        print('github_result : ', github_result)
        return github_result
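
# Hedged usage sketch for GitSearcher: `index_path` and the query string are
# placeholders, not values from the original code. Passing None as item_doc
# exercises the "No Title" fallback in more_like_this2, so only the user
# query terms drive the search.
def _demo_git_search(index_path, user_query="read file lines", limit=10):
    searcher = GitSearcher(index_path)
    results = searcher.more_like_this2(limit, None, user_query)
    for item in results:
        print(item)
    return results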
class Index:
    def __init__(self, folder=None, fields=[], similarity="tfidf"):
        self.jcc = lucene.initVM()
        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()
        self.fields = {}
        for field in fields:
            ft = FieldType()
            for pname, pvalue in field.props.items():
                setter = getattr(ft, "set" + pname.capitalize())
                setter(pvalue)
            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
            # ft.setOmitNorms(True)
            self.fields[field.name] = ft
        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.writer = None
        self.searcher = None

    def attach_thread(self):
        self.jcc.attachCurrentThread()

    def open_writer(self):
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.directory, config)

    def add(self, **doc):
        if not self.writer:
            self.open_writer()
        d = Document()
        for field, value in doc.items():
            d.add(Field(field, value, self.fields[field]))
        self.writer.addDocument(d)

    def commit(self):
        self.writer.commit()

    def close(self):
        if self.writer:
            self.writer.close()

    def open_searcher(self):
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        if self.similarity == "bm25":
            self.searcher.setSimilarity(BM25Similarity())

    def preprocess_query(self, query, fields, mode="ANY"):
        '''
        Rewrite the query according to the provided mode ("ANY" joins terms
        with OR, "ALL" with AND). Any other value leaves the query unchanged.
        '''
        terms = query.lower().strip().split()
        if mode == "ANY":
            query = " OR ".join(terms)
        elif mode == "ALL":
            query = " AND ".join(terms)
        else:
            print("Invalid mode parameter '%s'." % mode)
        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        return MultiFieldQueryParser.parse(parser, query)

    def search(self, query, search_fields, return_fields, filter=None,
               ignore=set(), mode="ANY", return_scores=False, limit=1000000):
        '''
        Search documents in the index using a standard analyzer (tokenizes
        and removes stop words). Supports two search modes:
        ANY: include documents that contain at least one term of the query.
        ALL: include only documents that contain all terms of the query.
        '''
        if not self.searcher:
            self.open_searcher()
        # Return empty results if the query is empty (Lucene can't handle it nicely)
        if query.strip() == '':
            if return_scores:
                return [], []
            return []
        query = self.preprocess_query(query, search_fields, mode)
        # If no limit were given, all matches could be returned with a small
        # hack: query for one document and read the total hit count.
        # if not limit:
        #     hits = self.searcher.search(query, 1)
        #     limit = hits.totalHits
        # Fetch more than asked for in case entries from the ignore set have
        # to be removed, then slice back down to `limit` before returning.
        fetch_limit = limit
        if limit is not None:
            fetch_limit = limit + len(ignore)
        hits = self.searcher.search(query, filter, fetch_limit).scoreDocs
        docs = []
        scores = []
        # Collect scores in the same loop so they stay aligned with docs
        # once ignored entries are skipped.
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            if doc['id'] not in ignore:
                docs.append([doc[f] for f in return_fields])
                scores.append(hit.score)
        if return_scores:
            return docs[:limit], scores[:limit]
        return docs[:limit]

    def explain(self, query, fields, doc):
        if not self.searcher:
            self.open_searcher()
        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)
        return self.searcher.explain(query, doc)

    def get_documents(self, doc_ids, fields):
        docs = []
        for doc_id in doc_ids:
            doc = self.reader.document(doc_id)
            if isinstance(fields, str):
                docs.append(doc.get(fields))
            else:
                docs.append({f: doc.get(f) for f in fields})
        return docs

    def get_query_scores(self, query, fields, doc_ids, mode="ANY"):
        # Pre-filter so that all documents outside doc_ids are ignored
        filter = TermsFilter([Term("id", id) for id in doc_ids])
        query = self.preprocess_query(query, fields, mode)
        hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs
        # Map entity ids (rather than internal index ids) to scores
        scores = {
            str(self.reader.document(hit.doc).get("id")): hit.score
            for hit in hits
        }
        # Normalize to the 0..1 interval (currently disabled):
        # n = 1.0 / sum(scores.values())
        # Add entries for docs that were not returned (no term found)
        for doc_id in doc_ids:
            if doc_id not in scores:
                scores[doc_id] = 0.0
        return scores
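
# Hedged usage sketch for Index: the field-descriptor objects it expects
# (anything with a `name` and a `props` dict of FieldType setter values) are
# defined elsewhere in this codebase, so _FieldSpec below is a minimal
# stand-in, not the original class.
class _FieldSpec:
    def __init__(self, name, props):
        self.name = name    # Lucene field name
        self.props = props  # e.g. {"stored": True, "tokenized": False}

def _demo_index():
    """Build a small RAMDirectory index and run an ANY-mode search."""
    fields = [_FieldSpec("id", {"stored": True, "tokenized": False}),
              _FieldSpec("content", {"stored": True, "tokenized": True})]
    index = Index(fields=fields, similarity="bm25")  # RAMDirectory by default
    index.add(id="1", content="searching lucene indices from python")
    index.add(id="2", content="building an in-memory index")
    index.commit()
    docs, scores = index.search("lucene python", ["content"],
                                ["id", "content"], return_scores=True)
    print(docs, scores)
    index.close()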