class GettingQuestionDocs:
    """Opens the question index at index_path; search() then fetches the docs
    for the question-id list returned by the answer searcher."""

    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()

    def search(self, q_ids, limit):
        docs = []
        c = 0
        for i, q_id in enumerate(q_ids):
            # Questions that were never indexed cannot be found.
            query = TermQuery(Term("question_id", str(q_id)))
            # One hit per question: currently only the accepted answer is indexed.
            topdocs = self.searcher.search(query, 1).scoreDocs
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc, len(q_ids) - i, doc.get("title"),
                               doc.get("question_id")))
            if len(topdocs) > 0:
                c += 1
            if c >= limit:
                break
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
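
# A minimal usage sketch (hedged: assumes lucene.initVM() has run and that
# INDICES_PATH + 'questionIndex' names an index with a stored "question_id"
# field, as elsewhere in this module; the question ids are illustrative):
#
#   getDoc = GettingQuestionDocs(INDICES_PATH + 'questionIndex')
#   item_docs = getDoc.search([4308354, 1609423], 7)   # top-7 question docs
#   print 'Question ItemDoc count: ', len(item_docs)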
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_4_text'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    ### 8_Querying for the Final Results
    # Log: bench_result for each query
    bench_result, score_logs_for_each = benchsearcher.more_like_this3(
        5000, score_logs_for_each, user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    sorted_bench_results = sorted(bench_result,
                                  key=attrgetter('score'),
                                  reverse=True)
    print 'Search Count : ', len(sorted_bench_results)

    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)

    if bench_result:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'
    return recommended, hit_logs_for_each, score_logs_for_each
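
# A minimal invocation sketch (hedged: 'code_snippet' is an illustrative
# string; the two log arguments start empty and come back with tab- and
# newline-separated counts and scores appended):
#
#   recommended, hit_logs, score_logs = query_index(code_snippet, '', '')
#   print 'hit log row: ', hit_logs    # '<bench hits>\t<recommended>\t'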
class SimilarQsSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """Given a document, transforms the source-code-related fields into a
        Lucene query string."""
        query = ""
        # Only the "description" field is used here; since field:term pairs
        # are appended at the end, the result can contain many duplicates.
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords again (stemming can re-introduce stop words)
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        # Check here whether the unified query actually gets refined.
        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called"
        ]:  # "extends", "annotations", "literals"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self, query):
        new_query = " "
        for term in self.tokenize_string(self.porter_analyzer, query):
            if term:
                term = QueryParser.escape(term)
                new_query += "description:%s " % term
        return new_query

    def more_like_this2(self, item_doc, result_num):
        # Walk the incoming question doc, build the final query from it, and
        # search the question index for similar questions.
        similar_questions = []
        if not item_doc:
            item_doc = ResultItem(None, 1.0, "No Title", 0)
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "term",
                                  self.analyzer)
        if query:
            # At this point the unified query is already tokenized and stemmed.
            try:
                like_query = queryparser.parse(query)
                # Top result_num questions similar to this one (e.g., 3 per
                # question over 3 questions gives 9 in total).
                hits = self.searcher.search(like_query, result_num).scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    similar_questions.append(doc.get("question_id"))
            except Exception as e:
                print "Question Searcher: Error: %s" % e
                print(traceback.format_exc())
        # The searcher is reused across item_docs; the caller releases it.
        return similar_questions
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log: answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)

    # Log: answer -> question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    # Cut to the top 7 here to keep at least 7 questions in rank order.
    item_docs = getDoc.search(question_ids, 20)[0:7]

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log: question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log: similar-question count for each question ItemDoc
    i = 1
    if item_docs:
        for item_doc in item_docs:
            # Find 7 similar questions for each question.
            similar_question = question.more_like_this2(item_doc, 7)
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += '0' + '\t'
            similar_questions += similar_question
            i += 1
    else:
        hit_logs_for_each += '0\t' * 7  # seven log columns

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log: similar-question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log: question -> answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log: answer-doc count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_2'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    ### 7_Appending for the user query results
    ### 8_Querying for the Final Results
    # Log: bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query, 0)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += '0' + '\t'
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Pad the log row out to 49 bench columns.
    if len(answer_docs) < 49:
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += '0' + '\t'

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'

    sorted_bench_results = sorted(bench_results,
                                  key=attrgetter('score'),
                                  reverse=True)
    print 'Search Count : ', len(sorted_bench_results)

    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)

    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += '0' + '\t'
    return recommended, hit_logs_for_each, score_logs_for_each
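
# The open/refresh/acquire ... release/close sequence above is repeated for
# every index. A small helper could factor it out (a sketch using only names
# already present in this module; not wired into the pipeline above):

from contextlib import contextmanager


@contextmanager
def acquired_searcher(index_name):
    # Open the index, hand out a live searcher, and always clean up.
    directory = SimpleFSDirectory(File(INDICES_PATH + index_name))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()
    try:
        yield searcher
    finally:
        searchermgr.release(searcher)
        searchermgr.close()
        directory.close()

# Usage sketch:
#   with acquired_searcher('stackoverflow') as searcher:
#       answers = SnippetSearcher(searcher, user_code_query)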
def query_index(query):
    ### 1_Query Alternation
    user_code_query = Generator(query)
    print 'query: ', query
    print 'user_code_query: ', user_code_query

    # open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(10, query=user_code_query)
    print 'answer_ids: ', answer_ids

    # close
    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    print 'question ids: ', question_ids

    # open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Cutting items
    getDoc = GettingQuestionDocs(searcher)
    # Keep the top 3 question docs in rank order.
    item_docs = getDoc.search(question_ids, 10)[0:3]

    ### 5_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)
    if item_docs:
        for item_doc in item_docs:
            # Find 3 similar questions for each question.
            similar_question = question.more_like_this2(item_doc, 3)
            similar_questions += similar_question
    print 'similar_questions: ', similar_questions

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 6_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)
    print 'answer ids: ', answer_ids
    if not answer_ids:
        recommended = ''
        return recommended

    # dest_path = u'/Users/Falcon/Desktop/***Ongoing***/***[4]_FaCoY_Defect4J_Data_Share_Kui/Defect4J_Results/'
    # project_name = u'Chart/'
    # write_file()

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 7_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    directory = SimpleFSDirectory(File(INDICES_PATH + 'github'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    git_results = []
    gitsearcher = GitSearcher(searcher)

    ### 7_Appending for the user query results
    git_result = gitsearcher.more_like_this2(10, answer_docs[0],
                                             user_code_query, 1)
    git_results += git_result

    ### 8_Querying for the Final Results
    for answer_doc in answer_docs:
        git_result = gitsearcher.more_like_this2(10, answer_doc,
                                                 user_code_query, 0)
        git_results += git_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Keep at most three results per answer id.
    git_results = sorted(git_results,
                         key=attrgetter('so_item.answer_id'),
                         reverse=True)
    id = 0
    i = 0
    temp_result = []
    for item in git_results:
        if id != item.so_item.answer_id:
            id = item.so_item.answer_id
            i = 1
            temp_result.append(item)
        elif i < 3:
            i += 1
            temp_result.append(item)

    sorted_git_results = sorted(temp_result,
                                key=attrgetter('score'),
                                reverse=True)
    print 'Search Count : ', len(sorted_git_results)
    recommended = recommend(sorted_git_results)
    print 'Final Count : ', len(recommended)

    # Saving Defect4J query results:
    # cot = 0
    # for c, item in enumerate(recommended):
    #     cot += 1
    #     if cot > 10:
    #         break
    #     result_file = "/Users/Falcon/Desktop/Pycharm_Project/FaCoY_Project/GitSearch/Defect4J_FaCoY/" + str(c+1) + "_" + str('_'.join(str(item[0]).split("/")[6:]))
    #     write_file_over(result_file, str(item.file_content))

    # result_file = '/Users/Falcon/Desktop/test.txt'
    # if os.path.exists(result_file):
    #     os.remove(result_file)
    # write_file(result_file, 'User Code Query \n' + str(query) + '\n' + '---------------------------' + '\n')
    # for c, i in enumerate(recommended):
    #     contents = 'Rank: %d' % (int(c) + int(1))
    #     contents += '\nFile path: %s' % str(i.file[6:]) + '\n' + '---------------------------' + '\n'
    #     contents += str(i.file_content) + '\n' + '=================================================================' + '\n\n\n'
    #     write_file(result_file, contents)

    return recommended
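
# Worked example of the per-answer-id cap above: if the sorted git_results
# carry so_item.answer_id values [7, 7, 7, 7, 5], temp_result keeps the first
# three '7' items and the single '5' item.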
class SnippetSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()

    def get_matched_keywords(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                like_query = queryparser.parse(query)
                hits = self.searcher.search(like_query, result_num).scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    result.append(doc.get("answer_id"))
            except Exception as e:
                print "AnswerSearcher: Error: %s" % e
                print(traceback.format_exc())
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result

    def find_question_ids(self, answer_ids):
        result_list = []
        for id in answer_ids:
            query = "SELECT parentID from posts where id = %s" % id
            question_id = DBManager.requestOneColumnQuery(query)
            result_list.append(question_id[0])
        return result_list


# if __name__ == '__main__':
#     query = """
#     typed_method_call:FTPClient.setControlEncoding typed_method_call:FTPClient.login typed_method_call:FTPClient.disconnect typed_method_call:FTPClient.enterLocalPassiveMode typed_method_call:FTPClient.isConnected typed_method_call:FTPClient.setFileType typed_method_call:FTPClient.connect typed_method_call:FTPClient.storeFile typed_method_call:FTPClient.logout typed_method_call:FTPClient.changeWorkingDirectory typed_method_call:Log.e typed_method_call:File.getName typed_method_call:FTPClient.makeDirectory typed_method_call:FileInputStream.close
#     used_classes:FTP used_classes:Log used_classes:FTPClient used_classes:FileInputStream used_classes:boolean
#     class_instance_creation:FTPClient class_instance_creation:FileInputStream
#     methods:uploadFile methods:login methods:FTPConnector
#     methods_called:disconnect methods_called:makeDirectory methods_called:setFileType methods_called:getName methods_called:e methods_called:isConnected methods_called:login methods_called:storeFile methods_called:enterLocalPassiveMode methods_called:logout methods_called:changeWorkingDirectory methods_called:close methods_called:setControlEncoding methods_called:connect
#     literals:LOGIN ERROR literals:UTF-8 literals:Artbit3 literals:FTP_UPLOAD literals:artbit123 literals:FTP_CONNECT literals:music_upload
#     """
#
#     answer = SnippetSearcher("%sstackoverflow" % INDICES_PATH, query)
#
#     # Find answer posts whose snippets are similar to the user code query.
#     answer_ids = answer.more_like_this(10, query=query)
#     print answer_ids
#
#     # Find the question post ids associated with the derived answer posts.
#     question_ids = answer.find_question_ids(answer_ids)
#     print question_ids
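
# find_question_ids issues one SELECT per answer id. A batched variant might
# look like the sketch below (an assumption: DBManager.requestOneColumnQuery
# would return the first column of every matching row, as the per-id calls
# above suggest):
#
#   ids = ",".join(str(id) for id in answer_ids)
#   parent_ids = DBManager.requestOneColumnQuery(
#       "SELECT parentID FROM posts WHERE id IN (%s)" % ids)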
class GettingAnswerDocs:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()

    def search(self, a_ids):
        docs = []
        # For the exceptional user query (kept for reference):
        # if a_ids[0] == '0':
        #     query = TermQuery(Term("answer_id", str(a_ids)))
        #     topdoc = self.searcher.search(query, 1).scoreDocs
        #     doc = self.searcher.doc(topdoc[0][0].doc)
        #     docs.append(doc, 0, 'No Title', 'No Question id', 'No Answer id', 'No Description')
        #     return docs
        for i, a_id in enumerate(a_ids):
            query = TermQuery(Term("answer_id", str(a_id)))
            topdocs = self.searcher.search(query, 1).scoreDocs
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc, len(a_ids) - i, doc.get("title"),
                               doc.get("question_id"), doc.get("answer_id"),
                               doc.get("description")))
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
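
# A minimal usage sketch (the index name matches the calls elsewhere in this
# module; the answer ids are illustrative):
#
#   getDoc = GettingAnswerDocs(INDICES_PATH + 'stackoverflow')
#   answer_docs = getDoc.search([11227902, 4308354])
#   print 'Answer Docs count: ', len(answer_docs)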
class BenchSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """Given a document, transforms the source-code-related fields into a
        Lucene query string."""
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords again
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  # "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder',
                        'toString', 'mkdir', 'exists'
                    ]
                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, item_doc, score_logs_for_each,
                        user_query, flag):
        # flag: 1 = use the user query (UQ), 0 = build the query from item_doc.
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = BenchResultItem(doc.get("file"), content,
                                               matched_terms, hit.score,
                                               item_doc,
                                               doc.get("line_numbers"),
                                               hit.doc)
                        bench_result.append(item)
            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())
        # The searcher is reused across answer docs; the caller releases it.
        return bench_result, score_logs_for_each

    def more_like_this3(self, limit, score_logs_for_each, user_query):
        bench_result = []
        query = ""
        query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                for i, hit in enumerate(hits):
                    score_logs_for_each += str(round(hit.score, 2)) + '\n'
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)
            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return bench_result, score_logs_for_each