def search_index(indexfile, querytext, top=10, qe=False, default_field="text",
                 display_fields=["subreddit", "author", "text"]):
    lucene.initVM()
    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)
    analyser = StandardAnalyzer()
    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)
    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)

    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? "
              "(Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i - 1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]
        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext,
                                             relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
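# A minimal usage sketch of the function above; the index path is hypothetical,
# and print_results / queryexpansion.rocchio must be importable as assumed:
search_index("/path/to/lucene.index", "climate change", top=5, qe=True)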
class GettingQuestionDocs:
    # Builds an object from the given index path; search() then fetches the docs
    # for the question list received from the current AnswerSearcher_1.
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, q_ids, limit):
        docs = []
        c = 0
        for i, q_id in enumerate(q_ids):
            # Questions that were never indexed cannot be found.
            query = TermQuery(Term("question_id", str(q_id)))
            # Limited to 1 hit because only the accepted answer is indexed for now.
            topdocs = self.searcher.search(query, 1).scoreDocs
            # The TermQuery goes to the index searcher and returns the top-n answers
            # attached to the question id; scoreDocs carries the (floating-point) scores.
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc, len(q_ids) - i, doc.get("title"),
                               doc.get("question_id")))
            if len(topdocs) > 0:
                c += 1
            if c >= limit:
                break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
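# A minimal sketch of using GettingQuestionDocs on its own; the index location
# and question ids are hypothetical, and the instance closes its own index:
getter = GettingQuestionDocs(INDICES_PATH + 'questionIndex')
question_docs = getter.search([4308435, 1242452, 36367929], limit=3)
print len(question_docs)  # ResultItem objects, best-ranked question first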
def index(self):
    if not (os.path.exists(self._dataDir) and os.path.isdir(self._dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (
            self._dataDir)

    # Lucene 3.x-style API, consistent with the MaxFieldLength and optimize()
    # calls below: SimpleFSDirectory takes a java.io.File and the analyzer
    # takes a Version constant.
    dir = SimpleFSDirectory(File(self._indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    self.indexDirectory(writer, self._dataDir)

    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()
    return numIndexed
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_4_text'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    bench_result, score_logs_for_each = benchsearcher.more_like_this3(
        5000, score_logs_for_each, user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_result, key=attrgetter('score'),
                                  reverse=True)
    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)

    if bench_result:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
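# Hypothetical driver for the function above; `user_code` stands for the raw
# code-snippet string being queried, and the two log strings accumulate
# tab/newline-separated counts and scores across queries:
hit_logs, score_logs = '', ''
recommended, hit_logs, score_logs = query_index(user_code, hit_logs, score_logs)
print 'Recommended : ', len(recommended)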
def createind(product, url):
    "This function creates index for lucene"
    global counter
    counter += 1
    adId = counter
    adLine = product
    field_string = chunker(product.lower())
    field_related_words = getDbpediaMatches(product, field_string)

    lucene.initVM()  # note: initVM() should only be called once per process

    # 1. create an index
    index_path = File("Home/WishMatcherIndex")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(index_path)

    # 2. fill the index
    config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(index, config)
    # for title in TITLES:
    import time
    millis = int(round(time.time() * 1000))
    userid = str(millis)

    doc = Document()
    doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("FieldString", field_string, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print(adId)

    # 3. close resources
    writer.close()
    index.close()
    return ""
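# Hypothetical call; `counter`, `chunker` and `getDbpediaMatches` must already
# be defined at module level, as the function assumes:
counter = 0
createind("red running shoes", "http://example.com/ads/123")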
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)
        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)
        return tables

    def close(self):
        self.reader.close()
        self.directory.close()
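# A minimal usage sketch (the index path is hypothetical; lucene.initVM()
# must have been called before constructing Searcher):
searcher = Searcher("/path/to/table.index")
for table in searcher.find_by_name("customer"):
    print(table)  # one {field_name: field_value} dict per hit
searcher.close()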
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)

    # Log : Answer - Question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    # Sliced to 7 here so that at least the top 7 questions are kept in rank order.
    item_docs = getDoc.search(question_ids, 20)[0:7]

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log : Similar Question count for each Question ItemDoc
    i = 1
    if item_docs:
        for item_doc in item_docs:
            # Find 7 similar questions for each question.
            similar_question = question.more_like_this2(item_doc, 7)
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += ('0' + '\t')
            similar_questions += similar_question
            i += 1
    else:
        # one '0' per expected question slot (7 slots)
        hit_logs_for_each += ('0' + '\t' + '0' + '\t' + '0' + '\t' + '0' +
                              '\t' + '0' + '\t' + '0' + '\t' + '0' + '\t')

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Similar Question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log : Question - Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer Docs count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_2'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench
    # Exceptional
    ### 7_Appending for the user query results
    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query, 0)
        # , user_query=user_code_query)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += ('0' + '\t')
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if len(answer_docs) < 49:  # compare the count, not the list itself
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += ('0' + '\t')

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_results, key=attrgetter('score'),
                                  reverse=True)
    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)

    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
writer = IndexWriter(index, config)

def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)

create_index()
# writer.deleteAll()
writer.close()
index.close()

### retrieval
index = SimpleFSDirectory(File(sys.argv[1]))
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

def get_query_results(reader, query, n, field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))
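# A hedged example of querying the index built above; it follows the older
# Version-argument PyLucene API that this snippet uses, and assumes the same
# analyzer as at indexing time:
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
query = QueryParser(Version.LUCENE_CURRENT, "capital", analyzer).parse("Paris")
get_query_results(reader, query, 5, "country")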
def query_index(query):
    ### 1_Query Alternation
    user_code_query = Generator(query)
    print 'query: ', query
    print 'user_code_query: ', user_code_query

    # open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(10, query=user_code_query)
    print 'answer_ids: ', answer_ids

    # close
    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    print 'question ids: ', question_ids

    # open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Cutting items
    getDoc = GettingQuestionDocs(searcher)
    # Slice here to keep only the top-ranked questions (top 3).
    item_docs = getDoc.search(question_ids, 10)[0:3]
    # print 'item docs: ', item_docs

    ### 5_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)
    if item_docs:
        for item_doc in item_docs:
            # Find similar questions for each question (3 per question here).
            similar_question = question.more_like_this2(item_doc, 3)
            similar_questions += similar_question
    print 'similar_questions: ', similar_questions

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 6_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)
    print 'answer ids: ', answer_ids
    if not answer_ids:
        recommended = ''
        return recommended

    # dest_path = u'/Users/Falcon/Desktop/***Ongoing***/***[4]_FaCoY_Defect4J_Data_Share_Kui/Defect4J_Results/'
    # project_name = u'Chart/'
    # write_file()

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 7_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)
    # print 'answer docs: ', answer_docs

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    directory = SimpleFSDirectory(File(INDICES_PATH + 'github'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    git_results = []
    gitsearcher = GitSearcher(searcher)

    ### 7_Appending for the user query results
    git_result = gitsearcher.more_like_this2(10, answer_docs[0],
                                             user_code_query, 1)
    git_results += git_result
    # print 'answer docs: ', answer_docs

    ### 8_Querying for the Final Results
    for answer_doc in answer_docs:
        git_result = gitsearcher.more_like_this2(10, answer_doc,
                                                 user_code_query, 0)
        git_results += git_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Keep at most 3 results per answer id.
    git_results = sorted(git_results, key=attrgetter('so_item.answer_id'),
                         reverse=True)
    id = 0
    i = 0
    temp_result = []
    for item in git_results:
        if id != item.so_item.answer_id:
            id = item.so_item.answer_id
            i = 1
            temp_result.append(item)
        elif id == item.so_item.answer_id and i < 3:
            i += 1
            temp_result.append(item)
        elif id == item.so_item.answer_id and i > 3:
            continue

    sorted_git_results = sorted(temp_result, key=attrgetter('score'),
                                reverse=True)
    print 'Search Count : ', len(sorted_git_results)
    recommended = recommend(sorted_git_results)
    print 'Final Count : ', len(recommended)

    # Save the Defect4J query results
    # cot = 0
    # for c, item in enumerate(recommended):
    #     cot += 1
    #     if cot > 10:
    #         break
    #     result_file = "/Users/Falcon/Desktop/Pycharm_Project/FaCoY_Project/GitSearch/Defect4J_FaCoY/" + str(c+1) + "_" + str('_'.join(str(item[0]).split("/")[6:]))
    #     write_file_over(result_file, str(item.file_content))

    # result_file = '/Users/Falcon/Desktop/test.txt'
    # if os.path.exists(result_file):
    #     os.remove(result_file)
    #
    # write_file(result_file, 'User Code Query \n' + str(query) + '\n' + '---------------------------' + '\n')
    # for c, i in enumerate(recommended):
    #     contents = ''
    #     contents = 'Rank: %d' % (int(c) + int(1))
    #     contents += '\nFile path: %s' % str(i.file[6:]) + '\n' + '---------------------------' + '\n'
    #     contents += str(i.file_content) + '\n' + '=================================================================' + '\n\n\n'
    #     write_file(result_file, contents)

    return recommended
# Fragment: body of an indexing loop over (k, v) wiki-id/name pairs; the loop
# header and `doc = Document()` are not shown. `regex` is a precompiled
# punctuation pattern and `stop` a stopword set defined earlier.
v_orig2 = v
v = unicodedata.normalize('NFKD', v).encode('ascii', 'ignore')
v_orig = v.strip()
v = v.lower().strip()
doc.add(Field("wiki_id", str(k), Field.Store.YES, Field.Index.NOT_ANALYZED))
doc.add(
    Field("wiki_name_orig", str(v_orig), Field.Store.YES,
          Field.Index.NOT_ANALYZED))
doc.add(Field("wiki_name", str(v), Field.Store.YES, Field.Index.NOT_ANALYZED))
doc.add(
    Field("wiki_name_analyzed", str(v), Field.Store.YES,
          Field.Index.ANALYZED))
v_punct_removed = re.sub(' +', ' ', regex.sub(' ', v)).strip()
doc.add(
    Field("wiki_name_analyzed_nopunct", str(v_punct_removed),
          Field.Store.YES, Field.Index.ANALYZED))
v_stop_removed = " ".join(
    [x for x in nltk.word_tokenize(v_punct_removed) if x not in stop])
doc.add(
    Field("wiki_name_analyzed_nopunct_nostop", str(v_stop_removed),
          Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
i = i + 1
if i % 10000 == 0:
    print 'finished ', i

# After the loop:
print 'num errors while indexing ', num_errors
writer.close()
index.close()
class GettingAnswerDocs:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, a_ids):
        docs = []
        # # For exceptional UQ
        # if a_ids[0] == '0':
        #     query = TermQuery(Term("answer_id", str(a_ids)))
        #     print query
        #     topdoc = self.searcher.search(query, 1).scoreDocs
        #     doc = self.searcher.doc(topdoc[0][0].doc)
        #     docs.append(doc, 0, 'No Title', 'No Question id', 'No Answer id', 'No Description')
        #     return docs
        # else:
        c = 0
        for i, a_id in enumerate(a_ids):
            query = TermQuery(Term("answer_id", str(a_id)))
            topdocs = self.searcher.search(query, 1).scoreDocs
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc, len(a_ids) - i, doc.get("title"),
                               doc.get("question_id"), doc.get("answer_id"),
                               doc.get("description")))
            if len(topdocs) > 0:
                c += 1
            if c > len(a_ids):
                break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(
                os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True

        # Initialize lucene and JVM
        lucene.initVM()

        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(
            StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    # Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min,
            tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Returns:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']
        # Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)
        # Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(
            os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')
        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print(" 0 ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())
        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] >
                                                 qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product
        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # The magnitude of a vector is the square root of the dot product of
        # the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)
        return vectorMagnitude

    ##################################################
    # Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'],
                              Field.Store.YES))
            doc.add(
                Field(Indexer.CONTENT, document['content'],
                      self.__contentType))
            doc.add(
                StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'],
                                Field.Store.YES))
            doc.add(
                TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                          Field.Store.YES))
            doc.add(
                LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document
                # (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been
                # indexed) so we use updateDocument instead to replace the
                # old one matching the exact path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']),
                                      doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document in the Lucene index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()
        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()

        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(
                                math.ceil(
                                    ((freqMtx[termText][docStr] * termSize) /
                                     100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(
                            math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)
        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()
        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        # Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))
        return sorted(qList, key=lambda similarity: similarity[1],
                      reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Generates a list of (entity, relation, entity) tuples as its output.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)

        # Load NLTK Data
        nltkPath = os.path.dirname(
            os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        # ProgressBar
        print("Analyzing Document {0}".format(docIdx))
        pB = ProgressBar(len(sentences), prefix='Progress:')

        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(
                    ner.subtrees(
                        filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()

        gpeList = geolocator.GetFeatureCollection(gpeList)
        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()
        return content
class SnippetSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)
        # index = SimpleFSDirectory(indexDir)
        # self.reader = IndexReader.open(index)
        # self.searcher = SearcherFactory.newSearcher(self.reader)

    def get_matched_keywords(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                # print '4. Right after the regex handling : ', query
                like_query = queryparser.parse(query)
                # print '5. Right after the Lucene parser : ', like_query
                hits = self.searcher.search(like_query, result_num).scoreDocs
                # filterScoreDosArray = hits.topDocs().scoreDocs
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                    result.append(doc.get("answer_id"))
            except Exception as e:
                print "SnippetSearcher: Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result

    def find_question_ids(self, answer_ids):
        result_list = []
        for id in answer_ids:
            # print "Answer id : ", id, " /// "
            query = "SELECT parentID from posts where id = %s" % id
            question_id = DBManager.requestOneColumnQuery(query)
            result_list.append(question_id[0])
            # print "Question id : ", question_id[0]
        return result_list

# if __name__ == '__main__':
#     query = """
#     typed_method_call:FTPClient.setControlEncoding typed_method_call:FTPClient.login typed_method_call:FTPClient.disconnect
#     typed_method_call:FTPClient.enterLocalPassiveMode typed_method_call:FTPClient.isConnected typed_method_call:FTPClient.setFileType
#     typed_method_call:FTPClient.connect typed_method_call:FTPClient.storeFile typed_method_call:FTPClient.logout
#     typed_method_call:FTPClient.changeWorkingDirectory typed_method_call:Log.e typed_method_call:File.getName
#     typed_method_call:FTPClient.makeDirectory typed_method_call:FileInputStream.close
#     used_classes:FTP used_classes:Log used_classes:FTPClient used_classes:FileInputStream used_classes:boolean
#     class_instance_creation:FTPClient class_instance_creation:FileInputStream
#     methods:uploadFile methods:login methods:FTPConnector
#     methods_called:disconnect methods_called:makeDirectory methods_called:setFileType methods_called:getName
#     methods_called:e methods_called:isConnected methods_called:login methods_called:storeFile
#     methods_called:enterLocalPassiveMode methods_called:logout methods_called:changeWorkingDirectory
#     methods_called:close methods_called:setControlEncoding methods_called:connect
#     literals:LOGIN ERROR literals:UTF-8 literals:Artbit3 literals:FTP_UPLOAD literals:artbit123
#     literals:FTP_CONNECT literals:music_upload
#     """
#
#     answer = SnippetSearcher("%sstackoverflow" % (INDICES_PATH), query)
#     # Retrieve the answer posts whose snippets are most similar to the user's code query
#     answer_ids = answer.more_like_this(10, query=query)
#     print answer_ids
#
#     # Find the question post id attached to each retrieved answer post
#     question_ids = answer.find_question_ids(answer_ids)
#     print question_ids
class PyLucene:
    """
    PyLucene module api
    """
    def __init__(self, startJVM=False):
        if startJVM:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.STORE_DIR = "index_dir"
        self.store = SimpleFSDirectory(File(self.STORE_DIR))
        tmp_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = LimitTokenCountAnalyzer(tmp_analyzer, 10000)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.store, config)

    def close_store(self):
        self.store.close()

    def index_doc(self, doc_dict):
        """
        Index a doc to pylucene
        obs.: docid is a string not an integer
        """
        doc = Document()
        doc.add(Field("doc_id", doc_dict["doc_id"], TextField.TYPE_STORED))
        doc.add(Field("general_info", doc_dict["general_info"],
                      TextField.TYPE_NOT_STORED))
        doc.add(Field("subject", doc_dict["subject"],
                      TextField.TYPE_NOT_STORED))
        doc.add(Field("source", doc_dict["source"],
                      TextField.TYPE_NOT_STORED))
        doc.add(Field("initial_date", doc_dict["initial_date"],
                      TextField.TYPE_NOT_STORED))
        doc.add(Field("final_date", doc_dict["final_date"],
                      TextField.TYPE_NOT_STORED))
        body_text = doc_dict["content"]
        body_reader = StringReader(body_text)
        doc.add(Field("content", body_reader))
        self.writer.addDocument(doc)

        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        ticker.tick = False
        print 'done'

    def search_docs(self, value, field="general_info"):
        MAX_RESULTS = 1000
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        query = QueryParser(Version.LUCENE_CURRENT, field,
                            self.analyzer).parse(value)
        topDocs = searcher.search(query, MAX_RESULTS)
        return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
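# A minimal sketch of driving the PyLucene wrapper above; the field values are
# hypothetical, and startJVM=True is needed exactly once per process:
engine = PyLucene(startJVM=True)
engine.index_doc({
    "doc_id": "42", "general_info": "lucene indexing example",
    "subject": "search", "source": "unit-test",
    "initial_date": "2015-01-01", "final_date": "2015-12-31",
    "content": "PyLucene wraps the Java Lucene library.",
})
hits = engine.search_docs("indexing")
print len(hits)
engine.close_store()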
# In[81]:
doc.add(Field('text', texts, Field.Store.NO, Field.Index.ANALYZED))

# In[82]:
writer.addDocument(doc)

# In[83]:
f.close()

# In[75]:
f = open('gutenberg/austen-sense.txt')

# In[84]:
writer.close()

# In[86]:
index_dir.close()
        # Fragment: tail of a print_results helper (the loop header is not shown);
        # prints one formatted line per hit.
        document = isearcher.doc(hits[i].doc)
        fieldoutput = " | ".join(
            [str(document.get(field)) for field in display_fields])
        print("#{})\t".format(i + 1) + fieldoutput + "\n")


if __name__ == '__main__':
    lucene.initVM()
    indexdir = "/home/keerthana/Downloads/project-information-retrieval-master/src/lucene.index"
    lindex = SimpleFSDirectory(Paths.get(indexdir))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)
    analyser = StandardAnalyzer()
    parser = QueryParser(input("Enter your field :"), analyser)
    query = parser.parse(input("Enter Your SearchQuery : "))
    hits = isearcher.search(query, 10).scoreDocs
    print(hits)
    for i in range(len(hits)):
        print(i, hits[i])
        hitDoc = isearcher.doc(hits[i].doc)
        print("{} || {} || {}".format(hitDoc.get("subreddit"),
                                      hitDoc.get("id"), hitDoc.get("text")))
    if len(hits) == 0:
        print("No hits!")
    ireader.close()
    lindex.close()
class BenchSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)
        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """
        Given a document, transform the source-code-related fields into a
        Lucene query string.
        """
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords again, on the stemmed terms
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  # "used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder', 'write',
                        'toString', 'close', 'mkdir', 'exists'
                    ]
                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, item_doc, score_logs_for_each,
                        user_query, flag):
        # flag = UQ(1) or not(0)
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
            # item_doc = ResultItem(None, 0.0, "No Title", 'None', 'None', None)
        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT,
                                  "typed_method_call", self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms
                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = BenchResultItem(doc.get("file"), content,
                                               matched_terms, hit.score,
                                               item_doc,
                                               doc.get("line_numbers"),
                                               hit.doc)
                        bench_result.append(item)
            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.release()
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return bench_result, score_logs_for_each

    def more_like_this3(self, limit, score_logs_for_each, user_query):
        query = ""
        bench_result = []
        # if not item_doc:
        #     item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))
        # if item_doc.doc:
        #     query += self.document_to_query(item_doc.doc)
        query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT,
                                  "typed_method_call", self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    score_logs_for_each += str(round(hit.score, 2)) + '\n'
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms
                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass
                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)
            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return bench_result, score_logs_for_each