Example #1
class GettingQuestionDocs:  # Builds a searcher from an index path; search() fetches the docs for the question list received from Answer Searcher_1.
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, q_ids, limit):
        docs = []
        c = 0
        for i, q_id in enumerate(q_ids):  # Questions that were never indexed cannot be found.
            query = TermQuery(Term("question_id", str(q_id)))
            topdocs = self.searcher.search(
                query, 1).scoreDocs  # Limited to one hit because only the accepted answer is currently indexed.
            # The TermQuery feeds the index searcher and pulls the top-n answers attached to the question id; scoreDocs carries the (floating-point) scores.
            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc,
                               len(q_ids) - i, doc.get("title"),
                               doc.get("question_id")))

            if len(topdocs) > 0:
                c += 1
                if c >= limit:
                    break

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
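Note that search() releases the searcher and closes the directory before returning, so each instance serves exactly one batch of lookups. A minimal usage sketch (the index path and question ids are illustrative, not from the source):

getter = GettingQuestionDocs("/path/to/questionIndex")
docs = getter.search([4616095, 1547688], limit=10)  # one ResultItem per indexed question
print 'question docs:', len(docs)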
Example #2
    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
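PerFieldAnalyzerWrapper routes each field to its mapped analyzer and falls back to the KeywordAnalyzer default for anything unmapped, so identifier fields stay single keyword tokens while "code", "description", and "literals" go through the Porter chain. A quick check of that routing, mirroring the tokenize_string helper from the other examples (StringReader and CharTermAttribute come from the same PyLucene imports these snippets already rely on):

wrapper = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)  # same construction as above
stream = wrapper.tokenStream("typed_method_call", StringReader("FTPClient.login"))
cattr = stream.addAttribute(CharTermAttribute)
stream.reset()
while stream.incrementToken():
    print cattr.toString()  # a single token: 'FTPClient.login'
stream.close()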
Example #3
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_4_text'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    benchsearcher = BenchSearcher(searcher)  # BigCloneBench
    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    bench_result, score_logs_for_each = benchsearcher.more_like_this3(
        5000, score_logs_for_each, user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_result,
                                  key=attrgetter('score'),
                                  reverse=True)

    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_result:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
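A sketch of how this variant is driven; Generator, BenchSearcher, recommend, and INDICES_PATH come from the surrounding project, while the query text and the empty log accumulators are illustrative:

user_code = "FTPClient.connect FTPClient.login"  # placeholder query
recommended, hit_logs, score_logs = query_index(user_code, '', '')
print 'recommended:', len(recommended)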
Example #4
class SimilarQsSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string"""
        query = ""
        for field in [
                "description"
        ]:  # Only the description field is used here; appending field:term pairs at the end produces many duplicates.
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    #tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    #CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    #stopwords
                    temp_2 = []
                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    #stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    #stopwords
                    temp_4 = []
                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    #query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called"
        ]:  # "extends", "annotations", "literals"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(
                        val.stringValue())  # Check whether the unified query gets refined at this point.
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                #field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self, query):
        new_query = " "
        for term in self.tokenize_string(self.porter_analyzer, query):
            if term:
                term = QueryParser.escape(term)
                new_query += "description:%s " % (term)
        return new_query

    def more_like_this2(
        self, item_doc, result_num
    ):  # Iterate over the incoming question docs, build the final query, and search the question index for similar ones.
        similar_questions = []
        if not item_doc:
            item_doc = ResultItem(None, 1.0, "No Title", 0)  # fall back to an empty placeholder item
        query = ""
        if item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)
        queryparser = QueryParser(Version.LUCENE_CURRENT, "term",
                                  self.analyzer)

        if query:  # At this point the unified query has already been tokenized and stemmed.
            try:
                like_query = queryparser.parse(query)
                hits = self.searcher.search(
                    like_query,
                    result_num).scoreDocs  # Top similar questions per source question (e.g., 3 each from 3 questions, 9 in total).

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    similar_questions.append(doc.get("question_id"))

            except Exception as e:
                print "Question Searcher: Error: %s" % e
                # write_search_log("Question Searcher: Error: %s" % e + "\n")
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        # self.searchermgr.release(self.searcher)
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return similar_questions
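document_to_query pushes the description field through tokenize -> camel-case split -> stopword removal -> stemming -> stopword removal before emitting field:term pairs. The camel-case step is plain re and can be traced standalone (a hand-checked sketch, not output from the source):

import re

def camel_case_split(s):
    s = s.replace("_", " ")
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
    return re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1).lower().replace("  ", " ").split()

print camel_case_split("FTPClient_setControlEncoding")
# -> ['ftp', 'client', 'set', 'control', 'encoding']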
Example #5
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    # Log : Answer - Question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    item_docs = getDoc.search(
        question_ids, 20)[0:7]  # Slice to the top 7 questions, keeping rank order.

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log : Similar Question count for each of Question ItemDoc
    if item_docs:
        for item_doc in item_docs:
            similar_question = question.more_like_this2(
                item_doc, 7)  # Find 7 similar questions for each question.
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += ('0' + '\t')
            similar_questions += similar_question
    else:
        hit_logs_for_each += '0\t' * 7  # zero-fill the 7 per-question slots

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Similar Question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log : Question - Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer Docs count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_2'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    # Exceptional
    ### 7_Appending for the user query results

    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query,
            0)  # , user_query=user_code_query)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += ('0' + '\t')
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if len(answer_docs) < 49:
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += ('0' + '\t')

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_results,
                                  key=attrgetter('score'),
                                  reverse=True)

    print 'Search Count : ', len(sorted_bench_results)
    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    return recommended, hit_logs_for_each, score_logs_for_each
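hit_logs_for_each grows into a single tab-separated row per query: answer count, question count, item-doc count, seven per-question similarity counts (zero-filled when item_docs is empty), the per-answer bench counts padded to 49 slots, and the final totals. A toy trace of that convention (values illustrative):

row = ''
for count in [20, 18, 7]:   # e.g., answers, questions, item docs
    row += str(count) + '\t'
row += '0\t' * 2            # zero-fill for missing slots
print repr(row)             # '20\t18\t7\t0\t0\t'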
Example #6
def query_index(query):
    ### 1_Query Alternation
    user_code_query = Generator(query)
    print 'query: ', query
    print 'user_code_query: ', user_code_query

    #open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(10, query=user_code_query)
    print 'answer_ids: ', answer_ids

    #close
    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)
    print 'question ids: ', question_ids

    #open
    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Cutting items
    getDoc = GettingQuestionDocs(searcher)
    item_docs = getDoc.search(
        question_ids, 10)[0:3]  # Slice to the top 3 questions, keeping rank order.
    # print 'item docs: ', item_docs

    ### 5_Finding 3 Similar Questions per a Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    if item_docs:
        for item_doc in item_docs:
            similar_question = question.more_like_this2(
                item_doc, 3)  # Find 3 similar questions for each question.
            similar_questions += similar_question

    print 'similar_questions: ', similar_questions

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    ### 6_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)
    print 'answer ids: ', answer_ids

    if not answer_ids:
        recommended = ''
        return recommended
        # dest_path = u'/Users/Falcon/Desktop/***Ongoing***/***[4]_FaCoY_Defect4J_Data_Share_Kui/Defect4J_Results/'
        # project_name = u'Chart/'###################################################
        # write_file()

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 7_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    # print 'answer docs: ', answer_docs

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    directory = SimpleFSDirectory(File(INDICES_PATH + 'github'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    git_results = []
    gitsearcher = GitSearcher(searcher)

    ### 7_Appending for the user query results
    git_result = gitsearcher.more_like_this2(10, answer_docs[0],
                                             user_code_query, 1)
    git_results += git_result

    # print 'answer docs: ', answer_docs

    ### 8_Querying for the Final Results
    for answer_doc in answer_docs:
        git_result = gitsearcher.more_like_this2(10, answer_doc,
                                                 user_code_query, 0)
        git_results += git_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    git_results = sorted(git_results,
                         key=attrgetter('so_item.answer_id'),
                         reverse=True)
    id = 0
    i = 0
    temp_result = []
    for item in git_results:
        if id != item.so_item.answer_id:
            id = item.so_item.answer_id
            i = 1
            temp_result.append(item)

        elif id == item.so_item.answer_id and i < 3:
            i += 1
            temp_result.append(item)
        elif id == item.so_item.answer_id and i >= 3:
            continue  # cap of 3 results per answer_id reached; skip

    sorted_git_results = sorted(temp_result,
                                key=attrgetter('score'),
                                reverse=True)

    print 'Search Count : ', len(sorted_git_results)
    recommended = recommend(sorted_git_results)
    print 'Final Count : ', len(recommended)

    # Saving the Defect4J query results
    # cot = 0
    # for c, item in enumerate(recommended):
    #     cot += 1
    #     if cot > 10:
    #         break
    #     result_file = "/Users/Falcon/Desktop/Pycharm_Project/FaCoY_Project/GitSearch/Defect4J_FaCoY/" + str(c+1) + "_" + str('_'.join(str(item[0]).split("/")[6:]))
    #     write_file_over(result_file, str(item.file_content))

    # result_file = '/Users/Falcon/Desktop/test.txt'
    # if os.path.exists(result_file):
    #     os.remove(result_file)
    #
    # write_file(result_file, 'User Code Query \n' + str(query) + '\n' + '---------------------------' + '\n')
    # for c, i in enumerate(recommended):
    #     contents = ''
    #     contents = 'Rank: %d' % (int(c)+int(1))
    #     contents += '\nFile path: %s' % str(i.file[6:]) + '\n' + '---------------------------' + '\n'
    #     contents += str(i.file_content) +'\n' + '=================================================================' + '\n\n\n'
    #     write_file(result_file, contents)

    return recommended
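The dedup loop above keeps at most three GitHub results per answer_id and relies on git_results being pre-sorted by answer_id. A compact standalone equivalent (a hypothetical helper with the same behavior under the same sort order):

def cap_per_key(items, key, n=3):
    kept, last, run = [], object(), 0
    for item in items:
        k = key(item)
        if k != last:        # a new answer_id group starts
            last, run = k, 0
        if run < n:          # keep at most n items per group
            kept.append(item)
            run += 1
    return kept

temp_result = cap_per_key(git_results, lambda r: r.so_item.answer_id)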
Example #7
class SnippetSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

        # index = SimpleFSDirectory(indexDir)
        # self.reader = IndexReader.open(index)
        # self.searcher = SearcherFactory.newSearcher(self.reader)

    def get_matched_keywords(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this(self, result_num, query):
        result = []
        queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called",
                                  self.porter_analyzer)
        if query:
            try:
                query = arranging_query_regex(query=query)
                # print '4. Right after the regex handling : ', query
                like_query = queryparser.parse(query)
                # print '5. Right after the Lucene parser : ', like_query

                hits = self.searcher.search(like_query, result_num).scoreDocs
                # filterScoreDosArray = hits.topDocs().scoreDocs;

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                    result.append(doc.get("answer_id"))

            except Exception as e:
                print "AnswerSearcher: Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return result

    def find_question_ids(self, answer_ids):
        result_list = []
        for id in answer_ids:
            # print "Answer id : ", id, " /// ", ;
            query = "SELECT parentID from posts where id = %s" % id
            question_id = DBManager.requestOneColumnQuery(query)
            result_list.append(question_id[0])
            # print "Question id : ", question_id[0]
        return result_list


# if __name__ == '__main__':
# 	query = """
# 	typed_method_call:FTPClient.setControlEncoding typed_method_call:FTPClient.login typed_method_call:FTPClient.disconnect typed_method_call:FTPClient.enterLocalPassiveMode typed_method_call:FTPClient.isConnected typed_method_call:FTPClient.setFileType typed_method_call:FTPClient.connect typed_method_call:FTPClient.storeFile typed_method_call:FTPClient.logout typed_method_call:FTPClient.changeWorkingDirectory typed_method_call:Log.e typed_method_call:File.getName typed_method_call:FTPClient.makeDirectory typed_method_call:FileInputStream.close used_classes:FTP used_classes:Log used_classes:FTPClient used_classes:FileInputStream used_classes:boolean class_instance_creation:FTPClient class_instance_creation:FileInputStream methods:uploadFile methods:login methods:FTPConnector methods_called:disconnect methods_called:makeDirectory methods_called:setFileType methods_called:getName methods_called:e methods_called:isConnected methods_called:login methods_called:storeFile methods_called:enterLocalPassiveMode methods_called:logout methods_called:changeWorkingDirectory methods_called:close methods_called:setControlEncoding methods_called:connect literals:LOGIN ERROR literals:UTF-8 literals:Artbit3 literals:FTP_UPLOAD literals:artbit123 literals:FTP_CONNECT literals:music_upload
# 	"""
#
# 	answer = SnippetSearcher("%sstackoverflow" % (INDICES_PATH), query)
#
# 	# Retrieve answer posts whose snippets are similar to the user code query
# 	answer_ids = answer.more_like_this(10, query=query)
# 	print answer_ids
#
# 	# Find the question post ids corresponding to each retrieved answer post
# 	question_ids = answer.find_question_ids(answer_ids)
# 	print question_ids
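more_like_this tears the searcher down before returning, but find_question_ids only touches the database, so the usage order in the commented block above still works. The lookup issues one SQL round trip per answer id; a hypothetical batched variant, assuming DBManager can run an IN query and hand back one parentID per row:

ids_csv = ",".join(str(a_id) for a_id in answer_ids)
rows = DBManager.requestOneColumnQuery(
    "SELECT parentID FROM posts WHERE id IN (%s)" % ids_csv)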
Example #8
class GettingAnswerDocs:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def search(self, a_ids):
        docs = []

        # #For exceptional UQ
        # if a_ids[0] == '0':
        # 	query = TermQuery(Term("answer_id", str(a_ids)))
        # 	print query
        # 	topdoc = self.searcher.search(query, 1).scoreDocs
        #
        # 	doc = self.searcher.doc(topdoc[0][0].doc)
        # 	docs.append(doc, 0, 'No Title', 'No Question id', 'No Answer id', 'No Description')
        # 	return docs
        # else:

        for i, a_id in enumerate(a_ids):
            query = TermQuery(Term("answer_id", str(a_id)))
            topdocs = self.searcher.search(query, 1).scoreDocs

            for hit in topdocs:
                doc = self.searcher.doc(hit.doc)
                docs.append(
                    ResultItem(doc,
                               len(a_ids) - i, doc.get("title"),
                               doc.get("question_id"), doc.get("answer_id"),
                               doc.get("description")))

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return docs
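Like the question-side getter, this class is single-use: search() releases the searcher and closes the directory on the way out. A minimal sketch (the path and answer ids are illustrative):

docs = GettingAnswerDocs("/path/to/stackoverflow").search(["636561", "1547688"])
print 'answer docs:', len(docs)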
Example #9
class BenchSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = File(index_path)
        # self.index_path = index_path
        self.directory = None
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
        # self.searchermgr.tryIncRef(self.searcher)
        # self.reader = DirectoryReader.open(self.directory)
        # self.searcher = IndexSearcher(self.reader)

    def tokenize_string(self, analyzer, string):
        result = []
        stream = analyzer.tokenStream(None, StringReader(string))
        cattr = stream.addAttribute(CharTermAttribute)
        stream.reset()
        while stream.incrementToken():
            result.append(cattr.toString())
        stream.close()
        return result

    def camel_case_split(self, s):
        import re
        s = s.replace("_", " ")
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)
        s = re.sub('([a-z0-9])([A-Z])', r'\1 \2',
                   s1).lower().replace("  ", " ").split()
        return s

    def document_to_query(self, doc):
        """ Given a document it transforms the source code related fields to a lucene query string """
        query = ""
        for field in ["description"]:
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    # tokenize
                    term = self.tokenize_string(StandardAnalyzer(), term)
                    # CamelCase
                    temp = []
                    for t in term:
                        temp += self.camel_case_split(t)
                    # stopwords
                    temp_2 = []

                    for t in temp:
                        if t not in english_stop_words:
                            temp_2.append(t)
                    # stemming
                    temp_3 = []
                    for t in temp_2:
                        temp_3.append(stem(t))
                    # stopwords
                    temp_4 = []

                    for t in temp_3:
                        if t not in english_stop_words:
                            temp_4.append(t)
                    # query generation
                    for term in temp_4:
                        query += "%s:%s " % (field, term)

        for field in [
                "typed_method_call", "methods", "used_classes",
                "class_instance_creation", "methods_called", "annotations",
                "literals"
        ]:  # "used_classes", , "literals" , "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())
                    java_stoplist = [
                        "java.lang.Object", 'void', 'Global', 'boolean',
                        'String', 'int', 'char', 'float', 'double', 'write',
                        'close', 'from', 'println', 'StringBuilder', 'write',
                        'toString', 'close', 'mkdir', 'exists'
                    ]

                    if term not in java_stoplist:
                        query += "%s:%s " % (field, term)

        if len(doc.getFields("code_hints")) > 0:
            hints = [
                hint.stringValue() for hint in doc.getFields("code_hints")
            ]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    if term not in english_stop_words:
                        # print "Including 'code_hints' from Doc_To_Query TERMs... //", term
                        query += "code_hints:%s " % term
        return query

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query,
                                            doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def more_like_this2(self, limit, item_doc, score_logs_for_each, user_query,
                        flag):  # flag: 1 = query with the user query (UQ), 0 = derive the query from item_doc
        bench_result = []
        query = ""
        if flag == 1:
            query += user_query
            # item_doc = ResultItem(None, 0.0, "No Title", 'None','None', None)

        if flag == 0 and item_doc.doc:
            query += self.document_to_query(item_doc.doc)

        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched = doc.get('file').split('/')[9].split('.')[0]
                    score_logs_for_each += str(matched) + '\t' + str(
                        round(hit.score, 2)) + '\n'
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:  # skip files that are missing or unreadable
                        pass

                    if content:
                        item = BenchResultItem(doc.get("file"), content,
                                               matched_terms,
                                               hit.score, item_doc,
                                               doc.get("line_numbers"),
                                               hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.release()
        # self.searcher = None
        # self.directory.close()
        # self.directory = None
        return bench_result, score_logs_for_each

    def more_like_this3(self, limit, score_logs_for_each, user_query):
        query = ""
        bench_result = []
        # if not item_doc:
        # 	item_doc.append(ResultItem(None, 1.0, "No Title", 0, 0))
        # if item_doc.doc:
        # 	query += self.document_to_query(item_doc.doc)

        query += user_query
        query = remove_unified_stop_lists(query)

        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call",
                                  self.analyzer)
        if query:
            try:
                parsed_query = queryparser.parse(query)
                hits = self.searcher.search(parsed_query, limit).scoreDocs
                temp = 1
                for i, hit in enumerate(hits):
                    score_logs_for_each += str(round(hit.score, 2)) + '\n'
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(
                        parsed_query, hit.doc)
                    # print "Matched Terms : ", matched_terms

                    # print("File %s" % temp, doc.get("file"), "//", doc.get("file_content"))
                    temp += 1

                    file_path = doc.get("file")
                    content = None
                    try:
                        with open(file_path) as f:
                            content = f.read()
                    except:  # skip files that are missing or unreadable
                        pass

                    if content:
                        item = BenchResultItem_UQ(doc.get("file"), content,
                                                  matched_terms, hit.score,
                                                  doc.get("line_numbers"),
                                                  hit.doc)
                        bench_result.append(item)

            except Exception as e:
                print "BenchSearcher Error: %s" % e
                print(traceback.format_exc())

        # self.searchermgr.decRef(self.searcher)
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()
        self.directory = None
        return bench_result, score_logs_for_each
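The flag argument of more_like_this2 selects the query source: 1 queries with the raw user query, 0 derives the query from item_doc via document_to_query. Note also that more_like_this2 leaves the searcher open (its teardown is commented out, so it can run inside a loop), while more_like_this3 releases the searcher and closes the directory. An illustrative call sequence (everything other than the signatures is a placeholder):

bench = BenchSearcher(INDICES_PATH + 'bigclonebench_2')
per_doc, score_logs = bench.more_like_this2(100, item_doc, '', user_code_query, 0)
uq_only, score_logs = bench.more_like_this3(5000, score_logs, user_code_query)  # tears down the searcher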