示例#1
0
 def __init__(self, project_name):
     """Remember the project and create the helper objects used for scoring.

     :param project_name: stored on the instance and also handed to
         ``CosineSimilarity``.
     """
     self.project_name = project_name
     # The similarity helper is bound to the project; the term-frequency
     # helper is created without arguments.
     self.cs = CosineSimilarity(project_name)
     self.tf = TermFrequency()
示例#2
0
class Process(object):
    """Rank the words of each request and score reviewers by similarity.

    The helpers used here (``CosineSimilarity``, ``TermFrequency``,
    ``get_request_title_description``, ``delete_stop_words`` and
    ``get_reviewer``) are defined outside this class.
    """

    def __init__(self, project_name):
        """Remember the project and create the helper objects used for scoring.

        :param project_name: stored on the instance, handed to
            ``CosineSimilarity`` and later to the data-access helpers.
        """
        self.project_name = project_name
        self.cs = CosineSimilarity(self.project_name)
        self.tf = TermFrequency()

    def get_corpus(self, train_time):
        """Build one word counter per request fetched for the project.

        Every item yielded by ``get_request_title_description`` is read as a
        mapping; only its first key (used as the request number) and first
        value (the text; presumably title plus description -- confirm against
        the helper) are used.

        :param train_time: forwarded unchanged to
            ``get_request_title_description``.
        :return: ``(numbers, counters)`` -- two parallel lists holding the
            request numbers and a ``Counter`` of the words left after
            ``delete_stop_words``.
        """
        numbers = []
        counters = []
        fetched = get_request_title_description(self.project_name, train_time)
        for entry in fetched:
            number, text = list(entry.items())[0]
            counters.append(Counter(delete_stop_words(text)))
            numbers.append(number)

        return numbers, counters

    def get_corpus_result(self, train_time):
        """Order the words of every request by descending TF-IDF score.

        Prints the wall-clock time spent on the ranking loop.

        :param train_time: forwarded unchanged to ``get_corpus``.
        :return: dict mapping each request number to the list of its words,
            highest ``self.tf.tf_idf`` score first; words with equal scores
            keep the order in which they appear in the counter.
        """
        numbers, counters = self.get_corpus(train_time)
        began = time.time()

        ranking = {}
        for number, counter in zip(numbers, counters):
            weights = {}
            for word in counter:
                weights[word] = self.tf.tf_idf(word, counter, counters)
            # Sorting the keys by their weight is stable, so ties stay in
            # insertion order.
            ranking[number] = sorted(weights, key=weights.get, reverse=True)

        elapsed = time.time() - began
        print('spend time is {}'.format(elapsed))
        return ranking

    def cos_score(self, new, past, reviewer, k):
        """Accumulate, per reviewer, the similarity of ``new`` to past requests.

        A past request is skipped when its word collection is empty, when
        ``new`` is empty, or when ``reviewer`` has no (or an empty) entry for
        it. Otherwise ``self.cs.score(new, past_word, number)`` is added to
        the running total of every reviewer listed for that request.

        :param new: words of the request being scored.
        :param past: dict mapping request number to that request's words.
        :param reviewer: dict mapping request number to an iterable of
            reviewers.
        :param k: the number of pull request; accepted but not used by the
            current implementation.
        :return: list of ``(reviewer, total_score)`` pairs sorted by
            descending score. If no total is greater than zero the list is
            empty; otherwise every pair is returned, including those whose
            total is zero or negative.
        """
        totals = {}

        for number, past_word in past.items():
            past_tokens = list(past_word)
            assigned = reviewer.get(number)
            if not (past_tokens and new and assigned):
                continue

            similarity = self.cs.score(new, past_word, number)

            for name in assigned:
                if name in totals:
                    totals[name] = totals[name] + similarity
                else:
                    totals[name] = similarity

        ranked = list(totals.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        # The positive-score subset only decides whether anything is
        # returned; it is not what gets returned.
        positive = [pair for pair in ranked if pair[1] > 0]
        if not positive:
            return []

        return ranked

    def cos_test_info(self, train_time):
        """Fetch the ranked words and the reviewer data for the project.

        Prints the value returned by ``get_reviewer``. No scoring is done
        here; callers combine the two results themselves (for example with
        ``cos_score``).

        :param train_time: forwarded unchanged to ``get_corpus_result``.
        :return: ``(info_dict, reviewer)`` -- the result of
            ``get_corpus_result`` and the value returned by
            ``get_reviewer(self.project_name)``.
        """
        ranked_words = self.get_corpus_result(train_time)
        reviewers = get_reviewer(self.project_name)
        print(reviewers)

        return ranked_words, reviewers