Example #1
def preprocess_javadoc(javadoc, javadoc_dict_classes, javadoc_dict_methods,
                       idf, w2v):
    # Index each Javadoc class and method under its fully qualified name, and
    # precompute an embedding matrix and idf vector for every description.

    for api in javadoc:

        javadoc_dict_classes[
            api.class_name] = api.package_name + '.' + api.class_name

        description_words = [
            SnowballStemmer('english').stem(word)
            for word in api.class_description
        ]
        api.class_description_matrix = similarity.init_doc_matrix(
            description_words, w2v)
        api.class_description_idf_vector = similarity.init_doc_idf_vector(
            description_words, idf)
        for api_method in api.methods_descriptions_stemmed:
            api.methods_matrix.append(
                similarity.init_doc_matrix(api_method, w2v))
            api.methods_idf_vector.append(
                similarity.init_doc_idf_vector(api_method, idf))
        for api_method in api.methods:
            javadoc_dict_methods[
                api.class_name + '.' +
                api_method] = api.package_name + '.' + api.class_name + '.' + api_method
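
A minimal usage sketch for the function above, assuming the pre-trained models are loaded the same way main() does below; the javadoc pickle path and the parsed-API objects are assumptions, not shown on this page:

import pickle
import gensim

w2v = gensim.models.Word2Vec.load('../data/skip_w2v_model_stemmed')
idf = pickle.load(open('../data/my_idf', 'rb'))
javadoc = pickle.load(open('../data/javadoc.pkl', 'rb'))  # hypothetical path

javadoc_dict_classes, javadoc_dict_methods = {}, {}
preprocess_javadoc(javadoc, javadoc_dict_classes, javadoc_dict_methods, idf, w2v)
print(javadoc_dict_classes.get('ArrayList'))  # e.g. 'java.util.ArrayList'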
Example #2
def preprocess_all_records_new(records, idf, w2v):
    processed_records = list()
    for record in records:
        # class_description_words = WordPunctTokenizer().tokenize(record.class_description.lower())
        title_words = WordPunctTokenizer().tokenize(record.title.lower())
        title_words = [
            SnowballStemmer('english').stem(word) for word in title_words
        ]
        # record.class_description_words = class_description_words
        # record.method_annotation_words = method_annotation_words
        record.title_words = title_words
        record.title_matrix = similarity.init_doc_matrix(title_words, w2v)
        record.title_idf_vector = similarity.init_doc_idf_vector(
            title_words, idf)
        method_api_sequence = record.method_api_sequence.split()
        final_method_api_sequence = list()
        for api in method_api_sequence:
            # skip malformed entries and those whose package part is "missing"
            if api[0] == "." or api[:api.find(".")].lower() == "missing":
                continue
            final_method_api_sequence.append(api)
        record.method_api_sequence = final_method_api_sequence
        record.method_block_flat = record.method_block_flat.split()
        if len(record.method_api_sequence) <= 0:
            continue
        processed_records.append(record)
    return processed_records
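
The similarity helpers used throughout these examples are not shown on this page. A plausible minimal sketch, assuming init_doc_matrix stacks the word embeddings of in-vocabulary words and init_doc_idf_vector looks up per-word idf weights (names, shapes, and fallbacks here are assumptions, not the library's confirmed behavior):

import numpy as np

def init_doc_matrix(words, w2v):
    # Hypothetical: one row per in-vocabulary word, taken from the gensim
    # Word2Vec model's keyed vectors.
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    if not vecs:
        return np.zeros((1, w2v.vector_size))
    return np.vstack(vecs)

def init_doc_idf_vector(words, idf):
    # Hypothetical: idf weight per word, 0.0 for words missing from the table.
    return np.array([idf.get(w, 0.0) for w in words])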
Example #3
def preprocess_all_questions(questions, idf, w2v):
    processed_questions = list()
    for question in questions:
        title_words = WordPunctTokenizer().tokenize(question.title.lower())
        if title_words[-1] == '?':
            title_words = title_words[:-1]
        if len(title_words) <= 3:
            continue
        title_words = [
            SnowballStemmer('english').stem(word) for word in title_words
        ]
        question.title_words = title_words
        question.matrix = similarity.init_doc_matrix(question.title_words, w2v)
        question.idf_vector = similarity.init_doc_idf_vector(
            question.title_words, idf)
        processed_questions.append(question)

    return processed_questions
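
For reference, the WordPunctTokenizer / SnowballStemmer pair used in every example here behaves like this (standard NLTK APIs):

from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer

words = WordPunctTokenizer().tokenize("How to read a text file quickly?".lower())
# -> ['how', 'to', 'read', 'a', 'text', 'file', 'quickly', '?']
stemmer = SnowballStemmer('english')
print([stemmer.stem(w) for w in words[:-1]])
# -> ['how', 'to', 'read', 'a', 'text', 'file', 'quick']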
Example #4
def main():
    w2v = gensim.models.Word2Vec.load(
        '../data/skip_w2v_model_stemmed')  # pre-trained word embedding
    idf = pickle.load(
        open('../data/my_idf',
             'rb'))  # pre-trained idf value of all words in the w2v dictionary
    records = pickle.load(open("../data/records_final.pkl", 'rb'))
    print(len(records))
    # load the questions that need recommendations
    experiments = util.get_class_experiments()
    print(len(experiments))

    csvfile_path = os.path.join(args.output_path,
                                "topclass_expand11-10.csv")  # output file
    csvfile = open(csvfile_path, 'w', newline="")
    writer = csv.writer(csvfile)
    writer.writerow(
        ["question_title", "top5", "ground_truth_intersection", "true_apis"])
    # the set of all questions' APIs; check whether an answer exists in it

    # counters: questions we can recommend for, and successful recommendations
    recommend_num = 0
    recommend_success_num = 0
    processnum = 0
    # evaluation metrics
    mrr = 0.0
    map = 0.0
    precision = 0
    recall = 0
    ndcg = 0.0

    rec_num = args.rec_num
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for experiment in experiments:
        experiment_method_annotation = experiment.method_annotation

        # print(experiment_method_annotation)
        experiment_now_method_flat = experiment.now_method_flat
        experiment_true_api = experiment.true_api
        experiment_now_api = experiment.now_api
        # set difference: drop APIs already present in the current context
        experiment_true_api = set(experiment_true_api) - set(
            experiment_now_api)

        query = experiment_method_annotation
        query_words = WordPunctTokenizer().tokenize(query.lower())
        query_words = [
            SnowballStemmer('english').stem(word) for word in query_words
        ]
        query_matrix = similarity.init_doc_matrix(query_words, w2v)
        query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

        # retrieve the top-N similar questions
        top_questions = similarity.get_topk_questions(query_words,
                                                      query_matrix,
                                                      query_idf_vector,
                                                      records, 11, 0.0)
        # number of similar questions retrieved
        # print(top_questions)
        similar_questions_length = len(top_questions)
        # print("similar_questions_length:",similar_questions_length)
        # check whether the current query is already among the similar questions;
        # if not, add it later, otherwise build the tensor from the similar
        # questions directly
        flag = False

        similar_records_list = list(top_questions.keys())
        for record in similar_records_list:
            if (record.title_words == query_words):
                flag = True
        processnum += 1
        # the current query appears among the similar questions
        record_method_annotation_words = list()
        record_method_flat = list()
        record_api = list()
        for record in similar_records_list:
            if record.title_words not in record_method_annotation_words:
                record_method_annotation_words.append(record.title_words)
            if record.method_block_flat not in record_method_flat:
                record_method_flat.append(record.method_block_flat)
            for api in record.method_api_sequence:
                if api not in record_api:
                    record_api.append(api)
        # add the APIs present in the current programming context
        for now_api in experiment_now_api:
            if now_api not in record_api:
                record_api.append(now_api)

        api_rec_all = []

        if flag:
            recommend_num += 1
            # build the (title, code block, API) tensor

            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            # The index dicts map 0..n-1 to the items in order, so a reverse
            # lookup is simply list.index() on the underlying lists.
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(
                        record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1
            for api in experiment_now_api:
                if api in record_api_dict.values():
                    # context APIs co-occur with the query across all code blocks
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1
            # handle degenerate cases where the tensor collapses to a matrix
            # or a vector
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                # at least two of the three dimensions are 1: recommend the
                # collected APIs directly
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1 or
                        len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif (len(record_api) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if (len(record_method_annotation_words) == 1):
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()), 1, 1,
                                -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif (len(record_method_flat) == 1):
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)

            else:
                # tensor factorization
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(rank=round(
                    min(len(record_method_annotation_words),
                        len(record_method_flat), len(record_api)) / 2),
                                       validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)

                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()), 1, 1, -1,
                        1).values())[0]

                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list; drop APIs already present in the context
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)

        # the current query is not among the similar questions
        else:
            similar_questions_length += 1

            # skip queries for which no similar question was found
            if similar_questions_length == 1:
                continue
            recommend_num += 1
            # append the incoming query
            record_method_annotation_words.append(query_words)
            print(len(record_method_annotation_words), len(record_method_flat),
                  len(record_api))
            # build the tensor
            record_method_annotation_words_dict = dict(
                zip(range(len(record_method_annotation_words)),
                    record_method_annotation_words))
            record_method_flat_dict = dict(
                zip(range(len(record_method_flat)), record_method_flat))
            record_api_dict = dict(zip(range(len(record_api)), record_api))
            tensor = np.zeros((len(record_method_annotation_words),
                               len(record_method_flat), len(record_api)),
                              dtype=int)
            for record in similar_records_list:
                for concrete_api in record.method_api_sequence:
                    tensor[record_method_annotation_words.index(
                        record.title_words),
                           record_method_flat.index(record.method_block_flat),
                           record_api.index(concrete_api)] = 1

            for api in experiment_now_api:
                if api in record_api_dict.values():
                    tensor[record_method_annotation_words.index(query_words), :,
                           record_api.index(api)] = 1
            # handle degenerate cases where tensor factorization does not apply
            one = query_words
            if len(record_api) == 0:
                continue
            if (len(record_method_annotation_words) == 1
                    or len(record_method_flat) == 1 or len(record_api) == 1):
                # at least two of the three dimensions are 1: recommend the
                # collected APIs directly
                if (len(record_method_annotation_words) == 1
                        and len(record_method_flat) == 1 or
                        len(record_method_flat) == 1 and len(record_api) == 1
                        or len(record_api) == 1
                        and len(record_method_annotation_words) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                elif (len(record_api) == 1):
                    api_rec_all = record_api
                    for m in set(experiment_now_api):
                        if m in api_rec_all:
                            api_rec_all.remove(m)
                else:
                    if (len(record_method_annotation_words) == 1):
                        matrix = tl.unfold(tensor, mode=1)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        two = list(
                            similarity.get_topk_method_flat(
                                experiment_now_method_flat,
                                list(record_method_flat_dict.values()), 1, 1,
                                -1, 1).values())[0]
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_flat.index(two), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)
                    elif (len(record_method_flat) == 1):
                        matrix = tl.unfold(tensor, mode=0)
                        nmf = nimfa.Nmf(matrix,
                                        max_iter=200,
                                        rank=round(min(matrix.shape) / 2),
                                        update='euclidean',
                                        objective='fro')
                        nmf_fit = nmf()
                        W = nmf_fit.basis()
                        H = nmf_fit.coef()
                        matrix = np.dot(W, H)
                        rec_combine_api_key = np.argsort(
                            -matrix[record_method_annotation_words.index(one), :]
                        ).tolist()[0]
                        api_rec_all = [
                            record_api_dict[i] for i in rec_combine_api_key
                        ]
                        for m in set(experiment_now_api):
                            if m in api_rec_all:
                                api_rec_all.remove(m)

            else:
                # tensor factorization
                tf.reset_default_graph()
                tensor = tl.tensor(tensor).astype(np.float32)
                data_provider = Provider()
                data_provider.full_tensor = lambda: tensor
                env = Environment(data_provider, summary_path='/tensor/ncp_ml')
                ncp = NCP_BCU(env)
                arg = NCP_BCU.NCP_Args(rank=round(
                    min(len(record_method_annotation_words),
                        len(record_method_flat), len(record_api)) / 2),
                                       validation_internal=1)
                ncp.build_model(arg)
                loss_hist = ncp.train(100)
                factor_matrices = ncp.factors
                full_tensor = tl.kruskal_to_tensor(factor_matrices)
                # one = query_words
                two = list(
                    similarity.get_topk_method_flat(
                        experiment_now_method_flat,
                        list(record_method_flat_dict.values()), 1, 1, -1,
                        1).values())[0]

                rec_combine_api_key = np.argsort(
                    -full_tensor[record_method_annotation_words.index(one),
                                 record_method_flat.index(two), :]).tolist()
                # recommended API list
                api_rec_all = [record_api_dict[i] for i in rec_combine_api_key]
                for m in set(experiment_now_api):
                    if m in api_rec_all:
                        api_rec_all.remove(m)
        # check whether the ground truth appears among the recommendations
        # print(experiment_true_api)
        # print('----------------------------------')
        experiment_true_api = [
            true_api.split('.')[-2] for true_api in experiment_true_api
        ]
        experiment_true_api = removelist(experiment_true_api)
        experiment_now_api = [
            true_api.split('.')[-2] for true_api in experiment_now_api
        ]
        experiment_now_api = removelist(experiment_now_api)
        # drop the experiment_now_api entries
        experiment_true_api = set(experiment_true_api) - set(
            experiment_now_api)
        record_api = [true_api.split('.')[-2] for true_api in record_api]
        record_api = removelist(record_api)
        api_rec_all = [true_api.split('.')[-2] for true_api in api_rec_all]
        api_rec_all = removelist(api_rec_all)
        for m in set(experiment_now_api):
            if m in api_rec_all:
                api_rec_all.remove(m)
        api_rec = api_rec_all[:rec_num]

        pos = -1
        tmp_map = 0.0
        hits = 0.0
        vector = list()
        for i, api in enumerate(api_rec_all[:rec_num]):
            if api in set(experiment_true_api) and pos == -1:
                pos = i + 1
            if api in set(experiment_true_api):
                vector.append(1)
                hits += 1
                tmp_map += hits / (i + 1)
            else:
                vector.append(0)

        tmp_map /= len(set(experiment_true_api))
        tmp_mrr = 0.0
        if pos != -1:
            tmp_mrr = 1.0 / pos
        map += tmp_map
        mrr += tmp_mrr
        ndcg += calculateNDCG.ndcg_at_k(vector[:rec_num], rec_num)
        ground_truth_intersection = set(api_rec).intersection(
            set(experiment_true_api))
        if (len(ground_truth_intersection) > 0):
            recommend_success_num += 1
        precision += len(ground_truth_intersection) / rec_num
        recall += len(ground_truth_intersection) / len(
            set(experiment_true_api))
        writer.writerow([
            experiment_method_annotation, api_rec, ground_truth_intersection,
            experiment_true_api
        ])

    writer.writerow(["recommend_num", "recommend_success_num"])
    writer.writerow([recommend_num, recommend_success_num])
    writer.writerow([
        "mrr/recommend_num", "recommend_num", "map/recommend_num",
        "success_rate@N", "precision@N/recommend_num",
        "recall@N/recommend_num", "ndcg/recommend_num"
    ])
    writer.writerow([
        mrr / recommend_num, recommend_num, map / recommend_num,
        recommend_success_num / recommend_num, precision / recommend_num,
        recall / recommend_num, ndcg / recommend_num
    ])
    csvfile.close()
    end = time.perf_counter()

    print('Running time: %s Seconds' % (end - start))

    logging.info("Finish")
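
The evaluation above calls calculateNDCG.ndcg_at_k on a 0/1 relevance vector. That module is not shown here; a sketch of the standard DCG-based formula it presumably implements (this is the textbook definition, not necessarily the authors' exact code):

import numpy as np

def dcg_at_k(r, k):
    # Discounted cumulative gain over the first k relevance scores.
    r = np.asarray(r, dtype=float)[:k]
    if r.size == 0:
        return 0.0
    return float(np.sum(r / np.log2(np.arange(2, r.size + 2))))

def ndcg_at_k(r, k):
    # Normalize by the DCG of an ideally ordered relevance vector.
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal > 0 else 0.0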
Example #5
mrr = 0.0
tot = 0.0

while True:

    query = input()

    query_words = WordPunctTokenizer().tokenize(query.lower())
    if query_words[-1] == '?':
        query_words = query_words[:-1]
    query_words = [
        SnowballStemmer('english').stem(word) for word in query_words
    ]
    # print(query_words)
    query_matrix = similarity.init_doc_matrix(query_words, w2v)
    query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)
    print(len(questions))
    top_questions = recommendation.get_topk_questions(query, query_matrix,
                                                      query_idf_vector,
                                                      questions, 50, parent)
    recommended_api = recommendation.recommend_api(query_matrix,
                                                   query_idf_vector,
                                                   top_questions, questions,
                                                   javadoc,
                                                   javadoc_dict_methods, -1)

    pos = -1
    for i, api in enumerate(recommended_api):
        print('Rank', i + 1, ':', api)
        recommendation.summarize_api_method(api, top_questions, questions,
                                            javadoc, javadoc_dict_methods)
Example #6
def get_result(query_x):
    query_to = query_x

    query_words = WordPunctTokenizer().tokenize(query_to.lower())
    if query_words[-1] == '?':
        query_words = query_words[:-1]
    query_words = [
        SnowballStemmer('english').stem(word) for word in query_words
    ]

    query_matrix = similarity.init_doc_matrix(query_words, w2v)
    query_idf_vector = similarity.init_doc_idf_vector(query_words, idf)

    top_questions = recommendation.get_topk_questions(query_to, query_matrix,
                                                      query_idf_vector,
                                                      questions, 50, parent)
    recommended_api = recommendation.recommend_api(query_matrix,
                                                   query_idf_vector,
                                                   top_questions, questions,
                                                   javadoc,
                                                   javadoc_dict_methods, -1)

    pos = -1
    # convert the printed results into page data
    five_apis = []
    methods_descriptions_five_texts = []
    titles_dict = {}
    code_snippets_dict = {}

    for i, api in enumerate(recommended_api):
        print('Rank', i + 1, ':', api)
        five_apis.append(api)
        methods_descriptions_pure_text, titles, code_snippets = recommendation.summarize_api_method(
            api, top_questions, questions, javadoc, javadoc_dict_methods)
        methods_descriptions_five_texts.append(methods_descriptions_pure_text)
        tot = 0
        titles_last = {}
        code_snippets_last = []
        for title in titles:
            if tot == 3:
                break
            if len(code_snippets[title[0]]) > 0:
                tot += 1
                title_id = -1
                for question in questions:
                    if title[0] == question.title:
                        title_id = question.id
                        break
                titles_last[title[0]] = title_id
        if tot < 3:
            for title in titles:
                if tot == 3:
                    break
                if len(code_snippets[title[0]]) == 0:
                    tot += 1
                    title_id = -1
                    for question in questions:
                        if title[0] == question.title:
                            title_id = question.id
                            break
                    titles_last[title[0]] = title_id
        tot = 0
        for title in titles:
            if tot == 3:
                break
            if len(code_snippets[title[0]]) > 0:
                tot += 1
                code_snippets_last.append(code_snippets[title[0]][0])
        titles_dict[i] = titles_last
        code_snippets_dict[i] = code_snippets_last

        # methods_descriptions_pure_text, titles, code_snippets to echo in the pages
        # watch recommendation.summarize_api_method
        if i == 4:
            break

    # query = 'Java Fastest way to read through text file with 2 million lines?'
    # query = 'How to round a number to n decimal places in Java'
    # query = 'run linux commands in java code'
    # query = 'How to remove single character from a String'
    # query = 'How to initialise an array in Java with a constant value efficiently'
    # query = 'How to generate a random permutation in Java?'
    # print five_apis, methods_descriptions_five_texts, titles_dict, code_snippets_dict

    return render_template(
        'result.html',
        uquery=query_to,
        result_apis=five_apis,
        methods_descriptions=methods_descriptions_five_texts,
        result_titles=titles_dict,
        result_code=code_snippets_dict)
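
Since get_result returns render_template(...), it is evidently a helper behind a Flask view. A minimal wiring sketch (the route path and form field name are assumptions):

from flask import Flask, request

app = Flask(__name__)

@app.route('/search', methods=['POST'])
def search():
    # Hypothetical route: forward the submitted query to get_result.
    return get_result(request.form['query'])

if __name__ == '__main__':
    app.run(debug=True)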