Example #1
def query_issue(scan_output, max_depth=4):
    """
    根据 已经排序过的app相似度降序列表 scan_output 搜索所有可能 issue
    ~1分钟得出结果
    :param scan_output: 格式参考 descript()函数的输出
    :param max_depth: 限制搜索深度,取最相似的前几个
    :return: 所有查询
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    sql = """select issue_num, comments, state, title, body, commit_id, labels from {}
                    order by length(body) desc"""
    overall_table = {}
    # iterate over all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]

        tab_name = table2tsv.file2table(app)
        one_dict['data'] = []
        one_dict['keys'] = []

        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))
        head = ["issue_num", "comments", "state", "title", "body", "commit_id", "labels"]
        f_output = issuedb.retrieve_formatter(head, output)

        title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('body'))
        label_list = util.get_col(output, head.index('labels'))
        reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(title_list=title_list,
                                 body_list=body_list,
                                 label_list=label_list,
                                 reply_list=reply_list,
                                 keys_sea=keys_sea)

        for k in keys_sea:
            # flatten each key group (a tuple of token lists) into one string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)

            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
        logger.debug(pp.pformat(overall_table))
        logger.debug("#" * 50)
    return overall_table
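
A minimal usage sketch (not part of the original example): the CSV path and category are placeholders, and descript() is the helper shown in Example #2 below.

def _demo_query_issue():
    # Hypothetical driver; the path and category are stand-ins.
    query_matrix = util.read_csv("model/data/description/com.example.app.csv")
    scan_output = descript(query_matrix, source_category="Productivity")
    issues = query_issue(scan_output, max_depth=4)
    for tab_name, entry in issues.items():
        # entry['sim'] is the app similarity; entry['data'] holds the top-ranked issues
        print(tab_name, entry['sim'], len(entry['data']))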
Example #2
def descript(query_decp, source_category, except_files=None, extend=False, pool_size=32):
    """
    生成描述文件
    ~1分钟得出结果
    :param query_decp: 描述文件矩阵
    example line: xml_file_name, class_name, element_name
    :param except_files: 排除文件关键词,接受字符串或字符串数组
    :param pool_size: 并行池大小
    :return: a tuple. 得到src app与 数据库每个app的总相似度,按照相似度降序排列. 用作 搜索 app
    """
    query_decp = nlp_util.process_xsv(query_decp)
    if extend:
        src_dir = work_path.in_project('./model/data/description_extend_all')
    else:
        src_dir = work_path.in_project('./model/data/description')
    print("PATH!!!! {}".format(src_dir))
    logger = logging.getLogger("StreamLogger")
    file_list = os.listdir(src_dir)
    file_list = [os.path.join(src_dir, f) for f in file_list]

    if except_files is not None:
        tmp = []
        rms = []
        if isinstance(except_files, str):
            for f in file_list:
                if except_files not in f:
                    tmp.append(f)
                else:
                    rms.append(f)
        elif isinstance(except_files, (list, set)):
            except_files = set(except_files)
            for f in file_list:
                if any(j in f for j in except_files):
                    rms.append(f)
                else:
                    tmp.append(f)
        logger.debug(pp.pformat(rms))
        file_list = tmp  # reassign only when a filter was actually applied
    logger.debug(pp.pformat(file_list))

    scan_output = _scan_match(source_category, query_decp, file_list, match_name.ngram_compare, [1, 0.5, 0.5],
                              threshold=0.7,
                              pool_size=pool_size)
    # scan_output: the overall similarity between the source app and every app
    # in the database, sorted by similarity in descending order.
    # tuple(
    #     str   "reference app description file name",
    #     float "app similarity",
    #     list  "component similarities of the reference app"
    #           [(query app component, reference app component, similarity)]
    # )
    logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))
    return scan_output
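
For reference, a minimal illustration of the scan_output shape described in the comments above; every value is made up, and the components are shown as token lists to match how query_issue() flattens them.

scan_output_shape_example = [
    (
        "com.example.notes.csv",                       # reference app description file name
        0.83,                                          # overall app similarity
        [(["note", "list"], ["entry", "list"], 0.91),  # (query component, reference component, similarity)
         (["editor", "view"], ["edit", "screen"], 0.75)],
    ),
]
app_file, app_sim, component_scores = scan_output_shape_example[0]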
Example #3
def test():
    # import search_rank
    # import matplotlib.pyplot as plt
    # import numpy as np
    #
    # bins = np.linspace(0, 1, 100)
    # plt.hist(util.get_col(list_score, 2), bins, density=True, histtype='step', cumulative=-1, label='Empirical')
    # plt.show()

    path = "tsv/"
    filelist = os.listdir(path)
    filelist.sort(key=lambda x: x.lower())
    out_dict = dict()

    count = 0
    for file in filelist:
        count += 1
        full_path = os.path.join(path, file)
        print(full_path)
        tmp_out = util.read_tsv(full_path)
        out_dict[file] = nlp_util.process_tsv(tmp_out)

    print("file count", count)

    count = 0
    score_distribute_list = []
    for i in range(len(filelist)):
        i_file = filelist[i]
        for j in range(len(filelist)):
            if i == j:
                print("Ignore same file", i_file)
                continue
            count += 1
            j_file = filelist[j]
            name = "{}^{}".format(i_file, j_file)
            if len(out_dict[i_file]) == 0 or len(out_dict[j_file]) == 0:
                print("EMPTY", name)
                continue
            list_score = weight_compare_list(out_dict[i_file],
                                             out_dict[j_file],
                                             ngram_compare)
            score_col = util.get_col(list_score, 2)

            score_distribute_list.append((name, copy.deepcopy(score_col)))
            print("ADD", count, name)

    util.save_json(score_distribute_list, "score_distribute_list.json")
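
A sketch of how the saved distribution could be inspected afterwards, following the commented-out matplotlib block at the top of test(); it assumes util.save_json wrote plain JSON.

import json

import matplotlib.pyplot as plt
import numpy as np

def plot_score_distribution(path="score_distribute_list.json"):
    # Load the (name, score_column) pairs saved by test() and draw the
    # reverse cumulative histogram hinted at in the commented-out block above.
    with open(path, encoding="utf-8") as f:
        score_distribute_list = json.load(f)
    bins = np.linspace(0, 1, 100)
    for name, scores in score_distribute_list[:5]:  # a few pairs keep the plot readable
        plt.hist(scores, bins, density=True, histtype='step', cumulative=-1, label=name)
    plt.legend(fontsize=6)
    plt.show()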
Example #4
def query_issue(scan_output, max_depth=4):
    """
    根据 已经排序过的app相似度降序列表 scan_output 搜索所有可能 issue
    ~1分钟得出结果
    :param scan_output: 格式参考 descript()函数的输出
    :param max_depth: 限制搜索深度,取最相似的前几个
    :return: 所有查询
    """
    # TODO: where do the query keys come from?
    logger = logging.getLogger("StreamLogger")
    rdb = issuedb.ISSuedb()
    # sql = """select review_id,content,bold,star_num,helpful_num,reply_content from {}
    #                 order by length(content) desc"""
    sql = """select review_id,content,star_num from {}
                       order by length(content) desc"""
    overall_table = {}
    # iterate over all related apps and their items
    for i in range(min(len(scan_output), max_depth)):
        one_dict = {}
        app = scan_output[i][0]
        one_dict['sim'] = scan_output[i][1]  # similarity_score
        tab_name = table2tsv.file2table(app)
        logger.debug(tab_name)
        one_dict['data'] = []
        one_dict['keys'] = []

        score_list = scan_output[i][2]
        keys_sea = _filter_search_keys(score_list, threshold=0.7)
        logger.debug(f"{app}\t{tab_name}\tsimilar keys length: {len(keys_sea)}")

        output = rdb.db_retrieve(sql.format(tab_name))  # output is a list of tuples

        # head = ["review_id", "content", "bold", "star_num", "helpful_num", "reply_content"]
        head = ["review_id", "content", "star_num"]
        f_output = issuedb.retrieve_formatter(head, output)

        # title_list = util.get_col(output, head.index('title'))
        body_list = util.get_col(output, head.index('content'))

        star_list = util.get_col(output, head.index('star_num'))
        # reply_list = util.get_col(output, head.index('reply_content'))
        # bold_list = util.get_col(output, head.index('bold'))
        # label_list = util.get_col(output, head.index('labels'))
        # reply_list = util.get_col(output, head.index('issue_num'))
        pre_calc_val = _pre_calc(body_list=body_list, keys_sea=keys_sea)

        for k in keys_sea:
            # flatten each key group (a tuple of token lists) into one string
            keys = " ".join(" ".join(part) for part in k)
            ess_keys = nlp_util.stem_sentence(keys)
            tmp = search_rank.sort_candidate_seq(f_output, ess_keys, pre_calc_val)
            leng = min(3, len(tmp))
            one_dict['keys'].extend([ess_keys] * leng)
            one_dict['data'].extend(tmp[:leng])
        overall_table[tab_name] = one_dict
        logger.debug(pp.pformat(overall_table))
        logger.debug("#" * 50)
    return overall_table
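
_filter_search_keys is not shown in either variant; below is a plausible minimal sketch inferred only from its call sites, so the tuple layout is an assumption, not the project's actual code.

def _filter_search_keys_sketch(score_list, threshold=0.7):
    # Assumed layout: score_list is [(query_tokens, ref_tokens, similarity), ...],
    # matching the component-similarity list produced by descript().
    # Pairs below the threshold are dropped; each kept entry is a pair of token
    # lists, which query_issue() then flattens into one search string.
    return [(query_tokens, ref_tokens)
            for query_tokens, ref_tokens, sim in score_list
            if sim >= threshold]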
Example #5
    for k in count_dict:
        score += count_dict[k]
    return score



if __name__ == '__main__':
    eee = "it.feio.android.omninotes"
    s = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time()))
    test = util.read_csv("model/data/description_extend_all/"+eee+".csv")
    print("begin search similar apps")
    scan_output = descript(test, source_category="Productivity",
                           except_files=eee, extend=True, pool_size=32)  # get similar apps
    print("begin rank reviews")
    rank_result = rank_review(scan_output)
    print(util.get_col(scan_output, [0, 1]))
    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    # 1. create the output file object
    z = open(csv_path + eee + now + ".csv", 'w', encoding='utf-8', newline='')
    # 2. build a csv writer on top of the file object
    csv_writer = csv.writer(z)
    # 3. write the header row
    csv_writer.writerow(["app_id", "score", "star_num", "helpful_num", "review_content"])
    # 4. write one row per ranked review
    for i in rank_result:
        csv_writer.writerow([i[0], i[1], i[2].star_num, i[2].helpful_num, i[2].content])
    # 5. close the file
    z.close()
    print("end.")
    print(s)
    print(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time())))
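
The same export can be written with a context manager so the file is closed even when a row write raises; a sketch reusing the names from the block above (csv is already imported by this module).

def write_rank_result(path, rank_result):
    # Equivalent CSV export; each rank_result row is (app_id, score, review),
    # where review carries star_num, helpful_num and content, as used above.
    with open(path, 'w', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(["app_id", "score", "star_num", "helpful_num", "review_content"])
        for app_id, score, review in rank_result:
            writer.writerow([app_id, score, review.star_num,
                             review.helpful_num, review.content])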