def descript(query_decp, source_category, except_files=None, extend=False, pool_size=32):
    """
    Score a query description matrix against the stored reference description files.

    According to the original author's notes the scan takes roughly one minute
    and its result is used to search for similar apps.

    :param query_decp: description matrix of the query app; example row:
        xml_file_name, class_name, element_name. It is passed through
        ``nlp_util.process_xsv`` before matching.
    :param source_category: forwarded unchanged as the first argument of
        ``_scan_match`` (its meaning is not visible in this function).
    :param except_files: exclusion keyword(s): a single string, or any
        iterable of strings (list, set, tuple, ...). A reference file is
        skipped when any keyword occurs as a substring of its joined path.
        ``None`` (the default) disables filtering.
    :param extend: when truthy, reference files are read from
        ``./model/data/description_extend_all`` instead of
        ``./model/data/description``.
    :param pool_size: parallel pool size, forwarded to ``_scan_match``.
    :return: the value returned by ``_scan_match``. Per the original notes,
        this holds one entry per reference app, sorted by descending
        similarity, each shaped like
        ``(reference description file name: str,
        app similarity: float,
        component similarities: list of
        (query component, reference component, similarity))``.
    :raises TypeError: if ``except_files`` is neither ``None``, a string,
        nor an iterable of hashable values.
    """
    query_decp = nlp_util.process_xsv(query_decp)

    if extend:
        src_dir = work_path.in_project('./model/data/description_extend_all')
    else:
        src_dir = work_path.in_project('./model/data/description')
    print("PATH!!!! {}".format(src_dir))

    logger = logging.getLogger("StreamLogger")
    file_list = [os.path.join(src_dir, name) for name in os.listdir(src_dir)]

    if except_files is not None:
        # A lone string is one keyword; anything else is treated as a
        # collection of keywords. Previously only exact `list`/`set` types
        # were recognised, and any other type (e.g. a tuple) silently
        # emptied the file list.
        if isinstance(except_files, str):
            keywords = {except_files}
        else:
            keywords = set(except_files)

        kept = []
        removed = []
        for path in file_list:
            # NOTE(review): the keyword is matched against the joined path,
            # directory included, so a keyword that occurs in `src_dir`
            # excludes every file - confirm this is intended.
            if any(keyword in path for keyword in keywords):
                removed.append(path)
            else:
                kept.append(path)

        logger.debug(pp.pformat(removed))
        file_list = kept

    logger.debug(pp.pformat(file_list))

    scan_output = _scan_match(
        source_category,
        query_decp,
        file_list,
        match_name.ngram_compare,
        [1, 0.5, 0.5],
        threshold=0.7,
        pool_size=pool_size,
    )
    # Log only columns 0 and 1 of the result (file name and app similarity,
    # per the original notes) to keep the debug output short.
    logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))
    return scan_output
def _single_scan_helper(arg):
    """
    Score one reference description file against the query UI list.

    :param arg: a 6-item sequence
        ``(index, file_path, sample_ui_list, comp_func, weight_list, threshold)``
        packed into a single argument (presumably so the function can be
        mapped over a worker pool - the caller is not visible here).
    :return: ``(file_path, score, score_distribution_list)`` where
        ``score_distribution_list`` is the output of
        ``match_name.weight_compare_list`` (or ``[]`` when the file yields no
        rows) and ``score`` is ``match_name.similar_index`` computed over it.
    """
    index, file_path, sample_ui_list, comp_func, weight_list, threshold = arg

    log = logging.getLogger("StreamLogger")
    log.debug(file_path)

    # Read the reference file and normalise it in one step.
    reference_rows = nlp_util.process_xsv(util.read_csv(file_path))

    if len(reference_rows) > 0:
        distribution = match_name.weight_compare_list(
            sample_ui_list, reference_rows, comp_func, weight_list
        )
    else:
        # Nothing to compare against; still score the empty distribution.
        log.debug(f"EMPTY {file_path}")
        distribution = []

    score = match_name.similar_index(distribution, threshold, col_index=2, rate=True)

    log.debug(f"ADD {index} {file_path}")
    return (file_path, score, distribution)