def descript(query_decp, source_category, except_files=None, extend=False, pool_size=32):
    """
    Match the query description against the description database (takes ~1 minute).
    :param query_decp: description matrix; example line: xml_file_name, class_name, element_name
    :param except_files: file-name keywords to exclude; accepts a string or a list/set of strings
    :param pool_size: size of the parallel worker pool
    :return: overall similarity between the source app and every app in the
             database, sorted by similarity in descending order; used for app search
    """
    query_decp = nlp_util.process_xsv(query_decp)
    if extend:
        src_dir = work_path.in_project('./model/data/description_extend_all')
    else:
        src_dir = work_path.in_project('./model/data/description')
    logger = logging.getLogger("StreamLogger")
    logger.debug("description source dir: %s", src_dir)
    file_list = os.listdir(src_dir)
    file_list = [os.path.join(src_dir, f) for f in file_list]
    if except_files is not None:
        tmp = []
        rms = []
        if isinstance(except_files, str):
            for i in file_list:
                if except_files not in i:
                    tmp.append(i)
                else:
                    rms.append(i)
        elif isinstance(except_files, (list, set)):
            except_files = set(except_files)
            for i in file_list:
                if any(j in i for j in except_files):
                    rms.append(i)
                else:
                    tmp.append(i)
        logger.debug(pp.pformat(rms))
        file_list = tmp
    logger.debug(pp.pformat(file_list))
    scan_output = _scan_match(source_category, query_decp, file_list,
                              match_name.ngram_compare, [1, 0.5, 0.5],
                              threshold=0.7, pool_size=pool_size)
    # Overall similarity between the source app and every app in the database,
    # sorted by similarity in descending order. Each entry is a tuple:
    #   (
    #       str   reference-app description file name,
    #       float app-level similarity,
    #       list  component similarities: [(query component, reference component, similarity)]
    #   )
    logger.debug(pp.pformat(util.get_col(scan_output, [0, 1])))
    return scan_output
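# --- Usage sketch for descript() (hypothetical call site): the query matrix
# --- shape follows the docstring, one row per UI element; the category string
# --- and excluded keyword are made-up values, and this assumes process_xsv
# --- accepts an in-memory row matrix.
def _descript_example():
    query = [
        ["activity_main.xml", "android.widget.Button", "login"],
        ["activity_main.xml", "android.widget.EditText", "password"],
    ]
    results = descript(query, "finance", except_files=["com_example_src"], pool_size=16)
    for file_name, app_sim, _components in results[:5]:
        print(file_name, app_sim)  # top-5 most similar reference apps
    return results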
def get_concern_label():
    c_label = set()
    with open(work_path.in_project("./model/conf/concern_label.dat"), 'r', encoding='utf8') as f:
        for row in f:
            tmp = row.strip()
            if tmp != "":
                c_label.add(tmp)
    return c_label
def get_stops():
    stops = set(punctuation)
    with open(work_path.in_project("./model/conf/stopwords.dat"), 'r', encoding='utf8') as f:
        for row in f:
            tmp = row.strip()
            if tmp != "":
                stops.add(tmp)
    return stops
def get_hot_keys():
    hot_k = set()
    with open(work_path.in_project("./model/conf/hotkey.dat"), 'r', encoding='utf8') as f:
        for row in f:
            tmp = row.strip()
            if tmp != "":
                hot_k.add(stem_word(tmp))
    return hot_k
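# --- Usage sketch (hypothetical): the three loaders above are plain config
# --- readers; a typical consumer filters a token stream like this. stem_word()
# --- is the same helper get_hot_keys() already relies on.
def _filter_tokens_example(tokens):
    stops = get_stops()
    hot_keys = get_hot_keys()
    kept = [t for t in tokens if t not in stops]             # drop stopwords/punctuation
    flagged = [t for t in kept if stem_word(t) in hot_keys]  # stemmed hot-key hits
    return kept, flagged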
def except_list_build_helper():
    src_dir = work_path.in_project('./model/data/description')
    file_list = os.listdir(src_dir)
    file_list = [util.bare_name(f) for f in file_list]
    rt = []
    for i, name in enumerate(file_list):
        rt.append({
            'id': f'cf_{i + 1}',
            'text': " ".join(name.split('_')),
            'val': name,
        })
    return rt
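# --- Usage sketch (hypothetical): each entry from except_list_build_helper()
# --- looks like {'id': 'cf_1', 'text': 'com example app', 'val': 'com_example_app'}
# --- (actual values depend on the files under model/data/description). The
# --- 'val' fields a user ticks can be fed straight back into descript():
def _selected_except_files_example(selected_ids):
    options = except_list_build_helper()
    return [o['val'] for o in options if o['id'] in selected_ids]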
from model import issuedb as idb
import os
from model import util, work_path

__GENERATE__ = False

if __GENERATE__:
    SRC_DIR = 'tsv/'
    TEST_DIR = 'tsv_test/'
    TSV_FILE = work_path.in_project('./model/conf/tab_url.tsv')
    __data_tsv = util.read_tsv(TSV_FILE)


def generate_lookup_table():
    db_driver = idb.ISSuedb()
    output = db_driver.db_retrieve(
        "select name from sqlite_master where type='table' order by name;")
    table_dict = {i[0].replace("$", "_"): i[0] for i in output}
    file_list = os.listdir(SRC_DIR)
    file_list = [os.path.join(SRC_DIR, f) for f in file_list]
    file_list_test = os.listdir(TEST_DIR)
    file_list_test = [os.path.join(TEST_DIR, f) for f in file_list_test]
    files = file_list + file_list_test
    files_dict = {i: False for i in files}
    reload = util.Reload(TSV_FILE)
    for item in table_dict:
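# --- Note on table_dict above (illustrative, hypothetical table name): a
# --- SQLite table named 'owner$repo' is keyed as 'owner_repo', so the key is
# --- safe to use in file names while still mapping back to the real table
# --- name for SQL queries.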
class ISSuedb:
    def __init__(self, filepath=work_path.in_project('issue.db')):
        logger = logging.getLogger("StreamLogger")
        logger.info("DB file location: %s", filepath)
        self.conn = sqlite3.connect(filepath)
        self.cursor = self.conn.cursor()
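# --- Usage sketch (hypothetical query; db_retrieve is the accessor already
# --- used by generate_lookup_table() above):
# db = ISSuedb()                    # opens <project>/issue.db by default
# tables = db.db_retrieve(
#     "select name from sqlite_master where type='table' order by name;")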
import copy
import logging
from model import util, work_path

CONF_JSON = util.load_json(work_path.in_project('./model/conf/rank_coef.json'))
SCORE_COEF = CONF_JSON["data"]
MAX_VAL = CONF_JSON["scale_max"]


def get_key_sea_count(all_key, text, unique=False):
    # When unique is set, count each token at most once per document.
    f_c = set(text) if unique else text
    count_dict = {}
    for k in all_key:
        count_dict[k] = 0
    for k in f_c:
        if k in all_key:
            count_dict[k] += 1
    count_dict["__corpus_len__"] = len(f_c)
    return copy.deepcopy(count_dict)


def get_key_sea_count_corpus(all_key, corpus, unique=False):
    key_count = []
    if not isinstance(all_key, set):
        all_key = set(all_key)
    for c in corpus:
        key_count.append(get_key_sea_count(all_key, c, unique))
    return key_count
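# --- Usage sketch (hypothetical data) for the counters above:
def _key_count_example():
    keys = {"crash", "login"}
    corpus = [
        ["app", "crash", "on", "login", "crash"],
        ["ui", "freeze"],
    ]
    counts = get_key_sea_count_corpus(keys, corpus)
    # counts[0] == {'crash': 2, 'login': 1, '__corpus_len__': 5}
    # counts[1] == {'crash': 0, 'login': 0, '__corpus_len__': 2}
    return counts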
import os
import logging
from datetime import datetime, timezone, timedelta
from urllib.parse import quote, unquote

from flask import Flask
from werkzeug.middleware.shared_data import SharedDataMiddleware

from model import work_path, util
import api
from tasks import iss_query, job_ready_byid, job_get_byid

ALLOWED_EXTENSIONS = {'zip'}
local_tz = timezone(timedelta(hours=8))

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = work_path.get_upload()
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
app.config['DOWNLOAD_FOLDER'] = work_path.in_project('./downloads')
os.makedirs(app.config['DOWNLOAD_FOLDER'], exist_ok=True)
app.logger.setLevel(logging.INFO)
logger = app.logger


def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS


def secure_filename(filename):
    # Percent-encode the stem so non-ASCII upload names stay file-system safe,
    # keeping the original extension.
    filename, file_extension = os.path.splitext(filename)
    return quote(filename) + file_extension
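# --- Sanity-check sketch for the two helpers above (hypothetical file names):
def _upload_helpers_example():
    assert allowed_file("report.zip")         # 'zip' is whitelisted
    assert not allowed_file("report.tar.gz")  # 'gz' is not
    # Non-ASCII stems come back percent-encoded, extension intact:
    assert secure_filename("月报 v2.zip") == "%E6%9C%88%E6%8A%A5%20v2.zip"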