def words_black_list_process_new(spark, df_primary):
    """Run the ``black`` UDF over ``describekey`` and pass the result to the white-list step.

    Args:
        spark: Object providing ``udf.register`` and ``sql``.
        df_primary: Input DataFrame; it is registered as the temporary table
            ``tb_black_content`` and must expose every column named in the
            select below.

    Returns:
        ``df_primary`` unchanged when the configured black-list file, or the
        lines read from it, are null according to ``common.data_is_NULL``.
        Otherwise the value returned by ``words_white_list_process_new``.
        Previously this path fell off the end of the function and returned
        ``None`` even though both early exits return a DataFrame.
    """
    blackListFile = issueconstruction.g_word_segment_black_files
    if common.data_is_NULL(blackListFile):
        return df_primary
    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return df_primary

    df_primary.registerTempTable("tb_black_content")
    spark.udf.register("black", black)
    df_black = spark.sql(
        "select OWNER,PMAnalysis,Rname,Vname,adminAdvice,approverComments,att_file_num1,"
        "att_file_num3,att_img_num1,att_img_num3,baseline,category,categoryStr,causeAnalysis,"
        "creationdate,currentNode,currentPerson,cut_words,defectModifier,defectNo,defect_ODCSeverity,"
        "developerComments,issueProcessor,lastProcessed,lastupdateTimestamp,lengthofstay,nodeCode,"
        "nodeName,operation_type,productLineName,productName,refresh_timestamp,solution,status,"
        "submitBy,submitDate,suspendReason,testReport,testTool,testToolStr,testerComments,"
        "name,describe,detail,black(describekey) as describekey,detailkey from tb_black_content"
    )
    # White-list filtering; propagate its result instead of dropping it.
    return words_white_list_process_new(spark, df_black)
def white(spark, df_black):
    """Map every row of ``df_black`` through ``match_rule`` with the white list.

    Args:
        spark: Object providing ``createDataFrame``.
        df_black: DataFrame whose ``rdd`` rows are handed to ``match_rule``.

    Returns:
        ``df_black`` unchanged when no white-list file is configured, ``[]``
        when the file yields no entries, otherwise a new DataFrame with the
        nullable string columns ``name`` and ``detailkey``.
    """
    white_path = issueconstruction.g_word_segment_white_file
    if common.data_is_NULL(white_path):
        return df_black

    allowed_words = common.read_file_lines_to_list(white_path)
    if common.data_is_NULL(allowed_words):
        return []

    def apply_rule(row):
        # The white list is captured by the closure that is shipped to the RDD map.
        return match_rule(allowed_words, row)

    result_schema = StructType([
        StructField("name", StringType(), True),
        StructField("detailkey", StringType(), True)
    ])
    matched_rows = df_black.rdd.map(apply_rule)
    return spark.createDataFrame(matched_rows, result_schema)
def words_black_list_process(blackListFile, words):
    """Return ``words`` without black-listed entries and without duplicates.

    Args:
        blackListFile: Path handed to ``common.read_file_lines_to_list``.
        words: Iterable of words to filter (assumed hashable, e.g. ``str``).

    Returns:
        ``words`` itself when the path or the loaded black list is null
        according to ``common.data_is_NULL``; otherwise a new list holding the
        surviving words in first-seen order.
    """
    if common.data_is_NULL(blackListFile):
        return words
    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return words

    # Compare against the loaded entries. The previous code tested
    # ``d in blackListFile``, i.e. a substring match on the file *path*.
    blocked = set(blackList)
    seen = set()
    retList = []
    for d in words:
        if d in blocked or d in seen:
            continue
        seen.add(d)
        retList.append(d)
    return retList
def discovery_from_delhtmllabel(contentList):
    """Strip HTML markup from each record's ``detail`` and forward the cleaned records.

    For every record a dict with ``name`` and the plain-text ``detail`` is
    built: the HTML is parsed with BeautifulSoup (``html5lib``), newlines and
    tabs are removed and remaining whitespace runs collapse to one space.
    A record whose ``detail`` is missing, empty, or yields no text gets ``' '``.

    Args:
        contentList: Iterable of dict-like records providing ``get``.

    Returns:
        Whatever ``generate_output_file_overwrite(contentList, list_pm)`` returns.
    """
    if common.data_is_NULL(contentList):
        # NOTE(review): a null input is only logged; processing continues as
        # before, so a ``None`` argument still fails at the loop below.
        logging.info("contentList is null!")

    list_pm = []
    for param_dict in contentList:
        # Build a fresh dict per record. The previous shared dict let a record
        # that set no ``detail`` silently inherit the preceding record's text.
        dict_add = {"name": param_dict.get("name")}
        for pms in ["detail"]:
            pm = param_dict.get(pms)
            cleaned = ' '
            if pm:  # also guards against a missing (None) field
                src_soup_text = BeautifulSoup(pm, 'html5lib').get_text()
                if src_soup_text:
                    cleaned = src_soup_text.replace('\n', '').replace('\t', '')
                    cleaned = re.sub('\\s+', ' ', cleaned)
            dict_add[pms] = cleaned
        list_pm.append(dict_add)

    return generate_output_file_overwrite(contentList, list_pm)
def discovery_from_merge(list_thr):
    """Merge each record's ``describekey``/``detailkey`` values and rebuild the output records.

    For every record in ``list_thr`` the values stored under ``describekey``
    and ``detailkey`` are gathered, flattened with ``flat`` and de-duplicated
    through ``set``, then handed to ``search_update`` keyed by the first
    element of the record's ``name``. Every dict returned by ``search_update``
    is copied into a new dict with the keys ``name``, ``describe``, ``detail``,
    ``describekey``, ``detailkey`` and ``searchkey``.

    NOTE(review): the source reached review with its indentation stripped; the
    nesting below (``search_update`` and the result loop running once per
    record) is a reconstruction -- confirm against the original file.
    """
    if common.data_is_NULL(list_thr):
        # Only logged; execution continues with the loop below.
        logging.info("数据合并失败")
    list_merge = []
    for param in list_thr:
        kdddict = {}
        destList = []
        destname = param["name"]
        for keys in param.keys():
            if keys == "describekey" or keys == "detailkey":
                # NOTE(review): "\\[" is a backslash followed by "[", not a
                # bare bracket -- verify this is the intended marker.
                if "\\[" in param[keys] or "\\]" in param[keys]:
                    destList = destList + common.csv_list_str_2_list(
                        param[keys])
                else:
                    destList.append(param[keys])
        # Nested lists are flattened into a one-dimensional, de-duplicated list.
        kdddict.update({destname[0]: list(set(flat(destList)))})
        list_return = search_update(kdddict, list_thr)
        for dict_par in list_return:
            # ``name`` is indexed with [0], so it is expected to be a sequence here.
            dict = {
                "name": dict_par["name"][0],
                "describe": dict_par["describe"],
                "detail": dict_par["detail"],
                "describekey": dict_par["describekey"],
                "detailkey": dict_par["detailkey"],
                "searchkey": dict_par["searchkey"]
            }
            list_merge.append(dict.copy())
    return list_merge
def run(srcString, bUseCustomDict=False):
    """Segment ``srcString`` with jieba and return its distinct, filtered tokens.

    Args:
        srcString: Text to segment.
        bUseCustomDict: When true, the user dictionary at
            ``g_word_segment_local_file`` is loaded into jieba first.

    Returns:
        ``[]`` when ``common.data_is_NULL(srcString)`` is true. Otherwise the
        tokens of the lower-cased, ``common.C_trans2_E``-converted text in
        first-seen order, leaving out duplicates, entries of ``g_drpoList``
        and tokens shorter than two characters.
    """
    if common.data_is_NULL(srcString):
        return []

    if bUseCustomDict:
        # Load the user's custom word dictionary before segmenting.
        jieba.load_userdict(g_word_segment_local_file)
        jieba.initialize()

    # Pre-process the text, then cut it into a token list.
    normalized = common.C_trans2_E(srcString).lower()
    unique_tokens = []
    for token in jieba.lcut(normalized):
        keep = (
            len(token) >= 2
            and token not in g_drpoList
            and token not in unique_tokens
        )
        if keep:
            unique_tokens.append(token)
    return unique_tokens
def __init__(self, operator, filterList):
    """Store ``operator`` and the attribute dicts of the given filter objects.

    Args:
        operator: Stored unchanged as ``self.operator``.
        filterList: Iterable of objects; the ``__dict__`` of each one (the
            dict object itself, not a copy) is appended to ``self.filterList``.
            A value that ``common.data_is_NULL`` reports as null leaves
            ``self.filterList`` empty.
    """
    self.operator = operator
    self.filterList = []
    if common.data_is_NULL(filterList):
        return
    self.filterList.extend(entry.__dict__ for entry in filterList)
def data_clean(s):
    """Return ``str(s)`` with quotes, commas and line breaks normalised.

    Double quotes become single quotes, ASCII commas become ``。`` and
    ``\\n`` / ``\\r`` are removed. An empty string is returned when
    ``common.data_is_NULL`` reports the stringified value as null.
    """
    text = str(s)
    if common.data_is_NULL(text):
        return ""
    # One translation pass; none of the replacement characters is itself a
    # source character, so this equals applying the replacements one by one.
    substitutions = str.maketrans({'"': "'", ',': "。", '\n': None, '\r': None})
    return text.translate(substitutions)
def get(label):
    """Return the ``g_entity_dict`` entry for ``label``, requesting it when empty.

    An empty dict is inserted for a label that is not cached yet. When the
    entry is null according to ``common.data_is_NULL``, ``req([label])`` is
    called before the entry is returned.
    """
    cached = g_entity_dict.setdefault(label, {})
    if common.data_is_NULL(cached):
        req([label])
    # Look the entry up again so the value returned is the current one.
    return g_entity_dict[label]
def words_black_list_process_new(spark, df_primary):
    """Run the ``black`` UDF over ``describekey`` and pass the result to the white-list step.

    Args:
        spark: Object providing ``udf.register`` and ``sql``.
        df_primary: Input DataFrame; it is registered as the temporary table
            ``tb_black_content`` and must expose ``name``, ``describe``,
            ``detail``, ``describekey`` and ``detailkey``.

    Returns:
        ``df_primary`` unchanged when the configured black-list file, or the
        lines read from it, are null according to ``common.data_is_NULL``.
        Otherwise the value returned by ``words_white_list_process_new``.
        Previously this path fell off the end of the function and returned
        ``None`` even though both early exits return a DataFrame.
    """
    blackListFile = issueconstruction.g_word_segment_black_files
    if common.data_is_NULL(blackListFile):
        return df_primary
    blackList = common.read_file_lines_to_list(blackListFile)
    if common.data_is_NULL(blackList):
        return df_primary

    df_primary.registerTempTable("tb_black_content")
    spark.udf.register("black", black)
    df_black = spark.sql(
        "select name,describe,detail,trim(black(describekey)) as describekey,trim(detailkey) as detailkey from tb_black_content"
    )
    # White-list filtering; propagate its result instead of dropping it.
    return words_white_list_process_new(spark, df_black)
def words_white_list_process(whiteListFile, words):
    """Keep only the words that appear in the white list, without duplicates.

    Args:
        whiteListFile: Path handed to ``common.read_file_lines_to_list``.
        words: Iterable of words to filter.

    Returns:
        ``words`` itself when no white-list file is given, ``[]`` when the
        file yields no entries, otherwise a new list of the white-listed
        words in first-seen order.
    """
    if common.data_is_NULL(whiteListFile):
        return words

    allowed = common.read_file_lines_to_list(whiteListFile)
    if common.data_is_NULL(allowed):
        return []

    kept = []
    for word in words:
        if not str(word):
            # Words whose string form is empty are skipped.
            continue
        if word in allowed and word not in kept:
            kept.append(word)
    return kept
def word_primary_key_align(words, isExpend=True):
    """Align words with the values stored in ``g_primary_key_align_dict``.

    The dictionary is loaded through ``load_eneity_align_dict(g_align_label)``
    when it is still null; if it remains null, ``words`` is returned as is.

    Args:
        words: Iterable of words.
        isExpend: When true, a word with a differing mapped value is kept and
            the mapped value is inserted in front of it; when false, the word
            is replaced by the mapped value.

    Returns:
        A new list of aligned words, or ``words`` itself when no alignment
        dictionary is available.
    """
    global g_primary_key_align_dict
    if common.data_is_NULL(g_primary_key_align_dict):
        g_primary_key_align_dict = load_eneity_align_dict(g_align_label)
    if common.data_is_NULL(g_primary_key_align_dict):
        return words

    aligned = []
    for word in words:
        # Unmapped words map to themselves and are therefore appended once.
        mapped = g_primary_key_align_dict.get(word, word)
        if mapped != word:
            if isExpend:
                aligned.append(mapped)
            else:
                word = mapped
        aligned.append(word)
    return aligned
def req(keyList):
    """Query the ``name``/``alias`` value maps for each label and cache them.

    For every label in ``keyList`` a ``valueMap('name','alias')`` query is
    issued through ``query.get_common``. Each returned entry is parsed with
    regular expressions and stored as ``g_entity_dict[label][name] = alias``,
    where ``alias`` is the parsed alias list, or the empty ``re.findall``
    result when the entry carries no alias. Labels whose query fails or
    returns no data are logged and skipped.

    Args:
        keyList: Iterable of label names.

    Returns:
        None. The result is written into ``g_entity_dict``.
    """
    cache_lock.acquire()
    try:
        for k in list(keyList):
            cmd = "g.V().hasLabel('{0}').valueMap('name','alias')".format(k)
            ret, result = query.get_common(cmd)
            if ret is not True:
                logging.info("get <%s> entity is NULL" % k)
                continue
            data = result.get('data')
            if common.data_is_NULL(data):
                logging.info("get <%s> entity is NULL" % k)
                continue

            # Tolerate labels that were never seeded by get().
            entities = g_entity_dict.setdefault(k, {})
            for d in data:
                # The text following the name list depends on whether an
                # alias list comes after it.
                if "alias" in d:
                    name = re.findall(r"name=\[(.+?)\],", d)
                else:
                    name = re.findall(r"name=\[(.+?)\]\}", d)
                if common.data_is_NULL(name):
                    continue
                name = common.list_str_2_list(name[0], ', ')
                alias = re.findall(r"alias=\[(.+?)\]\}", d)
                if not common.data_is_NULL(alias):
                    alias = common.list_str_2_list(alias[0], ', ')
                entities[name[0]] = alias
    finally:
        # Always release, otherwise one failing query would block every
        # later caller waiting on the lock.
        cache_lock.release()
def load_eneity_align_dict(labelList):
    """Add ``alias -> name`` entries for every label to ``g_primary_key_align_dict``.

    Args:
        labelList: Iterable of labels; each is looked up with ``cache.get``.

    Returns:
        The module-level ``g_primary_key_align_dict`` after the update.
        Entries whose alias is null according to ``common.data_is_NULL`` are
        skipped; a list alias contributes one entry per element.
    """
    for label in labelList:
        for name, alias in cache.get(label).items():
            if common.data_is_NULL(alias):
                continue
            alias_values = alias if isinstance(alias, list) else [alias]
            for alias_value in alias_values:
                g_primary_key_align_dict[alias_value] = name
    return g_primary_key_align_dict
def discovery_from_wordseg(line):
    """Segment and align every field of every record, then apply ``white_black``.

    Each field value is run through ``wordSegment.run`` and
    ``word_primary_key_align``. The result is stored under ``name`` for the
    ``name`` field and under ``<field>key`` for every other field.

    Args:
        line: Iterable of dict records.

    Returns:
        ``True`` when ``line`` is null according to ``common.data_is_NULL``,
        otherwise the value returned by ``white_black``.
    """
    if common.data_is_NULL(line):
        logging.info("line is null")
        return True

    # One accumulator is shared by all records and snapshotted per record, so
    # keys written for an earlier record stay present in later snapshots.
    accumulated = {}
    segmented = []
    for record in line:
        for field in record.keys():
            tokens = word_primary_key_align(wordSegment.run(record[field]))
            target_key = field if field == "name" else field + "key"
            accumulated[target_key] = tokens
        segmented.append(accumulated.copy())
    return white_black(segmented, line)
def discovery_from_delhtmllabel_new(contentList):
    """Strip HTML markup from one record's ``detail`` and print the cleaned record.

    The HTML is parsed with BeautifulSoup (``html5lib``); newlines and tabs
    are removed and remaining whitespace runs collapse to one space. A
    missing, empty or text-less ``detail`` becomes ``' '``.

    Args:
        contentList: A single dict-like record providing ``get`` (despite the
            parameter name).

    Returns:
        None. The one-element result list is only printed.
    """
    if common.data_is_NULL(contentList):
        logging.info("contentList is null!")
        # Nothing to process; the function has no return value either way.
        return

    # The previous ``eval(dict(contentList))`` always raised TypeError because
    # eval() only accepts a string, bytes or code object. Read the field
    # directly, as is already done for ``detail`` below.
    dict_add = {"name": contentList.get("name")}
    for pms in ["detail"]:
        pm = contentList.get(pms)
        cleaned = ' '
        if pm:  # also guards against a missing (None) field
            src_soup_text = BeautifulSoup(pm, 'html5lib').get_text()
            if src_soup_text:
                cleaned = src_soup_text.replace('\n', '').replace('\t', '')
                cleaned = re.sub('\\s+', ' ', cleaned)
        dict_add[pms] = cleaned

    list_pm = [dict_add]
    # NOTE(review): the result is only printed, never returned -- confirm
    # whether callers are expected to receive ``list_pm``.
    print(list_pm)