def st_build_dict(dict_file):
    global seg_dict
    seg_dict = {}
    # Index every dictionary entry under its first character.
    with open(dict_file, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            line_t = line.split()
            if not hanzi_util.is_zhs(line_t[0]):
                print("SKIP:%s" % (line_t[0]))
                continue
            if len(line_t[0]) == 1:
                # Single-character word: it is its own index key.
                if line_t[0] not in seg_dict:
                    seg_dict[line_t[0]] = [line_t[0]]
                else:
                    # Should not happen: duplicate single-character entry.
                    print("DUPLICATE SINGLE-CHAR ENTRY:%s" % (line_t[0]))
                    seg_dict[line_t[0]].append(line_t[0])
                    return
            else:
                first_char = line_t[0][0]
                if first_char not in seg_dict:
                    seg_dict[first_char] = [line_t[0]]
                else:
                    seg_dict[first_char].append(line_t[0])
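# A minimal usage sketch (not from the original source): since seg_dict is keyed
# by the first character of each entry, candidate dictionary words at a position
# in a sentence can be looked up through that character, e.g. for a simple
# longest-match segmenter. candidates_at(), "sentence" and "pos" are hypothetical.
def candidates_at(sentence, pos):
    first = sentence[pos]
    cands = seg_dict.get(first, [])
    # prefer longer matches first
    return sorted((w for w in cands if sentence.startswith(w, pos)),
                  key=len, reverse=True)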
def build_train_data():
    global train_data
    global train_tags
    global stop_words
    global train_word_id
    train_data = {}
    train_tags = []
    stop_words = []
    train_word_id = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))
    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            tag_name = filename[:-4]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            line_num = 0
            with open(DATA_DIR + '/' + filename, 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000:
                        print('LINE:%d' % (line_num))
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    objs = []
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                            if item_id in train_data:
                                train_data[item_id]['COUNT'] += 1
                            else:
                                #print("ADDING ITEM:%s" % (item))
                                train_data[item_id] = {}
                                train_data[item_id]['COUNT'] = 1
                            if tag_name not in train_data[item_id]:
                                train_data[item_id][tag_name] = {}
                    # Co-occurrence counts: only one direction of each pair is kept.
                    if len(objs) < 2:
                        continue
                    #print(objs)
                    for index_i in range(len(objs) - 1):
                        for index_j in range(index_i + 1, len(objs)):
                            #print('%d-%d-%d' % (len(objs), index_i, index_j))
                            item_i = objs[index_i]
                            item_j = objs[index_j]
                            item_t = item_i << 32 | item_j
                            if item_t in train_data[item_i][tag_name]:
                                train_data[item_i][tag_name][item_t] += 1
                            else:
                                # the reverse-direction pair is not tracked separately
                                train_data[item_i][tag_name][item_t] = 1
    return
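# term_to_id() is called throughout but not shown in this section. A minimal
# sketch of what it is assumed to do, based on how train_word_id is used
# elsewhere (index lookups like train_word_id[item_id] and membership tests):
# assign each new term the next free integer id, return the existing id otherwise.
# A dict-based reverse index would be faster; a list keeps the sketch closest
# to the usage seen above.
def term_to_id(term):
    global train_word_id
    if term not in train_word_id:
        train_word_id.append(term)
    return train_word_id.index(term)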
def load_corpus(filename):
    corpus = []
    labels = []
    labelmap = dict()
    with open(filename, 'r') as fin:
        for line in fin:
            line = line.strip()
            mt = re.match(r'\[(.+?)\](.+)', line)
            if mt:
                label = mt.group(1).split(',')
                for x in label:
                    labelmap[x] = 1
                # keep only the text after the label
                line = mt.group(2).strip()
            else:
                label = None
            # discard documents that are too long
            if len(line) > 512:
                continue
            line = jieba.cut(line, cut_all=False)
            doc = []
            for item in line:
                if item not in stop_words and hanzi_util.is_zhs(item):
                    doc.append(item)
            if len(doc) > 0:
                corpus.append(doc)
                labels.append(label)
    return labelmap.keys(), corpus, labels
def prep_word_dict():
    CURRENT_W = None
    LINE_NUM = 0
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            if line[:4] == '[DDv':
                CURRENT_W = line[5:line.index(']')]
                term_to_id(CURRENT_W)
                continue
            if CURRENT_W and line[0] == '【' and ('=】' in line):
                line_x = line[line.index('】') + 1:]
                line_x = line_x.split()
                if line_x:
                    for item in line_x:
                        term_to_id(item)
                continue

    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            LINE_NUM += 1
            if not (LINE_NUM % 5000):
                print('C:%d' % (LINE_NUM))
            if len(line) > 30:
                continue
            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')
    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    for item_tag in train_tags[1:]:
        tag_id = train_tags.index(item_tag)
        count_all[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            item_2_tag = (item_2 << tag_shift) | tag_id
            count_s = 0
            #count_s = sum(train_data_single[item_1].values()) + sum(train_data_single[item_2].values())
            if tag_id in train_data_single[item_1]:
                count_s += train_data_single[item_1][tag_id]
            if tag_id in train_data_single[item_2]:
                count_s += train_data_single[item_2][tag_id]
            count = 0
            if item_1 in train_data and \
               item_2_tag in train_data[item_1]:
                count += train_data[item_1][item_2_tag]
            # The log value is negated: the smaller the absolute value,
            # the higher the probability.
            if count_s == 0 or count == 0:
                count_all[item_tag][item_w] = -math.log(0.0000000001)
            else:
                count_all[item_tag][item_w] = -math.log(count / count_s + 0.0000000001)
    return count_all
def final_prob(classifier, str_test):
    if not classifier or not str_test:
        return None
    str_test = str_test.strip()
    line_t = jieba.cut(str_test, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item) and item in train_word_id:
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if not objs:
        return None
    feat = best_word_features(objs, best_words)
    if not feat:
        return None
    prob = classifier.prob_classify(feat)
    return prob
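# best_word_features() is used here and below but not defined in this section.
# A hedged sketch of the usual NLTK-style feature builder it is assumed to be:
# emit a feature dict marking each candidate id that is in the pre-selected
# best_words set, suitable for classifier.prob_classify().
def best_word_features(words, best_words):
    return dict((w, True) for w in words if w in best_words)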
def main():
    labelset, corpus, labels = load_corpus(LABEL_TRAIN_FILE)
    print("哈哈")
    print(labelset)
    if not os.path.exists("llda.dat"):
        llda = LLDA(K=len(labelset), alpha=0.001, beta=0.001)
        llda.set_corpus(labelset, corpus, labels)
        print("M=%d, V=%d, L=%d, K=%d" %
              (len(corpus), len(llda.vocas), len(labelset), len(labelset)))
        for i in range(100):
            print("-- %d " % (i + 1))
            llda.inference()
        with open("llda.dat", 'wb') as fp:
            pickle.dump(llda, fp, -1)
    else:
        print("loading llda...")
        with open("llda.dat", 'rb') as fp:
            llda = pickle.load(fp)
    # Perplexity: lower perplexity generally means the model generates the
    # documents better and generalizes better; watch it to tune K.
    print("perplexity : %.4f" % llda.perplexity())
    phi = llda.phi()
    theta = llda.theta()
    for k, label in enumerate(labelset):
        print("\n-- label %d : %s" % (k, label))
        for w in numpy.argsort(-phi[k])[:10]:
            #print("%d~%s" % (k, w))
            print("%s: %.4f" % (llda.vocas[w], phi[k, w]))
    test_str = "如何变更手机号码?"
    line = jieba.cut(test_str.strip(), cut_all=False)
    obj = []
    for item in line:
        if item not in stop_words and hanzi_util.is_zhs(item):
            obj.append(item)
    #print(llda.phi())
    #print(llda.theta())
    print(len(phi))
    print(len(theta[0]))
    for k, label in enumerate(labelset):
        print(theta[llda.term_to_id(obj[0]), k])
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            item_i = objs[index_i]
            item_j = objs[index_j]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    for item_tag in train_tags:
        count_all[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            item_a = item_1 << 32 | item_2
            item_b = item_2 << 32 | item_1
            count_s = train_data[item_1]['COUNT'] + train_data[item_2]['COUNT']
            count = 0
            if item_1 in train_data and \
               item_tag in train_data[item_1] and \
               item_a in train_data[item_1][item_tag]:
                count += train_data[item_1][item_tag][item_a]
            if item_2 in train_data and \
               item_tag in train_data[item_2] and \
               item_b in train_data[item_2][item_tag]:
                count += train_data[item_2][item_tag][item_b]
            # The log value is negated: the smaller the absolute value,
            # the higher the probability.
            count_all[item_tag][item_w] = -math.log(count / count_s + 0.0000000001)
    return count_all
def final_prob(classifier, data_str):
    count_all = {}
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    test_feature = best_word_features(objs, best_words)
    if not test_feature:
        print('特征为空...')
        return None
    prob = classifier.prob_classify(test_feature)
    return prob
def final_prob(classifier, data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    print('设计匹配对...')
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << word_shift | item_j
            sub_train.append(item_t)
    test_feature = best_word_features(sub_train, best_words)
    # debug: print the feature pairs
    for i in test_feature.keys():
        print("\t%s-%s" % (train_word_id[i >> word_shift],
                           train_word_id[i & word_mask]))
    if not test_feature:
        print('特征为空...')
        return None
    prob = classifier.prob_classify(test_feature)
    return prob
tag_name = filename[:-4]
line_num = 0
with open(DATA_DIR + '/' + filename, 'r') as fin, \
        open(DATA_DIR + '/' + tag_name + '_p.txt', 'w') as fout:
    print('正在处理:%s' % (filename))
    while True:
        try:
            line = fin.readline()
        except UnicodeDecodeError as e:
            print('Unicode Error! filename=%s, line_num=%d' % (filename, line_num))
            continue
        if not line:
            print('文件已处理完! filename=%s, line_num=%d' % (filename, line_num))
            break
        line_num += 1
        if not line_num % 1000:
            print('LINE:%d' % (line_num))
        line = line.strip()
        line_t = jieba.cut(line, cut_all=False)
        objs = []
        for item in line_t:
            if item not in stop_words and hanzi_util.is_zhs(item):
                if item not in objs:
                    objs.append(item)
        if not len(objs):
            continue
        line = ' '.join(objs) + '\n'
        #print(line, end='')
        fout.write(line)
def build_model():
    global STOP_WORDS
    global SCAN_WORDS
    global TYCC_ITEMS

    STOP_WORDS = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            item = line.strip()
            if len(item.split()) > 1:
                print(item)
            if is_zhs(item):
                STOP_WORDS.append(item)
    STOP_WORDS = set(STOP_WORDS)
    print("STOP_WORDS:%d" % (len(STOP_WORDS)))
    # NOTE: this reset leaves STOP_WORDS empty, so the stop-word filtering
    # below is effectively a no-op.
    STOP_WORDS = []

    SCAN_WORDS = []
    TYCC_ITEMS_PREP = {}
    with open(TYCC_FILE, 'r') as fin:
        for line in fin:
            items = line.strip().split()
            if len(items) < 3:
                continue
            if items[1][0] != '【' and items[1][-1] != '】':
                continue
            # only the synonym entries are of interest
            if items[1][-3:] != '.=】':
                continue
            # for now only verbs are considered
            if items[0] != 'v':
                continue
            words = []
            for item in items[2:]:
                if item not in STOP_WORDS:
                    words.append(item)
                #else:
                #    print('TRIM:%s' % (item))
            YX = items[1]
            if not len(words):
                continue
            for word in words:
                if word in TYCC_ITEMS_PREP:
                    item = TYCC_ITEMS_PREP[word]
                    item.append({YX: ' '.join(words)})
                else:
                    TYCC_ITEMS_PREP[word] = [{YX: ' '.join(words)}]
    print("TYCC_ITEMS ORIGINAL:%d" % (len(TYCC_ITEMS_PREP)))

    # Drop words that have only one sense (nothing to disambiguate).
    TYCC_ITEMS = deepcopy(TYCC_ITEMS_PREP)
    for word in TYCC_ITEMS_PREP:
        if len(TYCC_ITEMS_PREP[word]) == 1:
            TYCC_ITEMS.pop(word)
        else:
            for yxs in TYCC_ITEMS_PREP[word]:
                for (k, v) in yxs.items():
                    SCAN_WORDS.extend(v.split())
    del TYCC_ITEMS_PREP
    print("TYCC_ITEMS AFTER:%d" % (len(TYCC_ITEMS)))
    SCAN_WORDS = set(SCAN_WORDS)
    print("SCAN_WORDS:%d" % (len(SCAN_WORDS)))
def build_train_dat():
    global TYCC_DAT
    TYCC_DAT = {}
    line_num = 0
    with open(TRAIN_COPS) as fin:
        while True:
            try:
                each_line = fin.readline()
                if not each_line:
                    break_flag = True
                    print("处理完毕!")
                    break
                line_num += 1
                if not (line_num % 2000):
                    print("C:%d" % (line_num))
                each_line = each_line.strip()
                seg_list = jieba.cut(each_line, cut_all=False)
                seg_list = split_to_sentnces(list(seg_list))
                for seg_ls in seg_list:
                    if not seg_ls:
                        continue
                    words = []
                    for item in seg_ls:
                        if not is_zhs(item):
                            continue
                        if item in STOP_WORDS:
                            continue
                        words.append(item)
                    len_t = len(words)
                    if not len_t or len_t < 3:
                        continue
                    for i in range(len_t):
                        if words[i] in SCAN_WORDS:
                            if words[i] not in TYCC_DAT:
                                # First occurrence: record the context window
                                # (weight 2 for adjacent words, weight 1 for
                                # words two positions away).
                                TYCC_DAT[words[i]] = {}
                                if (i - 2) >= 0:
                                    TYCC_DAT[words[i]][words[i - 2]] = 1
                                    TYCC_DAT[words[i]][words[i - 1]] = 2
                                elif (i - 1) >= 0:
                                    TYCC_DAT[words[i]][words[i - 1]] = 2
                                if (i + 2) < len_t:
                                    TYCC_DAT[words[i]][words[i + 2]] = 1
                                    TYCC_DAT[words[i]][words[i + 1]] = 2
                                elif (i + 1) < len_t:
                                    TYCC_DAT[words[i]][words[i + 1]] = 2
                            else:
                                # Word already known: accumulate the weights.
                                if (i - 2) >= 0:
                                    if words[i - 2] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 2]] += 1
                                    else:
                                        TYCC_DAT[words[i]][words[i - 2]] = 1
                                    # for i - 1
                                    if words[i - 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i - 1]] = 2
                                elif (i - 1) >= 0:
                                    if words[i - 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i - 1]] = 2
                                if (i + 2) < len_t:
                                    if words[i + 2] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 2]] += 1
                                    else:
                                        TYCC_DAT[words[i]][words[i + 2]] = 1
                                    # for i + 1
                                    if words[i + 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i + 1]] = 2
                                elif (i + 1) < len_t:
                                    if words[i + 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i + 1]] = 2
            except UnicodeDecodeError as e:
                print('Unicode Error! filename=%s, line_num=%d' % (TRAIN_COPS, line_num))
                pass
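# split_to_sentnces() is called here and in the disambiguation snippet below but
# is not defined in this section. A plausible sketch only (an assumption, not
# the original implementation): split the jieba token list into sub-sentences
# at punctuation marks. SENT_SEPS is a hypothetical separator set.
SENT_SEPS = set('。!?!?;;,,、.')

def split_to_sentnces(seg_list):
    sentences, cur = [], []
    for tok in seg_list:
        if tok in SENT_SEPS:
            if cur:
                sentences.append(cur)
            cur = []
        else:
            cur.append(tok)
    if cur:
        sentences.append(cur)
    return sentences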
if not os.path.exists("./dump.dat"):
    with open(STOP_FILE, 'r') as fin:
        stop_words = []
        for line in fin:
            line = line.strip()
            stop_words.append(line)
    with open(TRAIN_FILE, 'r') as fin:
        train_set = []
        for line in fin:
            #line = line.strip().split()
            line = line.strip()
            line = jieba.cut(line, cut_all=False)
            obj = []
            for item in line:
                if item not in stop_words and hanzi_util.is_zhs(item):
                    obj.append(item)
            train_set.append(obj)
    #stop
    dump_data = []
    dump_data.append(stop_words)
    dump_data.append(train_set)
    with open("./dump.dat", 'wb', -1) as fp:
        pickle.dump(dump_data, fp, -1)
else:
    with open("./dump.dat", 'rb') as fp:
        dump_data = pickle.load(fp)
    stop_words = dump_data[0]
    train_set = dump_data[1]
print("ORIGINAL:" + test_str) seg_list = jieba.cut(test_str, cut_all=False) for item in seg_list: if item in TYCC_ITEMS: print(" [%s] " % (item), end='') else: print(item, end='') seg_list = split_to_sentnces(list(seg_list)) for seg_ls in seg_list: if not seg_ls: continue words = [] for item in seg_ls: if not is_zhs(item): continue if item in STOP_WORDS: continue words.append(item) len_t = len(words) if not len_t or len_t < 3: print("UN-SUPPORT!!!") for i in range(len_t): if words[i] in TYCC_ITEMS: #有歧义词汇 print(" [%s] " % (words[i])) p = '_P_' n = '_P_' if (i - 1) >= 0: p = words[i - 1] if (i + 1) < len_t: n = words[i + 1] for yxs in TYCC_ITEMS[words[i]]:
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    pair_debug = {}
    if not data_str or not len(data_str):
        return (None, None)
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return (None, None)
    # Build the word-pair combinations.
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    # Total occurrence count of each pair per tag.
    count_all = {}
    for item_tag in train_tags[1:]:
        tag_val = train_data[item_tag]
        count_all[item_tag] = {}
        for item_w in sub_train:
            count_all[item_tag][item_w] = 0
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            if item_1 in tag_val and item_2 in tag_val[item_1]:
                count_all[item_tag][item_w] += tag_val[item_1][item_2]
    count_s = {}
    for item_w in sub_train:
        count_s[item_w] = 0
        for item_tag in train_tags[1:]:
            count_s[item_w] += count_all[item_tag][item_w]
    print(count_s)
    print(count_all)
    # Compute the per-tag scores. The log value is negated: the smaller the
    # absolute value, the higher the probability.
    count_ret = {}
    for item_tag in train_tags[1:]:
        count_ret[item_tag] = {}
        pair_debug[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            count = count_all[item_tag][item_w]
            if count == 0 or count_s[item_w] == 0:
                count_ret[item_tag][item_w] = -math.log(0.0000000001)
            else:
                count_ret[item_tag][item_w] = -count * math.log(
                    count / count_s[item_w] + 0.0000000001)
            pair_debug[item_tag][train_word_id[item_1] + '~' + train_word_id[item_2]] = \
                '%d/%d' % (count, count_s[item_w])
    return (count_ret, pair_debug)
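# A minimal sketch (not in the original) of how the per-tag scores returned by
# this calc_vector() could be turned into a single prediction: each score is a
# negated log probability, so summing per tag and taking the smallest total
# picks the most likely tag. classify() and "text" are hypothetical names.
def classify(text):
    count_ret, pair_debug = calc_vector(text)
    if not count_ret:
        return None
    totals = {tag: sum(scores.values()) for tag, scores in count_ret.items()}
    return min(totals, key=totals.get)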
def run(self):
    while True:
        try:
            tag_name = q.get(timeout=5)
        except queue.Empty as e:
            print("Task Queue is empty, return!")
            return
        print("Thread-%d正在处理:%s,还剩:%d" % (self.threadID, tag_name, q.qsize()))
        #if os.path.exists(TMP_PATH + tag_name + '.dat'):
        #    print("DAT %s already exists, skip it!" % (tag_name))
        #    q.task_done()
        #    continue
        line_num = 0
        fast_prep = 1
        sub_train_data = {}
        if os.path.exists(DATA_DIR + '/' + tag_name + '_p.txt'):
            # A pre-segmented "_p" file exists; it can simply be split on whitespace.
            open_file = DATA_DIR + '/' + tag_name + '_p.txt'
            fast_prep = 1
        else:
            open_file = DATA_DIR + '/' + tag_name + '.txt'
            fast_prep = 0
        with open(open_file, 'r') as fin:
            while True:
                try:
                    line = fin.readline()
                except UnicodeDecodeError as e:
                    print('Unicode Error! thread=%d, tag=%s, line_num=%d' %
                          (self.threadID, tag_name, line_num))
                    continue
                if not line:
                    print('文件已处理完! thread=%d, tag=%s, line_num=%d' %
                          (self.threadID, tag_name, line_num))
                    break
                line_num += 1
                if not line_num % 1000:
                    print('Thread-%d,LINE:%d' % (self.threadID, line_num))
                objs = []
                if not fast_prep:
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            if len(item) == 1 and item not in white_words:
                                continue
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                else:
                    for item in line.split():
                        if len(item) == 1 and item not in white_words:
                            continue
                        item_id = term_to_id(item)
                        if item_id not in objs:
                            objs.append(item_id)
                    #objs = [term_to_id(t_id) for t_id in line.split()]
                if len(objs) < 2:
                    continue
                for index_i in range(len(objs) - 1):
                    for index_j in range(index_i + 1, len(objs)):
                        #print('%d-%d-%d' % (len(objs), index_i, index_j))
                        if objs[index_i] < objs[index_j]:
                            item_i = objs[index_i]
                            item_j = objs[index_j]
                        else:
                            item_i = objs[index_j]
                            item_j = objs[index_i]
                        if item_i in sub_train_data:
                            if item_j in sub_train_data[item_i]:
                                sub_train_data[item_i][item_j] += 1
                            else:
                                sub_train_data[item_i][item_j] = 1
                        else:
                            sub_train_data[item_i] = {}
                            sub_train_data[item_i][item_j] = 1
        # The data set is large: drop word pairs that occur no more than once.
        print("精简数据...")
        iter_obj = copy.deepcopy(sub_train_data)
        for item_1 in iter_obj.keys():
            if not iter_obj[item_1]:
                continue
            for item_2 in iter_obj[item_1].keys():
                if iter_obj[item_1][item_2] <= 1:
                    #print("DEBUG1:%d - %s/%s" % (iter_obj[item_1][item_2], train_word_id[item_1], train_word_id[item_2]))
                    del sub_train_data[item_1][item_2]
            if not sub_train_data[item_1]:
                print("DEBUG2:%s" % (train_word_id[item_1]))
                del sub_train_data[item_1]
        del iter_obj
        print("保存数据...")
        # Dump this tag's sub_train_data to its own .dat file.
        dump_file = TMP_PATH + tag_name + '.dat'
        with open(dump_file, 'wb', -1) as fp:
            pickle.dump(sub_train_data, fp, -1)
        del sub_train_data
        print("Thread-%d处理[%s]结束!" % (self.threadID, tag_name))
        q.task_done()
print("ORIGINAL:" + test_str) seg_list = jieba.cut(test_str, cut_all=False) for item in seg_list: if item in TYCC_ITEMS: print(" [%s] "%(item), end='') else: print(item, end='') seg_list = split_to_sentnces(list(seg_list)) for seg_ls in seg_list: if not seg_ls: continue words = [] for item in seg_ls: if not is_zhs(item): continue if item in STOP_WORDS: continue words.append(item) len_t = len(words) if not len_t or len_t < 3: print("UN-SUPPORT!!!") for i in range(len_t): if words[i] in TYCC_ITEMS: #有歧义词汇 print(" [%s] "%(words[i])) p = '_P_'; n = '_P_'; if (i-1) >= 0: p = words[i-1] if (i+1) < len_t: n = words[i+1] for yxs in TYCC_ITEMS[words[i]]: for(k, v) in yxs.items():
def build_train_data():
    global train_word_id
    global train_data_single
    global train_data
    global train_tags
    global stop_words
    train_word_id = []
    train_data_single = {}
    train_data = {}
    train_tags = ['NULL']
    stop_words = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))
    with open(WHITE_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            white_words.append(line)
    print("WHITE WORD SIZE:%d\n" % (len(white_words)))
    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            tag_name = filename[:-4]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            line_num = 0
            with open(DATA_DIR + '/' + filename, 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000:
                        print('LINE:%d' % (line_num))
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    objs = []
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            if len(item) == 1 and item not in white_words:
                                if item not in debug_s_words:
                                    debug_s_words.append(item)
                                continue
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                            # train_data_single: per-word, per-tag counts
                            if item_id not in train_data_single:
                                train_data_single[item_id] = {}
                            if tag_id not in train_data_single[item_id]:
                                train_data_single[item_id][tag_id] = 1
                            else:
                                train_data_single[item_id][tag_id] += 1
                    # train_data: co-occurrence counts, one direction only,
                    # with each pair ordered low id -> high id.
                    if len(objs) < 2:
                        continue
                    #print(objs)
                    for index_i in range(len(objs) - 1):
                        for index_j in range(index_i + 1, len(objs)):
                            #print('%d-%d-%d' % (len(objs), index_i, index_j))
                            if objs[index_i] < objs[index_j]:
                                item_i = objs[index_i]
                                item_j = objs[index_j]
                            else:
                                item_i = objs[index_j]
                                item_j = objs[index_i]
                            item_j_tag = (item_j << tag_shift) | tag_id
                            if item_i in train_data:
                                if item_j_tag in train_data[item_i]:
                                    train_data[item_i][item_j_tag] += 1
                                else:
                                    train_data[item_i][item_j_tag] = 1
                            else:
                                train_data[item_i] = {}
                                train_data[item_i][item_j_tag] = 1
    return
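# How the composite keys above are assumed to be packed and unpacked. tag_shift
# is never defined in this excerpt; the sketch assumes a value large enough to
# hold any tag id (e.g. 8 bits), matching the way build_train_data() stores
# counts under (item_j << tag_shift) | tag_id and calc_vector() rebuilds the
# same key for lookup.
tag_shift = 8                       # assumed; requires len(train_tags) < (1 << tag_shift)
tag_mask = (1 << tag_shift) - 1

def pack_word_tag(word_id, tag_id):
    return (word_id << tag_shift) | tag_id

def unpack_word_tag(key):
    return key >> tag_shift, key & tag_mask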
# (tail of the stop-word loading loop; the loop head is not shown in this excerpt)
        line = line.strip()
        stop_words.add(line)

line_num = 0
frequency = {}
with open(doc_name) as fin:
    for line in fin:
        line = line.strip()
        if len(line) > 50:
            continue
        if len(line) < 4:
            continue
        line_num += 1
        if not line_num % 5000:
            print("CURR:%d" % (line_num))
        seg_list = list(jieba.cut(line, cut_all=False))
        while '' in seg_list:
            seg_list.remove('')
        line_t = [x for x in seg_list
                  if x not in stop_words and hanzi_util.is_zhs(x)]
        for token in line_t:
            if token in frequency:
                frequency[token] += 1
            else:
                frequency[token] = 1
        if not line_t:
            continue
        texts.append(line_t)
        documents.append(line)

#texts = [[token for token in text if frequency[token] > 1]
#         for text in texts]
del frequency
dictionary = corpora.Dictionary(texts)
k_value = len(dictionary) * 0.25
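# A hedged continuation sketch (an assumption, not shown in the original): the
# dictionary and k_value above are presumably fed into a gensim topic model,
# e.g. by converting each token list to bag-of-words and training LDA with
# k_value topics.
from gensim import models  # assumed import; corpora is already used above
corpus_bow = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=int(k_value))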