def st_build_dict(dict_file):
    global seg_dict
    seg_dict = {}
    # Index every dictionary entry under its first character.
    with open(dict_file, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            line_t = line.split()
            if not hanzi_util.is_zhs(line_t[0]):
                print("SKIP:%s" % (line_t[0]))
                continue
            if len(line_t[0]) == 1:
                # Single-character word: it is its own index key.
                if line_t[0] not in seg_dict:
                    seg_dict[line_t[0]] = [line_t[0]]
                else:
                    # Should not happen: duplicate single-character entry.
                    print("DUPLICATE SINGLE-CHAR ENTRY:%s" % (line_t[0]))
                    seg_dict[line_t[0]].append(line_t[0])
                    return
            else:
                first_char = line_t[0][0]
                if first_char not in seg_dict:
                    seg_dict[first_char] = [line_t[0]]
                else:
                    seg_dict[first_char].append(line_t[0])
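# A minimal usage sketch (not from the original source): since seg_dict is keyed
# by the first character of each entry, candidate dictionary words at a position
# in a sentence can be looked up through that character, e.g. for a simple
# longest-match segmenter. candidates_at(), "sentence" and "pos" are hypothetical.
def candidates_at(sentence, pos):
    first = sentence[pos]
    cands = seg_dict.get(first, [])
    # prefer longer matches first
    return sorted((w for w in cands if sentence.startswith(w, pos)),
                  key=len, reverse=True)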
def build_train_data():
    global train_data
    global train_tags
    global stop_words
    global train_word_id
    train_data = {}
    train_tags = []
    stop_words = []
    train_word_id = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))
    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            tag_name = filename[:-4]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            line_num = 0
            with open(DATA_DIR + '/' + filename, 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000:
                        print('LINE:%d' % (line_num))
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    objs = []
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                            if item_id in train_data:
                                train_data[item_id]['COUNT'] += 1
                            else:
                                #print("ADDING ITEM:%s" % (item))
                                train_data[item_id] = {}
                                train_data[item_id]['COUNT'] = 1
                            if tag_name not in train_data[item_id]:
                                train_data[item_id][tag_name] = {}
                    # Co-occurrence counts: only one direction of each pair is kept.
                    if len(objs) < 2:
                        continue
                    #print(objs)
                    for index_i in range(len(objs) - 1):
                        for index_j in range(index_i + 1, len(objs)):
                            #print('%d-%d-%d' % (len(objs), index_i, index_j))
                            item_i = objs[index_i]
                            item_j = objs[index_j]
                            item_t = item_i << 32 | item_j
                            if item_t in train_data[item_i][tag_name]:
                                train_data[item_i][tag_name][item_t] += 1
                            else:
                                # the reverse-direction pair is not tracked separately
                                train_data[item_i][tag_name][item_t] = 1
    return
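# term_to_id() is called throughout but not shown in this section. A minimal
# sketch of what it is assumed to do, based on how train_word_id is used
# elsewhere (index lookups like train_word_id[item_id] and membership tests):
# assign each new term the next free integer id, return the existing id otherwise.
# A dict-based reverse index would be faster; a list keeps the sketch closest
# to the usage seen above.
def term_to_id(term):
    global train_word_id
    if term not in train_word_id:
        train_word_id.append(term)
    return train_word_id.index(term)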
def load_corpus(filename):
    corpus = []
    labels = []
    labelmap = dict()
    with open(filename, 'r') as fin:
        for line in fin:
            line = line.strip()
            mt = re.match(r'\[(.+?)\](.+)', line)
            if mt:
                label = mt.group(1).split(',')
                for x in label:
                    labelmap[x] = 1
                # keep only the text after the label
                line = mt.group(2).strip()
            else:
                label = None
            # discard documents that are too long
            if len(line) > 512:
                continue
            line = jieba.cut(line, cut_all=False)
            doc = []
            for item in line:
                if item not in stop_words and hanzi_util.is_zhs(item):
                    doc.append(item)
            if len(doc) > 0:
                corpus.append(doc)
                labels.append(label)
    return labelmap.keys(), corpus, labels
def prep_word_dict():
    CURRENT_W = None
    LINE_NUM = 0
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            if line[:4] == '[DDv':
                CURRENT_W = line[5:line.index(']')]
                term_to_id(CURRENT_W)
                continue
            if CURRENT_W and line[0] == '【' and ('=】' in line):
                line_x = line[line.index('】') + 1:]
                line_x = line_x.split()
                if line_x:
                    for item in line_x:
                        term_to_id(item)
                continue

    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            LINE_NUM += 1
            if not (LINE_NUM % 5000):
                print('C:%d' % (LINE_NUM))
            if len(line) > 30:
                continue
            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')
    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    for item_tag in train_tags[1:]:
        tag_id = train_tags.index(item_tag)
        count_all[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            item_2_tag = (item_2 << tag_shift) | tag_id
            count_s = 0
            #count_s = sum(train_data_single[item_1].values()) + sum(train_data_single[item_2].values())
            if tag_id in train_data_single[item_1]:
                count_s += train_data_single[item_1][tag_id]
            if tag_id in train_data_single[item_2]:
                count_s += train_data_single[item_2][tag_id]
            count = 0
            if item_1 in train_data and \
               item_2_tag in train_data[item_1]:
                count += train_data[item_1][item_2_tag]
            # The log value is negated: the smaller the absolute value,
            # the higher the probability.
            if count_s == 0 or count == 0:
                count_all[item_tag][item_w] = -math.log(0.0000000001)
            else:
                count_all[item_tag][item_w] = -math.log(count / count_s + 0.0000000001)
    return count_all
def final_prob(classifier, str_test):
    if not classifier or not str_test:
        return None
    str_test = str_test.strip()
    line_t = jieba.cut(str_test, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item) and item in train_word_id:
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if not objs:
        return None
    feat = best_word_features(objs, best_words)
    if not feat:
        return None
    prob = classifier.prob_classify(feat)
    return prob
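# best_word_features() is used here and below but not defined in this section.
# A hedged sketch of the usual NLTK-style feature builder it is assumed to be:
# emit a feature dict marking each candidate id that is in the pre-selected
# best_words set, suitable for classifier.prob_classify().
def best_word_features(words, best_words):
    return dict((w, True) for w in words if w in best_words)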
def main():
    labelset, corpus, labels = load_corpus(LABEL_TRAIN_FILE)
    print("哈哈")
    print(labelset)
    if not os.path.exists("llda.dat"):
        llda = LLDA(K=len(labelset), alpha=0.001, beta=0.001)
        llda.set_corpus(labelset, corpus, labels)
        print("M=%d, V=%d, L=%d, K=%d" %
              (len(corpus), len(llda.vocas), len(labelset), len(labelset)))
        for i in range(100):
            print("-- %d " % (i + 1))
            llda.inference()
        with open("llda.dat", 'wb') as fp:
            pickle.dump(llda, fp, -1)
    else:
        print("loading llda...")
        with open("llda.dat", 'rb') as fp:
            llda = pickle.load(fp)
    # Perplexity: lower perplexity generally means the model generates the
    # documents better and generalizes better; watch it to tune K.
    print("perplexity : %.4f" % llda.perplexity())
    phi = llda.phi()
    theta = llda.theta()
    for k, label in enumerate(labelset):
        print("\n-- label %d : %s" % (k, label))
        for w in numpy.argsort(-phi[k])[:10]:
            #print("%d~%s" % (k, w))
            print("%s: %.4f" % (llda.vocas[w], phi[k, w]))
    test_str = "如何变更手机号码?"
    line = jieba.cut(test_str.strip(), cut_all=False)
    obj = []
    for item in line:
        if item not in stop_words and hanzi_util.is_zhs(item):
            obj.append(item)
    #print(llda.phi())
    #print(llda.theta())
    print(len(phi))
    print(len(theta[0]))
    for k, label in enumerate(labelset):
        print(theta[llda.term_to_id(obj[0]), k])
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            item_i = objs[index_i]
            item_j = objs[index_j]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    for item_tag in train_tags:
        count_all[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            item_a = item_1 << 32 | item_2
            item_b = item_2 << 32 | item_1
            count_s = train_data[item_1]['COUNT'] + train_data[item_2]['COUNT']
            count = 0
            if item_1 in train_data and \
               item_tag in train_data[item_1] and \
               item_a in train_data[item_1][item_tag]:
                count += train_data[item_1][item_tag][item_a]
            if item_2 in train_data and \
               item_tag in train_data[item_2] and \
               item_b in train_data[item_2][item_tag]:
                count += train_data[item_2][item_tag][item_b]
            # The log value is negated: the smaller the absolute value,
            # the higher the probability.
            count_all[item_tag][item_w] = -math.log(count / count_s + 0.0000000001)
    return count_all
def final_prob(classifier, data_str):
    count_all = {}
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    test_feature = best_word_features(objs, best_words)
    if not test_feature:
        print('特征为空...')
        return None
    prob = classifier.prob_classify(test_feature)
    return prob
def final_prob(classifier, data_str):
    count_all = {}
    sub_train = []
    if not data_str or not len(data_str):
        return None
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return None
    print('设计匹配对...')
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << word_shift | item_j
            sub_train.append(item_t)
    test_feature = best_word_features(sub_train, best_words)
    # debug: print the feature pairs
    for i in test_feature.keys():
        print("\t%s-%s" % (train_word_id[i >> word_shift],
                           train_word_id[i & word_mask]))
    if not test_feature:
        print('特征为空...')
        return None
    prob = classifier.prob_classify(test_feature)
    return prob
tag_name = filename[:-4]
line_num = 0
with open(DATA_DIR + '/' + filename, 'r') as fin, \
        open(DATA_DIR + '/' + tag_name + '_p.txt', 'w') as fout:
    print('正在处理:%s' % (filename))
    while True:
        try:
            line = fin.readline()
        except UnicodeDecodeError as e:
            print('Unicode Error! filename=%s, line_num=%d' % (filename, line_num))
            continue
        if not line:
            print('文件已处理完! filename=%s, line_num=%d' % (filename, line_num))
            break
        line_num += 1
        if not line_num % 1000:
            print('LINE:%d' % (line_num))
        line = line.strip()
        line_t = jieba.cut(line, cut_all=False)
        objs = []
        for item in line_t:
            if item not in stop_words and hanzi_util.is_zhs(item):
                if item not in objs:
                    objs.append(item)
        if not len(objs):
            continue
        line = ' '.join(objs) + '\n'
        #print(line, end='')
        fout.write(line)
def build_model():
    global STOP_WORDS
    global SCAN_WORDS
    global TYCC_ITEMS

    STOP_WORDS = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            item = line.strip()
            if len(item.split()) > 1:
                print(item)
            if is_zhs(item):
                STOP_WORDS.append(item)
    STOP_WORDS = set(STOP_WORDS)
    print("STOP_WORDS:%d" % (len(STOP_WORDS)))
    # NOTE: this reset leaves STOP_WORDS empty, so the stop-word filtering
    # below is effectively a no-op.
    STOP_WORDS = []

    SCAN_WORDS = []
    TYCC_ITEMS_PREP = {}
    with open(TYCC_FILE, 'r') as fin:
        for line in fin:
            items = line.strip().split()
            if len(items) < 3:
                continue
            if items[1][0] != '【' and items[1][-1] != '】':
                continue
            # only the synonym entries are of interest
            if items[1][-3:] != '.=】':
                continue
            # for now only verbs are considered
            if items[0] != 'v':
                continue
            words = []
            for item in items[2:]:
                if item not in STOP_WORDS:
                    words.append(item)
                #else:
                #    print('TRIM:%s' % (item))
            YX = items[1]
            if not len(words):
                continue
            for word in words:
                if word in TYCC_ITEMS_PREP:
                    item = TYCC_ITEMS_PREP[word]
                    item.append({YX: ' '.join(words)})
                else:
                    TYCC_ITEMS_PREP[word] = [{YX: ' '.join(words)}]
    print("TYCC_ITEMS ORIGINAL:%d" % (len(TYCC_ITEMS_PREP)))

    # Drop words that have only one sense (nothing to disambiguate).
    TYCC_ITEMS = deepcopy(TYCC_ITEMS_PREP)
    for word in TYCC_ITEMS_PREP:
        if len(TYCC_ITEMS_PREP[word]) == 1:
            TYCC_ITEMS.pop(word)
        else:
            for yxs in TYCC_ITEMS_PREP[word]:
                for (k, v) in yxs.items():
                    SCAN_WORDS.extend(v.split())
    del TYCC_ITEMS_PREP
    print("TYCC_ITEMS AFTER:%d" % (len(TYCC_ITEMS)))
    SCAN_WORDS = set(SCAN_WORDS)
    print("SCAN_WORDS:%d" % (len(SCAN_WORDS)))
def build_train_dat():
    global TYCC_DAT
    TYCC_DAT = {}
    line_num = 0
    with open(TRAIN_COPS) as fin:
        while True:
            try:
                each_line = fin.readline()
                if not each_line:
                    break_flag = True
                    print("处理完毕!")
                    break
                line_num += 1
                if not (line_num % 2000):
                    print("C:%d" % (line_num))
                each_line = each_line.strip()
                seg_list = jieba.cut(each_line, cut_all=False)
                seg_list = split_to_sentnces(list(seg_list))
                for seg_ls in seg_list:
                    if not seg_ls:
                        continue
                    words = []
                    for item in seg_ls:
                        if not is_zhs(item):
                            continue
                        if item in STOP_WORDS:
                            continue
                        words.append(item)
                    len_t = len(words)
                    if not len_t or len_t < 3:
                        continue
                    for i in range(len_t):
                        if words[i] in SCAN_WORDS:
                            if words[i] not in TYCC_DAT:
                                # First occurrence: record the context window
                                # (weight 2 for adjacent words, weight 1 for
                                # words two positions away).
                                TYCC_DAT[words[i]] = {}
                                if (i - 2) >= 0:
                                    TYCC_DAT[words[i]][words[i - 2]] = 1
                                    TYCC_DAT[words[i]][words[i - 1]] = 2
                                elif (i - 1) >= 0:
                                    TYCC_DAT[words[i]][words[i - 1]] = 2
                                if (i + 2) < len_t:
                                    TYCC_DAT[words[i]][words[i + 2]] = 1
                                    TYCC_DAT[words[i]][words[i + 1]] = 2
                                elif (i + 1) < len_t:
                                    TYCC_DAT[words[i]][words[i + 1]] = 2
                            else:
                                # Word already known: accumulate the weights.
                                if (i - 2) >= 0:
                                    if words[i - 2] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 2]] += 1
                                    else:
                                        TYCC_DAT[words[i]][words[i - 2]] = 1
                                    # for i - 1
                                    if words[i - 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i - 1]] = 2
                                elif (i - 1) >= 0:
                                    if words[i - 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i - 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i - 1]] = 2
                                if (i + 2) < len_t:
                                    if words[i + 2] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 2]] += 1
                                    else:
                                        TYCC_DAT[words[i]][words[i + 2]] = 1
                                    # for i + 1
                                    if words[i + 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i + 1]] = 2
                                elif (i + 1) < len_t:
                                    if words[i + 1] in TYCC_DAT[words[i]]:
                                        TYCC_DAT[words[i]][words[i + 1]] += 2
                                    else:
                                        TYCC_DAT[words[i]][words[i + 1]] = 2
            except UnicodeDecodeError as e:
                print('Unicode Error! filename=%s, line_num=%d' % (TRAIN_COPS, line_num))
                pass
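# split_to_sentnces() is called here and in the disambiguation snippet below but
# is not defined in this section. A plausible sketch only (an assumption, not
# the original implementation): split the jieba token list into sub-sentences
# at punctuation marks. SENT_SEPS is a hypothetical separator set.
SENT_SEPS = set('。!?!?;;,,、.')

def split_to_sentnces(seg_list):
    sentences, cur = [], []
    for tok in seg_list:
        if tok in SENT_SEPS:
            if cur:
                sentences.append(cur)
            cur = []
        else:
            cur.append(tok)
    if cur:
        sentences.append(cur)
    return sentences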
if not os.path.exists("./dump.dat"):
    with open(STOP_FILE, 'r') as fin:
        stop_words = []
        for line in fin:
            line = line.strip()
            stop_words.append(line)
    with open(TRAIN_FILE, 'r') as fin:
        train_set = []
        for line in fin:
            #line = line.strip().split()
            line = line.strip()
            line = jieba.cut(line, cut_all=False)
            obj = []
            for item in line:
                if item not in stop_words and hanzi_util.is_zhs(item):
                    obj.append(item)
            train_set.append(obj)
    #stop
    dump_data = []
    dump_data.append(stop_words)
    dump_data.append(train_set)
    with open("./dump.dat", 'wb', -1) as fp:
        pickle.dump(dump_data, fp, -1)
else:
    with open("./dump.dat", 'rb') as fp:
        dump_data = pickle.load(fp)
    stop_words = dump_data[0]
    train_set = dump_data[1]
print("ORIGINAL:" + test_str) seg_list = jieba.cut(test_str, cut_all=False) for item in seg_list: if item in TYCC_ITEMS: print(" [%s] " % (item), end='') else: print(item, end='') seg_list = split_to_sentnces(list(seg_list)) for seg_ls in seg_list: if not seg_ls: continue words = [] for item in seg_ls: if not is_zhs(item): continue if item in STOP_WORDS: continue words.append(item) len_t = len(words) if not len_t or len_t < 3: print("UN-SUPPORT!!!") for i in range(len_t): if words[i] in TYCC_ITEMS: #有歧义词汇 print(" [%s] " % (words[i])) p = '_P_' n = '_P_' if (i - 1) >= 0: p = words[i - 1] if (i + 1) < len_t: n = words[i + 1] for yxs in TYCC_ITEMS[words[i]]:
def calc_vector(data_str):
    count_all = {}
    sub_train = []
    pair_debug = {}
    if not data_str or not len(data_str):
        return (None, None)
    line = data_str.strip()
    line_t = jieba.cut(line, cut_all=False)
    objs = []
    for item in line_t:
        if item not in stop_words and hanzi_util.is_zhs(item):
            if item not in train_word_id:
                # single-character words were already dropped during training
                continue
            item_id = term_to_id(item)
            if item_id not in objs:
                objs.append(item_id)
    if len(objs) < 2:
        return (None, None)
    # Build the word-pair combinations.
    for index_i in range(len(objs) - 1):
        for index_j in range(index_i + 1, len(objs)):
            if objs[index_i] < objs[index_j]:
                item_i = objs[index_i]
                item_j = objs[index_j]
            else:
                item_i = objs[index_j]
                item_j = objs[index_i]
            item_t = item_i << 32 | item_j
            sub_train.append(item_t)
    # Total occurrence count of each pair per tag.
    count_all = {}
    for item_tag in train_tags[1:]:
        tag_val = train_data[item_tag]
        count_all[item_tag] = {}
        for item_w in sub_train:
            count_all[item_tag][item_w] = 0
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            if item_1 in tag_val and item_2 in tag_val[item_1]:
                count_all[item_tag][item_w] += tag_val[item_1][item_2]
    count_s = {}
    for item_w in sub_train:
        count_s[item_w] = 0
        for item_tag in train_tags[1:]:
            count_s[item_w] += count_all[item_tag][item_w]
    print(count_s)
    print(count_all)
    # Compute the per-tag scores. The log value is negated: the smaller the
    # absolute value, the higher the probability.
    count_ret = {}
    for item_tag in train_tags[1:]:
        count_ret[item_tag] = {}
        pair_debug[item_tag] = {}
        for item_w in sub_train:
            item_1 = item_w >> 32
            item_2 = item_w & 0xFFFFFFFF
            count = count_all[item_tag][item_w]
            if count == 0 or count_s[item_w] == 0:
                count_ret[item_tag][item_w] = -math.log(0.0000000001)
            else:
                count_ret[item_tag][item_w] = -count * math.log(
                    count / count_s[item_w] + 0.0000000001)
            pair_debug[item_tag][train_word_id[item_1] + '~' + train_word_id[item_2]] = \
                '%d/%d' % (count, count_s[item_w])
    return (count_ret, pair_debug)
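# A minimal sketch (not in the original) of how the per-tag scores returned by
# this calc_vector() could be turned into a single prediction: each score is a
# negated log probability, so summing per tag and taking the smallest total
# picks the most likely tag. classify() and "text" are hypothetical names.
def classify(text):
    count_ret, pair_debug = calc_vector(text)
    if not count_ret:
        return None
    totals = {tag: sum(scores.values()) for tag, scores in count_ret.items()}
    return min(totals, key=totals.get)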
def run(self):
    while True:
        try:
            tag_name = q.get(timeout=5)
        except queue.Empty as e:
            print("Task Queue is empty, return!")
            return
        print("Thread-%d正在处理:%s,还剩:%d" % (self.threadID, tag_name, q.qsize()))
        #if os.path.exists(TMP_PATH + tag_name + '.dat'):
        #    print("DAT %s already exists, skip it!" % (tag_name))
        #    q.task_done()
        #    continue
        line_num = 0
        fast_prep = 1
        sub_train_data = {}
        if os.path.exists(DATA_DIR + '/' + tag_name + '_p.txt'):
            # A pre-segmented "_p" file exists; it can simply be split on whitespace.
            open_file = DATA_DIR + '/' + tag_name + '_p.txt'
            fast_prep = 1
        else:
            open_file = DATA_DIR + '/' + tag_name + '.txt'
            fast_prep = 0
        with open(open_file, 'r') as fin:
            while True:
                try:
                    line = fin.readline()
                except UnicodeDecodeError as e:
                    print('Unicode Error! thread=%d, tag=%s, line_num=%d' %
                          (self.threadID, tag_name, line_num))
                    continue
                if not line:
                    print('文件已处理完! thread=%d, tag=%s, line_num=%d' %
                          (self.threadID, tag_name, line_num))
                    break
                line_num += 1
                if not line_num % 1000:
                    print('Thread-%d,LINE:%d' % (self.threadID, line_num))
                objs = []
                if not fast_prep:
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            if len(item) == 1 and item not in white_words:
                                continue
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                else:
                    for item in line.split():
                        if len(item) == 1 and item not in white_words:
                            continue
                        item_id = term_to_id(item)
                        if item_id not in objs:
                            objs.append(item_id)
                    #objs = [term_to_id(t_id) for t_id in line.split()]
                if len(objs) < 2:
                    continue
                for index_i in range(len(objs) - 1):
                    for index_j in range(index_i + 1, len(objs)):
                        #print('%d-%d-%d' % (len(objs), index_i, index_j))
                        if objs[index_i] < objs[index_j]:
                            item_i = objs[index_i]
                            item_j = objs[index_j]
                        else:
                            item_i = objs[index_j]
                            item_j = objs[index_i]
                        if item_i in sub_train_data:
                            if item_j in sub_train_data[item_i]:
                                sub_train_data[item_i][item_j] += 1
                            else:
                                sub_train_data[item_i][item_j] = 1
                        else:
                            sub_train_data[item_i] = {}
                            sub_train_data[item_i][item_j] = 1
        # The data set is large: drop word pairs that occur no more than once.
        print("精简数据...")
        iter_obj = copy.deepcopy(sub_train_data)
        for item_1 in iter_obj.keys():
            if not iter_obj[item_1]:
                continue
            for item_2 in iter_obj[item_1].keys():
                if iter_obj[item_1][item_2] <= 1:
                    #print("DEBUG1:%d - %s/%s" % (iter_obj[item_1][item_2], train_word_id[item_1], train_word_id[item_2]))
                    del sub_train_data[item_1][item_2]
            if not sub_train_data[item_1]:
                print("DEBUG2:%s" % (train_word_id[item_1]))
                del sub_train_data[item_1]
        del iter_obj
        print("保存数据...")
        # Dump this tag's sub_train_data to its own .dat file.
        dump_file = TMP_PATH + tag_name + '.dat'
        with open(dump_file, 'wb', -1) as fp:
            pickle.dump(sub_train_data, fp, -1)
        del sub_train_data
        print("Thread-%d处理[%s]结束!" % (self.threadID, tag_name))
        q.task_done()
print("ORIGINAL:" + test_str) seg_list = jieba.cut(test_str, cut_all=False) for item in seg_list: if item in TYCC_ITEMS: print(" [%s] "%(item), end='') else: print(item, end='') seg_list = split_to_sentnces(list(seg_list)) for seg_ls in seg_list: if not seg_ls: continue words = [] for item in seg_ls: if not is_zhs(item): continue if item in STOP_WORDS: continue words.append(item) len_t = len(words) if not len_t or len_t < 3: print("UN-SUPPORT!!!") for i in range(len_t): if words[i] in TYCC_ITEMS: #有歧义词汇 print(" [%s] "%(words[i])) p = '_P_'; n = '_P_'; if (i-1) >= 0: p = words[i-1] if (i+1) < len_t: n = words[i+1] for yxs in TYCC_ITEMS[words[i]]: for(k, v) in yxs.items():
def build_train_data():
    global train_word_id
    global train_data_single
    global train_data
    global train_tags
    global stop_words
    train_word_id = []
    train_data_single = {}
    train_data = {}
    train_tags = ['NULL']
    stop_words = []
    with open(STOP_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            stop_words.append(line)
    print("STOP WORD SIZE:%d\n" % (len(stop_words)))
    with open(WHITE_FILE, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            white_words.append(line)
    print("WHITE WORD SIZE:%d\n" % (len(white_words)))
    for parent, dirname, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            tag_name = filename[:-4]
            print("正在处理:%s" % (tag_name))
            train_tags.append(tag_name)
            tag_id = train_tags.index(tag_name)
            line_num = 0
            with open(DATA_DIR + '/' + filename, 'r') as fin:
                for line in fin:
                    line_num += 1
                    if not line_num % 1000:
                        print('LINE:%d' % (line_num))
                    line = line.strip()
                    line_t = jieba.cut(line, cut_all=False)
                    objs = []
                    for item in line_t:
                        if item not in stop_words and hanzi_util.is_zhs(item):
                            if len(item) == 1 and item not in white_words:
                                if item not in debug_s_words:
                                    debug_s_words.append(item)
                                continue
                            item_id = term_to_id(item)
                            if item_id not in objs:
                                objs.append(item_id)
                            # train_data_single: per-word, per-tag counts
                            if item_id not in train_data_single:
                                train_data_single[item_id] = {}
                            if tag_id not in train_data_single[item_id]:
                                train_data_single[item_id][tag_id] = 1
                            else:
                                train_data_single[item_id][tag_id] += 1
                    # train_data: co-occurrence counts, one direction only,
                    # with each pair ordered low id -> high id.
                    if len(objs) < 2:
                        continue
                    #print(objs)
                    for index_i in range(len(objs) - 1):
                        for index_j in range(index_i + 1, len(objs)):
                            #print('%d-%d-%d' % (len(objs), index_i, index_j))
                            if objs[index_i] < objs[index_j]:
                                item_i = objs[index_i]
                                item_j = objs[index_j]
                            else:
                                item_i = objs[index_j]
                                item_j = objs[index_i]
                            item_j_tag = (item_j << tag_shift) | tag_id
                            if item_i in train_data:
                                if item_j_tag in train_data[item_i]:
                                    train_data[item_i][item_j_tag] += 1
                                else:
                                    train_data[item_i][item_j_tag] = 1
                            else:
                                train_data[item_i] = {}
                                train_data[item_i][item_j_tag] = 1
    return
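# How the composite keys above are assumed to be packed and unpacked. tag_shift
# is never defined in this excerpt; the sketch assumes a value large enough to
# hold any tag id (e.g. 8 bits), matching the way build_train_data() stores
# counts under (item_j << tag_shift) | tag_id and calc_vector() rebuilds the
# same key for lookup.
tag_shift = 8                       # assumed; requires len(train_tags) < (1 << tag_shift)
tag_mask = (1 << tag_shift) - 1

def pack_word_tag(word_id, tag_id):
    return (word_id << tag_shift) | tag_id

def unpack_word_tag(key):
    return key >> tag_shift, key & tag_mask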
# (tail of the stop-word loading loop; the loop head is not shown in this excerpt)
        line = line.strip()
        stop_words.add(line)

line_num = 0
frequency = {}
with open(doc_name) as fin:
    for line in fin:
        line = line.strip()
        if len(line) > 50:
            continue
        if len(line) < 4:
            continue
        line_num += 1
        if not line_num % 5000:
            print("CURR:%d" % (line_num))
        seg_list = list(jieba.cut(line, cut_all=False))
        while '' in seg_list:
            seg_list.remove('')
        line_t = [x for x in seg_list
                  if x not in stop_words and hanzi_util.is_zhs(x)]
        for token in line_t:
            if token in frequency:
                frequency[token] += 1
            else:
                frequency[token] = 1
        if not line_t:
            continue
        texts.append(line_t)
        documents.append(line)

#texts = [[token for token in text if frequency[token] > 1]
#         for text in texts]
del frequency
dictionary = corpora.Dictionary(texts)
k_value = len(dictionary) * 0.25
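# A hedged continuation sketch (an assumption, not shown in the original): the
# dictionary and k_value above are presumably fed into a gensim topic model,
# e.g. by converting each token list to bag-of-words and training LDA with
# k_value topics.
from gensim import models  # assumed import; corpora is already used above
corpus_bow = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=int(k_value))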