示例#1
0
def split_into_sentences(line):
    """Yield the sentences found in *line*, each as a list of tokens.

    Characters are classified one at a time:

    * a terminator (``is_terminator``) ends the current sentence and is kept
      as its last token;
    * a punctuation character (``is_punct``) or Chinese character
      (``is_zh``) becomes a token of its own;
    * a space or tab only separates tokens;
    * any other character is accumulated and emitted as one joined token.

    A terminator that arrives while the sentence is still empty is skipped,
    so no sentence consisting solely of a terminator is produced.

    Args:
        line: The text to split (iterated character by character).

    Yields:
        list: The tokens of one sentence, in order of appearance.
    """
    tokens = []
    en_token = []  # characters of the pending non-Chinese run

    def close_token(token):
        # Flush the pending run into ``tokens`` and empty the buffer in place.
        if token:
            tokens.append(''.join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # Flush *before* the emptiness check: otherwise a sentence made
            # only of a pending run (e.g. "hello.") looks empty, the
            # terminator is skipped, and the run leaks into the next sentence.
            close_token(en_token)
            if not tokens:
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c) or is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            close_token(en_token)
        else:
            en_token.append(c)

    # A run still pending at end of input belongs to the last sentence.
    close_token(en_token)
    if tokens:
        yield tokens
def split_into_sentences(line):
    """Split *line* into sentences and yield each one as a token list.

    Every character is routed by the first predicate it satisfies:
    ``is_terminator`` closes the sentence (the terminator is its final
    token), ``is_punct`` and ``is_zh`` characters are single-character
    tokens, a space or tab is a separator, and all remaining characters are
    collected into runs that are joined into one token each.

    Terminators met while the current sentence has no tokens are ignored.

    Args:
        line: The text to split (iterated character by character).

    Yields:
        list: The tokens of one sentence, in order of appearance.
    """
    tokens = []
    en_token = []  # buffer for the run of "other" characters in progress

    def close_token(token):
        # Move the buffered run into ``tokens`` as one string, then clear it.
        if token:
            tokens.append("".join(token))
            del token[:]

    for c in line:
        if is_terminator(c):
            # The buffered run must be flushed first; checking ``tokens``
            # before flushing would treat "word." as an empty sentence and
            # carry "word" over into whatever follows.
            close_token(en_token)
            if not tokens:
                continue
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c) or is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u" " or c == u"\t":
            close_token(en_token)
        else:
            en_token.append(c)

    # Do not lose a run that is still buffered when the input ends.
    close_token(en_token)
    if tokens:
        yield tokens
示例#3
0
def st_trainMatrix(trainfile):
    """Collect BEMS tag statistics from *trainfile* and derive probability tables.

    Each line is split on whitespace into items. Items that are terminators,
    or single-character punctuation, are discarded. Every character of the
    remaining items is tagged ``S`` (one-character item), or ``B`` / ``M`` /
    ``E`` (first / middle / last character of a longer item).

    Nothing is returned; the results are written into module-level tables:

    * ``count_trans[a][b]``   - number of times tag ``a`` is followed by ``b``
    * ``count_mixed[a][ch]``  - number of times character ``ch`` carries tag ``a``
    * ``P_transMatrix``       - ``count_trans`` rows divided by their row sum
    * ``P_mixedMatrix``       - ``(count + 1) / row sum`` of ``count_mixed``

    Args:
        trainfile: Path of the whitespace-separated training text.

    Returns:
        None.
    """
    with open(trainfile) as fin:
        for line in fin:
            # Build a new list instead of calling remove() while iterating:
            # removing in place shifts the list under the iterator, so the
            # item after every removed one was never examined.
            words = [
                item for item in line.split()
                if not (hanzi_util.is_terminator(item)
                        or (len(item) == 1 and hanzi_util.is_punct(item)))
            ]
            if not words:
                continue

            # BEMS encoding: line_hits[i] is the tag of the i-th character.
            line_hits = []
            for word in words:
                if len(word) == 1:
                    line_hits.append('S')
                else:
                    line_hits.append('B')
                    line_hits.extend('M' * (len(word) - 2))
                    line_hits.append('E')

            chars = ''.join(words)
            # Sanity check that tags and characters line up one-to-one.
            if len(chars) != len(line_hits):
                print("EEEEEEE %d<->%d" % (len(chars), len(line_hits)))

            # Transition counts: consecutive tag pairs.
            # NOTE(review): relies on count_trans already holding an entry
            # for every tag pair - confirm where it is initialised.
            for i in range(len(line_hits) - 1):
                count_trans[line_hits[i]][line_hits[i + 1]] += 1

            # Emission counts: every character, including the last one of
            # the line (the previous len - 1 bound silently dropped it).
            for tag, ch in zip(line_hits, chars):
                count_mixed[tag][ch] = count_mixed[tag].get(ch, 0) + 1

    for k_i, v_i in count_trans.items():
        count = sum(v_i.values())
        if not count:
            # Tag never observed as a predecessor: leave its row untouched
            # rather than dividing by zero.
            continue
        for k_j, v_j in v_i.items():
            P_transMatrix[k_i][k_j] = v_j / count

    # Entries of enumo that were never seen under a tag get a count of 1.
    for v_i in count_mixed.values():
        for item in enumo:
            v_i.setdefault(item, 1)

    for k_i, v_i in count_mixed.items():
        count = sum(v_i.values())
        if not count:
            continue
        for k_j, v_j in v_i.items():
            # Add 1 for smoothing.
            # NOTE(review): the +1 is applied to the numerator only, so a
            # row sums to more than 1 - confirm this is intended.
            P_mixedMatrix[k_i][k_j] = (v_j + 1) / count

    return
示例#4
0
def prep_word_dict():
    """Register vocabulary from IN_FILE and YL_FILE, and write YLP_FILE.

    Pass 1 (``IN_FILE``):
        * a line starting with ``[DDv`` sets the current word to the text
          between offset 5 and the first ``]`` and registers it with
          ``term_to_id``;
        * once a current word is set, a line starting with ``【`` that also
          contains ``=】`` has the text after its first ``】`` split on
          whitespace, and each piece is registered.

    Pass 2 (``YL_FILE`` -> ``YLP_FILE``):
        lines longer than 30 characters are skipped; the others are
        segmented with ``pynlpir.segment``. Segments accepted by ``is_zhs``
        are registered and kept, single-character punctuation is replaced by
        ``PUNCING`` and everything else by ``PADDING``; the segments are then
        written out joined by single spaces.

    Finally ``PADDING`` itself is registered.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    current_word = None
    # Initialise the counter before the first loop: its error handler prints
    # it, and it used to be unbound there (UnboundLocalError).
    line_num = 0
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except Exception:
                # Best effort: report and try the next read. Exception (not
                # a bare except) so Ctrl-C / SystemExit still get through.
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            line_num += 1

            if line[:4] == '[DDv':
                current_word = line[5:line.index(']')]
                term_to_id(current_word)
            elif current_word and line[0] == '【' and ('=】' in line):
                for item in line[line.index('】') + 1:].split():
                    term_to_id(item)

    line_num = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except Exception:
                print("READ ERROR:%d" % (line_num))
                continue
            if not line:
                print("PROCESS DONE!")
                break

            line_num += 1
            if not (line_num % 5000):
                print('C:%d' % (line_num))
            if len(line) > 30:
                continue

            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i, seg in enumerate(seg_list):
                if is_zhs(seg):
                    term_to_id(seg)
                elif len(seg) == 1 and is_punct(seg):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')

    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')
示例#5
0
def prep_word_dict():
    """Build the term dictionary from two input files and emit a masked corpus.

    First ``IN_FILE`` is scanned. A line beginning with ``[DDv`` names a head
    word (the text from offset 5 up to the first ``]``), which is passed to
    ``term_to_id``. While a head word is known, any line that begins with
    ``【`` and contains ``=】`` contributes the whitespace-separated items
    following its first ``】``; each is passed to ``term_to_id`` as well.

    Then ``YL_FILE`` is read line by line. Lines of more than 30 characters
    are ignored. Each remaining line is segmented by ``pynlpir.segment``;
    segments for which ``is_zhs`` is true are passed to ``term_to_id`` and
    kept, one-character punctuation becomes ``PUNCING``, and all other
    segments become ``PADDING``. The space-joined result is appended to
    ``YLP_FILE``.

    ``PADDING`` is registered last.

    Returns:
        None. Status messages are printed to standard output.
    """
    head_word = None
    # The read-error message below reports this counter, so it must exist
    # before the first loop starts (it was previously assigned only later,
    # which made the handler itself fail with UnboundLocalError).
    lines_read = 0
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except Exception:
                # Keep going on a failed read, but do not swallow
                # KeyboardInterrupt / SystemExit as a bare except would.
                print("READ ERROR:%d" % (lines_read))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            lines_read += 1

            if line[:4] == '[DDv':
                head_word = line[5:line.index(']')]
                term_to_id(head_word)
                continue

            if head_word and line[0] == '【' and ('=】' in line):
                related = line[line.index('】') + 1:].split()
                for item in related:
                    term_to_id(item)

    lines_read = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except Exception:
                print("READ ERROR:%d" % (lines_read))
                continue
            if not line:
                print("PROCESS DONE!")
                break

            lines_read += 1
            if lines_read % 5000 == 0:
                print('C:%d' % (lines_read))
            if len(line) > 30:
                continue

            seg_list = pynlpir.segment(line, pos_tagging=False)
            for idx, segment in enumerate(seg_list):
                if is_zhs(segment):
                    term_to_id(segment)
                elif len(segment) == 1 and is_punct(segment):
                    seg_list[idx] = PUNCING
                else:
                    seg_list[idx] = PADDING
            fout.write(' '.join(seg_list) + '\n')

    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')