def run_train():
    """Build the training set from the TED corpus, normalize it, and train
    the model, saving weights to 'model.tfl'.

    NOTE(review): this zero-argument version is shadowed by a later
    `run_train(train_file)` definition in this file — confirm which one is
    intended to survive.
    """
    # Use a context manager so the worker pool is always released;
    # the original leaked the pool (never closed/joined).
    with Pool(processes=cpu_count()) as pool:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__train.txt')
        print('make train data end.')
        X = norm_many(pool, X)
        print('norm train data end.')
    train(X, Y, 'model.tfl')
def run_test():
    """Build the test set from the TED corpus, normalize it, and run the
    model saved in 'model.tfl' over it.

    NOTE(review): shadowed by a later `run_test` definition in this file;
    also, `interference` looks like a typo of "inference" — the name is
    defined elsewhere in the project, so it is kept as-is here.
    """
    print('test')
    # Context manager releases the worker pool; the original leaked it.
    with Pool(processes=cpu_count()) as pool:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__test.txt')
        print('make test data end.')
        X = norm_many(pool, X)
        print('norm_data end.')
    interference(X, Y, 'model.tfl')
def run_train(train_file):
    """Build training data from `train_file`, normalize it, and train the
    model, saving weights to 'model_MDM001.tfl'.

    Args:
        train_file: path to the raw training text file.

    NOTE(review): this redefines the earlier zero-argument `run_train`
    in the same file — the later definition wins at import time.
    """
    print('train')
    # Context manager releases the worker pool; the original leaked it.
    with Pool(processes=cpu_count()) as pool:
        X, Y = make_data(pool, train_file)
        print('make train data end.')
        X = norm_many(pool, X)
        print('norm_data end.')
    train(X, Y, 'model_MDM001.tfl')
def run_test():
    """Build the test set from the TED corpus, normalize it, evaluate the
    model saved in 'model.tfl', and print the first ten predictions.

    NOTE(review): this redefines the earlier `run_test` in the same file —
    the later definition wins at import time.
    """
    # Context manager releases the worker pool; the original leaked it.
    with Pool(processes=cpu_count()) as pool:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__test.txt')
        print('make test data end.')
        X = norm_many(pool, X)
        print('norm test data end.')
    pred = test(X, Y, 'model.tfl')
    print('pred[:10]={}'.format(pred[:10]))
def main():
    """Command-line entry point: dispatch on sys.argv[1] to 'train',
    'test', or 'make' mode."""
    if len(sys.argv) < 2:
        print('usage: bi_lstm.py (train|test|make)')
        sys.exit(1)
    if sys.argv[1] == 'train':
        train_file = 'MDM001_FullKorean__train.txt'
        #run_train(train_file)
        run_train_divided(train_file)
    elif sys.argv[1] == 'test':
        test_file = 'ted_7_ErasePunc_FullKorean__test.txt'
        # Strip all whitespace from each refined line so character positions
        # align 1:1 with the per-character label stream Y.
        lines = read_text_lines(test_file)
        lines = (refine_line(line) for line in lines)
        lines = [re.sub(r'[\ \n\r]+', '', line).strip() for line in lines]
        i = 0  # index of the next unconsumed input line across all Y chunks
        with codecs.open('ted_test_result.txt', 'w', encoding='utf-8') as wfh:
            for Y in run_test_divided(test_file):
                # Merge by checking Y's length against the lines' lengths;
                # alternatively, since Y arrives in ~10000-line chunks, read
                # 10000 lines at a time and compare.
                y_pos = 0  # consumed prefix of this Y chunk, in characters
                buf = []
                while True:
                    # Advance through `lines` only as far as this Y chunk covers.
                    line = lines[i]
                    result = ''
                    # Slice out this line's labels; label '1' means "insert a
                    # space before this character".
                    line_y = Y[y_pos:y_pos + len(line)]
                    for ch, y in zip(line, line_y):
                        if y == '1':
                            result += ' ' + ch
                        else:
                            result += ch
                    buf.append(result.strip())
                    y_pos += len(line)
                    i += 1
                    if y_pos >= len(Y):
                        break
                wfh.write('\n'.join(buf) + '\n')
    elif sys.argv[1] == 'make':
        make_file = 'MDM001_FullKorean__train.txt'
        # NOTE(review): `lines` is built here but never used in this branch —
        # looks like leftover from the 'test' branch; confirm before removing.
        lines = read_text_lines(make_file)
        lines = (refine_line(line) for line in lines)
        lines = [re.sub(r'[\ \n\r]+', '', line).strip() for line in lines]
        i = 0  # chunk counter, used only for progress printing
        pool = Pool(processes=cpu_count())
        X = []
        Y = []
        # Accumulate all divided chunks into single X / Y arrays.
        for x, y in make_data_divided(pool, make_file):
            x = norm_many(pool, x)
            # Pad every sample to a fixed length of 440 with 0.
            x = pad_sequences(x, maxlen=440, value=0.)
            if len(X) > 0:
                X = np.concatenate((X, x), axis=0)
            else:
                X = x
            print('{}) x'.format(i), end=', ')
            # Two classes: space / no-space before the character.
            y = to_categorical(y, nb_classes=2)
            if len(Y) > 0:
                Y = np.concatenate((Y, y), axis=0)
            else:
                Y = y
            print('y')
            i += 1
        # TODO: rename the output file and dataset names
        #h5f = h5py.File('ted_train.h5', 'w')
        #h5f.create_dataset('ted7_X', data=X)
        #h5f.create_dataset('ted7_Y', data=Y)
        h5f = h5py.File('ted_MDM001.h5', 'w')
        h5f.create_dataset('MDM001_X', data=X)
        h5f.create_dataset('MDM001_Y', data=Y)
        h5f.close()
    else:
        print('usage: bi_lstm.py (train|test|make)')