예제 #1
0
def run_train():
    """Prepare the TED training data and hand it to ``train``.

    The data is built from 'ted_7_ErasePunc_FullKorean__train.txt' with
    ``make_data``, normalized with ``norm_many`` (both receive a shared
    worker pool), and then passed to ``train`` together with the model
    path 'model.tfl'.
    """
    pool = Pool(processes=cpu_count())
    try:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__train.txt')
        print('make train data end.')
        X = norm_many(pool, X)
        print('norm train data end.')
    finally:
        # The pool is only needed for preprocessing. close()/join() lets any
        # queued work finish and then reaps the workers, so they do not sit
        # idle (holding memory) for the whole training run.
        pool.close()
        pool.join()

    train(X, Y, 'model.tfl')
예제 #2
0
def run_test():
    """Prepare the TED test data and run ``interference`` on it.

    The data is built from 'ted_7_ErasePunc_FullKorean__test.txt' with
    ``make_data``, normalized with ``norm_many`` (both receive a shared
    worker pool), and then passed to ``interference`` together with the
    model path 'model.tfl'.
    """
    print('test')
    pool = Pool(processes=cpu_count())
    try:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__test.txt')
        print('make test data end.')
        X = norm_many(pool, X)
        print('norm_data end.')
    finally:
        # Preprocessing is done: let queued work drain and reap the worker
        # processes instead of leaking them for the rest of the run.
        pool.close()
        pool.join()
    interference(X, Y, 'model.tfl')
예제 #3
0
def run_train(train_file):
    """Prepare training data from ``train_file`` and hand it to ``train``.

    Args:
        train_file: Path of the training text file, forwarded unchanged to
            ``make_data``.

    The data is normalized with ``norm_many`` and then passed to ``train``
    together with the model path 'model_MDM001.tfl'.
    """
    print('train')
    pool = Pool(processes=cpu_count())
    try:
        X, Y = make_data(pool, train_file)
        print('make train data end.')
        X = norm_many(pool, X)
        print('norm_data end.')
    finally:
        # The workers are only used for preprocessing; shut them down before
        # training so they are not leaked for the duration of train().
        pool.close()
        pool.join()
    train(X, Y, 'model_MDM001.tfl')
예제 #4
0
def run_test():
    """Prepare the TED test data, run ``test`` and print the first results.

    The data is built from 'ted_7_ErasePunc_FullKorean__test.txt' with
    ``make_data``, normalized with ``norm_many`` (both receive a shared
    worker pool), and then passed to ``test`` together with the model path
    'model.tfl'. The first ten entries of the value returned by ``test``
    are printed.
    """
    pool = Pool(processes=cpu_count())
    try:
        X, Y = make_data(pool, 'ted_7_ErasePunc_FullKorean__test.txt')
        print('make test data end.')
        X = norm_many(pool, X)
        print('norm test data end.')
    finally:
        # Preprocessing is finished: drain and reap the worker processes
        # rather than leaving them alive while the model is evaluated.
        pool.close()
        pool.join()

    pred = test(X, Y, 'model.tfl')
    print('pred[:10]={}'.format(pred[:10]))
예제 #5
0
_USAGE = 'usage: bi_lstm.py (train|test|make)'


def _restore_spacing(line, labels):
    """Return ``line`` with a space inserted before each char labelled '1'.

    ``line`` and ``labels`` are walked in lockstep; characters beyond the
    shorter of the two are dropped. Surrounding whitespace is stripped from
    the result.
    """
    return ''.join(
        ' ' + ch if y == '1' else ch for ch, y in zip(line, labels)
    ).strip()


def _write_test_result(test_file, result_file):
    """Align the label chunks of ``run_test_divided`` with the text lines.

    The whitespace-free lines of ``test_file`` are consumed in order; each
    label chunk is sliced into per-line label runs and the re-spaced lines
    are written to ``result_file`` (UTF-8), one chunk at a time.
    """
    lines = read_text_lines(test_file)
    lines = (refine_line(line) for line in lines)
    lines = [re.sub(r'[\ \n\r]+', '', line).strip() for line in lines]

    i = 0  # index of the next text line that has not been labelled yet
    with codecs.open(result_file, 'w', encoding='utf-8') as wfh:
        for Y in run_test_divided(test_file):
            # Advance through the text only as far as this chunk has labels.
            # NOTE(review): the original comments say each chunk covers a
            # whole number of lines (10000 per chunk) - confirm.
            y_pos = 0
            buf = []
            exhausted = False
            while y_pos < len(Y):
                if i >= len(lines):
                    # More labels than text: stop instead of raising
                    # IndexError and losing the buffered output.
                    exhausted = True
                    break
                line = lines[i]
                buf.append(_restore_spacing(line, Y[y_pos:y_pos + len(line)]))
                y_pos += len(line)
                i += 1

            if buf:
                wfh.write('\n'.join(buf) + '\n')
            if exhausted:
                print('warning: labels remain but all text lines are used',
                      file=sys.stderr)
                return


def _build_h5_dataset(make_file):
    """Convert ``make_file`` into padded X / one-hot Y arrays in an HDF5 file.

    Each chunk yielded by ``make_data_divided`` is normalized, padded to a
    length of 440 and its labels converted with ``to_categorical`` (two
    classes); the chunks are joined along axis 0 and stored in
    'ted_MDM001.h5' as datasets 'MDM001_X' and 'MDM001_Y'.
    """
    pool = Pool(processes=cpu_count())
    x_chunks = []
    y_chunks = []
    try:
        for i, (x, y) in enumerate(make_data_divided(pool, make_file)):
            x = norm_many(pool, x)
            x = pad_sequences(x, maxlen=440, value=0.)
            if len(x) > 0:
                x_chunks.append(x)

            print('{}) x'.format(i), end=', ')
            y = to_categorical(y, nb_classes=2)
            if len(y) > 0:
                y_chunks.append(y)

            print('y')
    finally:
        # Workers are only needed while the chunks are being produced.
        pool.close()
        pool.join()

    # Concatenate once at the end: growing the array chunk by chunk would
    # re-copy everything accumulated so far on every iteration.
    X = np.concatenate(x_chunks, axis=0) if x_chunks else []
    Y = np.concatenate(y_chunks, axis=0) if y_chunks else []

    # TODO: make the file name and dataset names configurable (an earlier
    # variant wrote 'ted_train.h5' with datasets 'ted7_X' / 'ted7_Y').
    with h5py.File('ted_MDM001.h5', 'w') as h5f:
        h5f.create_dataset('MDM001_X', data=X)
        h5f.create_dataset('MDM001_Y', data=Y)


def main():
    """Command-line entry point: dispatch on ``sys.argv[1]``.

    Modes:
        train: run ``run_train_divided`` on 'MDM001_FullKorean__train.txt'.
        test:  label 'ted_7_ErasePunc_FullKorean__test.txt' and write the
               re-spaced text to 'ted_test_result.txt'.
        make:  build 'ted_MDM001.h5' from 'MDM001_FullKorean__train.txt'.

    A missing or unknown mode prints the usage line and exits with status 1.
    """
    if len(sys.argv) < 2:
        print(_USAGE)
        sys.exit(1)

    mode = sys.argv[1]
    if mode == 'train':
        train_file = 'MDM001_FullKorean__train.txt'
        #run_train(train_file)
        run_train_divided(train_file)
    elif mode == 'test':
        _write_test_result('ted_7_ErasePunc_FullKorean__test.txt',
                           'ted_test_result.txt')
    elif mode == 'make':
        _build_h5_dataset('MDM001_FullKorean__train.txt')
    else:
        # Unknown mode is a usage error, same as a missing mode.
        print(_USAGE)
        sys.exit(1)