예제 #1
0
def e2k_trans(num=None):
    """Serve the ``e2k.html`` page and translate the submitted ``src`` text.

    Any request that is not a POST, and a POST whose ``src`` form field is
    empty, just renders the blank page. Otherwise the text is written to
    ``input.txt``, run through the external tokenizer and BPE scripts, decoded
    with ``e2k_model`` and rendered back into the template.

    Args:
        num: Unused by this function.

    Returns:
        The result of ``render_template`` for ``e2k.html``.
    """
    if request.method != 'POST':
        return render_template("e2k.html")

    input_sen = request.form['src']
    if input_sen == "":
        return render_template("e2k.html")

    # NOTE(review): the scratch files use fixed names in the working
    # directory, so concurrent requests would overwrite each other - confirm
    # the deployment is single-request.
    with open("input.txt", "w", encoding="utf8") as text_file:
        text_file.write(input_sen)

    # Both commands redirect stdout into files, so there is nothing to capture.
    check_output('./tokenizer.perl en < input.txt> input.txt.tok', shell=True)
    check_output('../subword_nmt/apply_bpe.py -c ../data_05/bleu05/test/kr.10000.code < ./input.txt.tok > ./input.txt.tok.sym.sub', shell=True)

    # presumably args.src_file points at the BPE output written above - verify.
    valid_iter = TextIterator(args.src_file, args.en_dict,
                              batch_size=1, maxlen=1000,
                              ahead=1, resume_num=0)

    # Default to an empty translation so an iterator that yields nothing does
    # not leave ``output`` unbound.
    output = ""
    for x_data, _x_mask, _cur_line, _iloop in valid_iter:
        samples = translate_beam_1(e2k_model, x_data, args)
        # Each iteration overwrites the previous result; only the last one is
        # rendered.
        output = unbpe(ids2words(e2k_trg_inv_dict, samples, eos_id=EOS_token))

    output = output.replace(" &apos; ", "\'")
    return render_template('e2k.html', src_contents=input_sen, trans_contents=output)
# Interactive demo: load the saved model, then read three sentences from
# stdin and print the translation produced for each.
model = torch.load(file_name)

for i in range(3):
    input_sen = input("source: ")
    print(input_sen)

    # The context manager closes the file even if the write fails.
    with open("input.kr", "w", encoding="utf8") as text_file:
        text_file.write(input_sen)

    # Both commands redirect stdout into files; the captured values are unused.
    # NOTE(review): the tokenizer is invoked with "en" on input.kr - confirm
    # the language code is intended.
    tokenizer = check_output('./tokenizer.perl en < input.kr> input.kr.tok', shell=True)
    apply_bpe = check_output('../subword_nmt/apply_bpe.py -c ../data_05/bleu05/test/kr.10000.code < ./input.kr.tok > ./input.kr.tok.sub', shell=True)

    # presumably src_file points at the BPE output written above - verify.
    valid_iter = TextIterator(src_file, args.src_dict,
                              batch_size=1, maxlen=1000,
                              ahead=1, resume_num=0)

    for x_data, x_mask, cur_line, iloop in valid_iter:
        # A beam width of 1 uses the dedicated single-beam decoder.
        if args.beam_width == 1:
            samples = translate_beam_1(model, x_data, args)
        else:
            samples = translate_beam_k(model, x_data, args)

        sentence = ids2words(trg_inv_dict, samples, eos_id=EOS_token)
        sentence = unbpe(sentence)
        print("trans: ", sentence)
예제 #3
0
def k2e_trans(num=None):
    """Serve the ``k2e.html`` page and translate the submitted ``src`` text.

    Any request that is not a POST, and a POST whose ``src`` form field is
    empty, just renders the blank page. Otherwise the text goes through these
    steps, each communicating via scratch files in the working directory:
    tokenization, number symbolization (``web_symbolize.py``), proper-noun
    replacement (``convert_pn_for_web``), BPE, decoding with ``k2e_model``,
    and finally restoration of HTML entities, number symbols and proper-noun
    placeholders in the decoded text.

    Args:
        num: Unused by this function.

    Returns:
        The result of ``render_template`` for ``k2e.html``.
    """
    # NOTE: the original author flagged this handler as a quick hack.
    if request.method != 'POST':
        return render_template("k2e.html")

    input_sen = request.form['src']
    if input_sen == "":
        return render_template("k2e.html")
    print("src_kr : " + input_sen)

    # Tokenization.
    with open("input.txt", "w", encoding="utf8") as text_file:
        text_file.write(input_sen)
    check_output('./tokenizer.perl en < input.txt> input.txt.tok', shell=True)

    # Number symbolization (only done for kr -> en so far).
    # The script's exit status is not checked, as in the original.
    call('./web_symbolize.py', shell=True)
    with open("input.txt.tok.sym", "r", encoding="utf8") as text_file:
        replaced_sen = text_file.read()
    print("number_sym : ", replaced_sen)

    # Bible person names => placeholders such as P0.
    lang = "k2e"
    replaced_sen, info_dict = convert_pn_for_web(replaced_sen, PN_list, lang)
    print("replaced_sen : " + replaced_sen)
    print("info_dict : ", info_dict)

    with open("input.txt.tok.sym.pn", "w", encoding="utf8") as text_file:
        text_file.write(replaced_sen)

    # TODO: switch to the BPE codes file that should actually be referenced.
    check_output(
        "../subword_nmt/apply_bpe.py -c "
        "./bible_people/bible_data_GYGJ+NIV/PN/subword/kr.5000.code "
        "< ./input.txt.tok.sym.pn > ./input.txt.tok.sym.pn.sub",
        shell=True)

    # Feed the prepared input to the k2e model.
    # presumably args.src_file points at the BPE output written above - verify.
    valid_iter = TextIterator(args.src_file,
                              args.kr_dict,
                              batch_size=1,
                              maxlen=1000,
                              ahead=1,
                              resume_num=0)

    # Default to an empty translation so an iterator that yields nothing does
    # not leave ``output`` unbound.
    output = ""
    for x_data, _x_mask, _cur_line, _iloop in valid_iter:
        samples = translate_beam_1(k2e_model, x_data, args)
        # Each iteration overwrites the previous result; only the last one is
        # post-processed and rendered.
        output = unbpe(ids2words(k2e_trg_inv_dict, samples, eos_id=EOS_token))

    # Turn the HTML entities back into quote characters, dropping one adjacent
    # space on either side. The padded variants must be tried first.
    for entity, char in (("&apos;", "\'"), ("&quot;", "\"")):
        for pattern in (" " + entity + " ", " " + entity, entity + " ", entity):
            output = output.replace(pattern, char)

    # Undo the number symbolization.
    # CAUTION: pickle.load can execute arbitrary code; "mapping.sym" must only
    # ever be a trusted, locally generated file (presumably written by
    # web_symbolize.py - verify).
    with open("mapping.sym", "rb") as mapping:
        num_dict = pickle.load(mapping)

    print("num_dict : ", num_dict)
    print("output1 : " + output)
    for key, value in num_dict.items():  # e.g. key: __NO / value: 25
        if key in output:
            output = output.replace(key, value)

    # Convert placeholders such as __P0 back into names.
    for key, val in info_dict.items():  # e.g. key: __P0, val: Jesus (in Korean)
        placeholder = key.strip()
        if placeholder not in output:
            continue
        # PN_list holds (source name, replacement name) pairs; every pair whose
        # source name equals the recorded value is applied.
        for PN_key, PN_val in PN_list:
            if val == PN_key:
                output = output.replace(placeholder, PN_val)

    print("output2 : ", output)

    return render_template('k2e.html',
                           src_contents=input_sen,
                           trans_contents=output)
예제 #4
0
def translate_file(args, valid=None, model=None):
    """Translate ``args.valid_src_file`` and either score it or save it.

    When ``valid`` is truthy, each decoded sentence is piped to
    ``perl args.bleu_script args.valid_trg_file`` and the BLEU value is parsed
    from the first line the script prints. Otherwise each sentence is written
    to ``args.trans_file``, one per line.

    Args:
        args: Options object. This function reads ``valid_src_file``,
            ``src_dict``, ``trg_dict``, ``save_dir``, ``model_file``,
            ``use_best``, ``bleu_script``, ``valid_trg_file``, ``trans_file``
            and ``beam_width`` from it, and sets ``trg_words_n`` on it.
        valid: Truthy to score with the BLEU script (single-beam decoding is
            always used then); falsy to write translations to a file.
        model: Model to decode with. When ``None`` it is loaded with
            ``torch.load`` from ``args.save_dir``.

    Returns:
        The parsed BLEU score as a float when ``valid`` is truthy and the
        script's first output line starts with ``BLEU = <number>``; otherwise
        ``-1``.
    """
    # The original called ``torch.no_grad()`` as a bare statement, which only
    # builds a context manager and discards it, so gradients stayed enabled.
    # Disable them for real, and re-enable them on every exit path.
    torch.set_grad_enabled(False)
    mb_subprocess = None
    fp = None
    try:
        valid_iter = TextIterator(args.valid_src_file, args.src_dict,
                                  batch_size=1, maxlen=1000,
                                  ahead=1, resume_num=0)

        trg_dict2 = read_dict(args.trg_dict)
        args.trg_words_n = len(trg_dict2)

        # Invert the target dictionary so decoded ids map back to tokens.
        trg_inv_dict = {vv: kk for kk, vv in trg_dict2.items()}

        # Load the model only when the caller did not supply one.
        if model is None:
            file_name = args.save_dir + '/' + args.model_file + '.pth'
            if args.use_best == 1:
                file_name = file_name + '.best.pth'
                print("Using best model")
            model = torch.load(file_name)

        if valid:
            # NOTE(review): with an argv list and no shell, "<" is passed to
            # the script as a literal argument, not as a redirection. It is
            # kept as in the original - confirm the script ignores it.
            multibleu_cmd = ["perl", args.bleu_script, args.valid_trg_file, "<"]
            mb_subprocess = Popen(multibleu_cmd, stdin=PIPE, stdout=PIPE,
                                  universal_newlines=True)
        else:
            fp = open(args.trans_file, 'w')

        for x_data, _x_mask, _cur_line, iloop in valid_iter:
            if valid or args.beam_width == 1:
                samples = translate_beam_1(model, x_data, args)
            else:
                samples = translate_beam_k(model, x_data, args)
            sentence = unbpe(ids2words(trg_inv_dict, samples, eos_id=EOS_token))

            if valid:
                mb_subprocess.stdin.write(sentence + '\n')
                mb_subprocess.stdin.flush()
                if iloop % 500 == 0:
                    print(iloop, 'is validated...')
            else:
                fp.write(sentence + '\n')
                if iloop % 500 == 0:
                    print(iloop, 'is translated...')

        ret = -1
        if valid:
            # Closing stdin signals end of input to the scoring script.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            if out_parse:
                # The slice keeps a leading space, which float() tolerates.
                ret = float(out_parse.group()[6:])
        return ret
    finally:
        if fp is not None:
            fp.close()
        if mb_subprocess is not None:
            # Terminate as the original did, then reap the child so it does
            # not linger as a zombie.
            mb_subprocess.terminate()
            mb_subprocess.stdout.close()
            mb_subprocess.wait()
        torch.set_grad_enabled(True)