Пример #1
0
data_count = 20000

# Build the vocabularies from the tab-separated parallel corpus.
# The whole file is read; only the first `data_count` lines are used below.
with open('add.txt', 'r', encoding='utf-8-sig') as f:
    data = f.readlines()

inputs, outputs = [], []
for line in tqdm(data[:data_count]):
    # Every line must contain exactly two tab-separated fields,
    # otherwise the unpacking raises ValueError.
    cn, en = line.strip('\n').split('\t')

    # Insert a space before each comma so the comma is split off as its own
    # element, drop the last character (the sentence-final punctuation,
    # according to the original comment), then lowercase.
    comma_spaced = cn.replace(',', ' ,')
    inputs.append(comma_spaced[:-1].lower())

    # The label only loses its last character (sentence-final punctuation).
    outputs.append(en[:-1])

# Segment the labels first and the inputs second, as in the original order.
outputs = en_segment(outputs)
inputs = cn_segment(inputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab have made-----------')

# Configure the hyper-parameters: training flag off, vocabulary sizes taken
# from the vocabularies built above.
arg = create_hparams()
arg.is_training = False
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)

# Build the model graph from those hyper-parameters.
g = Graph(arg)
Пример #2
0
# Load the tab-separated parallel corpus.
# NOTE(review): `data_count` is not defined in this snippet — presumably it is
# set earlier in the file; confirm.
with open('self.txt', 'r', encoding='utf-8-sig') as f:
    data = f.readlines()

inputs, outputs = [], []
for line in tqdm(data[:data_count]):
    # Every line must contain exactly two tab-separated fields,
    # otherwise the unpacking raises ValueError.
    en, cn = line.strip('\n').split('\t')

    # The label only loses its last character (the sentence-final
    # punctuation, according to the original comment).
    outputs.append(cn[:-1])

    # Insert a space before each comma so the comma is split off as its own
    # element, drop the last character, then lowercase.
    comma_spaced = en.replace(',', ' ,')
    inputs.append(comma_spaced[:-1].lower())

# NOTE(review): the field named `en` ends up in `inputs`, which is passed to
# `cn_segment`, while `cn` ends up in `outputs`, which is passed to
# `en_segment`. Either the variable names or the segmenter choice looks
# swapped. Behaviour is left exactly as it was — confirm against the actual
# column order of self.txt.
inputs = cn_segment(inputs)
outputs = en_segment(outputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab have made-----------')

# Convert the segmented sentences into encoder inputs, decoder inputs and
# decoder targets using the two vocabularies.
encoder_inputs, decoder_inputs, decoder_targets = data_format(
    inputs, outputs, encoder_vocab, decoder_vocab)

# Configure the hyper-parameters: vocabulary sizes from the vocabularies
# built above, number of epochs from `epoch`.
# NOTE(review): `epoch` is not defined in this snippet — presumably it is set
# earlier in the file; confirm.
arg = create_hparams()
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)
arg.epochs = epoch
Пример #3
0
data_count = 1000

# Build the vocabularies from the tab-separated parallel corpus.
# The whole file is read; only the first `data_count` lines are used below.
with open('cmn.txt', 'r', encoding='utf8') as f:
    data = f.readlines()

inputs, outputs = [], []
for line in tqdm(data[:data_count]):
    # Every line must contain exactly two tab-separated fields,
    # otherwise the unpacking raises ValueError.
    en, cn = line.strip('\n').split('\t')

    # Insert a space before each comma so the comma is split off as its own
    # element, drop the last character (the sentence-final punctuation,
    # according to the original comment), then lowercase.
    comma_spaced = en.replace(',', ' ,')
    inputs.append(comma_spaced[:-1].lower())

    # The label only loses its last character (sentence-final punctuation).
    outputs.append(cn[:-1])

# Segment the inputs first and the labels second, as in the original order.
inputs = en_segment(inputs)
outputs = cn_segment(outputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab have made-----------')

# Configure the hyper-parameters: training flag off, vocabulary sizes taken
# from the vocabularies built above.
arg = create_hparams()
arg.is_training = False
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)

# Build the model graph from those hyper-parameters.
g = Graph(arg)