data_count = 20000

# make_vocab
with open('add.txt', 'r', encoding='utf-8-sig') as f:
    data = f.readlines()

inputs = []
outputs = []
for line in tqdm(data[:data_count]):
    cn, en = line.strip('\n').split('\t')
    # A space already follows each comma in the sentence; add one before it too
    # so the comma is split off as its own token, then drop the sentence-final
    # punctuation and lowercase.
    inputs.append(cn.replace(',', ' ,')[:-1].lower())
    # Drop the sentence-final punctuation of the English target sentence.
    outputs.append(en[:-1])

outputs = en_segment(outputs)
inputs = cn_segment(inputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab has been made-----------')

arg = create_hparams()
arg.is_training = False
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)
g = Graph(arg)
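For reference, here is a minimal sketch of what make_vocab is assumed to do after segmentation: build one word list per side with a few reserved special tokens, so that len(encoder_vocab) and len(decoder_vocab) can size the embedding tables. The special-token names and ordering are assumptions for illustration; the project's own make_vocab may differ.

# Hypothetical sketch of make_vocab; special tokens and ordering are assumed.
def make_vocab(inputs, outputs):
    def build(sentences):
        vocab = ['<PAD>', '<UNK>', '<GO>', '<EOS>']   # assumed special tokens
        seen = set(vocab)
        for sent in sentences:          # each sentence is a list of tokens
            for word in sent:
                if word not in seen:
                    seen.add(word)
                    vocab.append(word)
        return vocab
    return build(inputs), build(outputs)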
with open('self.txt', 'r', encoding='utf-8-sig') as f:
    data = f.readlines()

inputs = []
outputs = []
for line in tqdm(data[:data_count]):
    en, cn = line.strip('\n').split('\t')
    # Drop the sentence-final punctuation of the Chinese target sentence.
    outputs.append(cn[:-1])
    # A space already follows each comma in the sentence; add one before it too
    # so the comma is split off as its own token, then drop the sentence-final
    # punctuation and lowercase.
    inputs.append(en.replace(',', ' ,')[:-1].lower())

inputs = cn_segment(inputs)
outputs = en_segment(outputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab has been made-----------')

encoder_inputs, decoder_inputs, decoder_targets = data_format(
    inputs, outputs, encoder_vocab, decoder_vocab)

arg = create_hparams()
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)
arg.epochs = epoch
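As a rough illustration of the arrays data_format is expected to return for training: encoder inputs as padded index sequences, decoder inputs as the target shifted right behind a start token, and decoder targets as the target followed by an end token. The padding scheme, special tokens, and index lookup below are assumptions, not the project's actual implementation.

# Hypothetical sketch of data_format; follows the vocab sketch above.
def data_format(inputs, outputs, encoder_vocab, decoder_vocab):
    enc_idx = {w: i for i, w in enumerate(encoder_vocab)}
    dec_idx = {w: i for i, w in enumerate(decoder_vocab)}

    def to_ids(sent, idx):
        return [idx.get(w, idx['<UNK>']) for w in sent]

    def pad(batch, pad_id):
        max_len = max(len(s) for s in batch)
        return [s + [pad_id] * (max_len - len(s)) for s in batch]

    encoder_inputs = [to_ids(s, enc_idx) for s in inputs]
    # Decoder input starts with <GO>; decoder target ends with <EOS>.
    decoder_inputs = [[dec_idx['<GO>']] + to_ids(s, dec_idx) for s in outputs]
    decoder_targets = [to_ids(s, dec_idx) + [dec_idx['<EOS>']] for s in outputs]

    return (pad(encoder_inputs, enc_idx['<PAD>']),
            pad(decoder_inputs, dec_idx['<PAD>']),
            pad(decoder_targets, dec_idx['<PAD>']))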
data_count = 1000

# make_vocab
with open('cmn.txt', 'r', encoding='utf8') as f:
    data = f.readlines()

inputs = []
outputs = []
for line in tqdm(data[:data_count]):
    en, cn = line.strip('\n').split('\t')
    # A space already follows each comma in the sentence; add one before it too
    # so the comma is split off as its own token, then drop the sentence-final
    # punctuation and lowercase.
    inputs.append(en.replace(',', ' ,')[:-1].lower())
    # Drop the sentence-final punctuation of the Chinese target sentence.
    outputs.append(cn[:-1])

inputs = en_segment(inputs)
outputs = cn_segment(outputs)

encoder_vocab, decoder_vocab = make_vocab(inputs, outputs)
print('\n-----------vocab has been made-----------')

arg = create_hparams()
arg.is_training = False
arg.input_vocab_size = len(encoder_vocab)
arg.label_vocab_size = len(decoder_vocab)
g = Graph(arg)
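For context, a plausible pair of segmenters matching how they are used in these snippets: English sentences are split on whitespace (the comma spacing above makes commas separate tokens), and Chinese sentences are segmented with jieba. These are illustrative stand-ins under that assumption; the en_segment and cn_segment defined elsewhere in this project may work differently.

import jieba  # assumed Chinese word segmenter; the project may use another tool

def en_segment(sentences):
    # Split each lowercased English sentence on spaces into a token list.
    return [sent.split(' ') for sent in sentences]

def cn_segment(sentences):
    # Segment each Chinese sentence into a token list with jieba.
    return [list(jieba.cut(sent)) for sent in sentences]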