def readVocs(self, datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[self.normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
# For word shared in genes
import json
import os
from collections import Counter, defaultdict

import matplotlib.pyplot as plt

from Voc import Voc
from util.datautil import *

current_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_path)
abs_file_path = os.path.dirname(current_dir)
voc_path = abs_file_path + "/data/data_clean_lower.txt"

voc = Voc("total")
voc.initVoc(voc_path)

trainpairs, testpairs = prepareData(abs_file_path, "context")
trainpairs.extend(testpairs)
print(trainpairs[0][0])
print(trainpairs[0][1])
print(trainpairs[0][2])

describe = [pair[2] for pair in trainpairs]
bigsentence = "|".join([sentences
                        for i in range(len(trainpairs))
                        for sentences in describe[i]])
print(bigsentence)
data = range(1000)

# new_dict = json.dumps({'a': 'Runoob', 'b': 7})
# countdict = {sentences: bigsentence.count(sentences, 0, len(bigsentence))
#              for i in range(len(trainpairs)) for sentences in trainpairs[i][2]}
# print(countdict)
# with open(current_dir + "/record.json", "w") as f:
#     json.dump(countdict, f)
#     print("Finished writing to file...")

with open(current_dir + "/record.json", 'r') as load_f:
    load_dict = json.load(load_f)
def train(args):
    logging.basicConfig(filename='./log.txt',
                        filemode='w',
                        level=logging.WARNING,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    dataset = DataLoader(args.data_dir)
    dpTree = DependencyParsing()
    ces = CES()
    hes = HES()
    voc = Voc()

    # determine the longest text across train/dev/test
    max_text_length = 0
    for data_file_name in [
            args.train_data_file_name, args.dev_data_file_name,
            args.test_data_file_name
    ]:
        datas = dataset.load(data_file_name)
        pairs_label = dataset.sent2pairs_label(datas)
        max_text_length = max(
            [CalculateLength(item[0], dpTree) for item in pairs_label] +
            [CalculateLength(item[1], dpTree) for item in pairs_label] +
            [max_text_length])
    print('max_text_length = %d' % (max_text_length))

    # preparing training set
    datas = dataset.load(args.train_data_file_name)
    pairs_label = dataset.sent2pairs_label(datas)
    sent1_all_preorder = []
    sent1_all_postorder = []
    sent2_all_preorder = []
    sent2_all_postorder = []
    logging.warning("Starting dependency parsing and tokenizing ...")
    for idx, item in enumerate(pairs_label):
        dependency1 = dpTree.dependencyparsing(item[0])
        sent1_all_preorder.append(ces(dependency1))
        sent1_all_postorder.append(hes(dependency1))
        dependency2 = dpTree.dependencyparsing(item[1])
        sent2_all_preorder.append(ces(dependency2))
        sent2_all_postorder.append(hes(dependency2))
        sent1_tokens = dpTree.tokenize(item[0])
        sent2_tokens = dpTree.tokenize(item[1])
        voc.build_idx2tok(sent1_tokens)
        voc.build_idx2tok(sent2_tokens)
        if (idx + 1) % args.display_interval == 0:
            logging.warning('{}/{} is over...'.format(idx + 1,
                                                      len(pairs_label)))
    voc.build_tok2idx()
    print('voc size = %d' % (len(voc)))
    pdb.set_trace()

    # preprocess into model-ready input
    preprocessor = Preprocessor(dpTree, voc, max_text_length)
    train_datas = []
    logging.warning("Starting preprocessing ...")
    for item, sent1_preorder, sent1_postorder, sent2_preorder, sent2_postorder in zip(
            pairs_label, sent1_all_preorder, sent1_all_postorder,
            sent2_all_preorder, sent2_all_postorder):
        data = []
        data.append(preprocessor.sent2idx(item[0]))
        data.append(preprocessor.ordersent2idx(item[0], sent1_preorder))
        data.append(preprocessor.ordersent2idx(item[0], sent1_postorder))
        data.append(preprocessor.sent2idx(item[1]))
        data.append(preprocessor.ordersent2idx(item[1], sent2_preorder))
        data.append(preprocessor.ordersent2idx(item[1], sent2_postorder))
        data.append(preprocessor.order2sentidx(sent1_preorder))
        data.append(preprocessor.order2sentidx(sent1_postorder))
        data.append(preprocessor.order2sentidx(sent2_preorder))
        data.append(preprocessor.order2sentidx(sent2_postorder))
        data.append([len(sent1_preorder)])
        data.append([len(sent2_preorder)])
        data.append(preprocessor.labelonehots(item[2]))
        train_datas.append(data)

    # preparing development set
    datas = dataset.load(args.dev_data_file_name)
    pairs_label = dataset.sent2pairs_label(datas)
    sent1_all_preorder = []
    sent1_all_postorder = []
    sent2_all_preorder = []
    sent2_all_postorder = []
    logging.warning("Starting dependency parsing and tokenizing ...")
    for idx, item in enumerate(pairs_label):
        dependency1 = dpTree.dependencyparsing(item[0])
        sent1_all_preorder.append(ces(dependency1))
        sent1_all_postorder.append(hes(dependency1))
        dependency2 = dpTree.dependencyparsing(item[1])
        sent2_all_preorder.append(ces(dependency2))
        sent2_all_postorder.append(hes(dependency2))
        if (idx + 1) % args.display_interval == 0:
            logging.warning('{}/{} is over...'.format(idx + 1,
                                                      len(pairs_label)))

    dev_datas = []
    logging.warning("Starting preprocessing ...")
    for item, sent1_preorder, sent1_postorder, sent2_preorder, sent2_postorder in zip(
            pairs_label, sent1_all_preorder, sent1_all_postorder,
            sent2_all_preorder, sent2_all_postorder):
        data = []
        data.append(preprocessor.sent2idx(item[0]))
        data.append(preprocessor.ordersent2idx(item[0], sent1_preorder))
        data.append(preprocessor.ordersent2idx(item[0], sent1_postorder))
        data.append(preprocessor.sent2idx(item[1]))
        data.append(preprocessor.ordersent2idx(item[1], sent2_preorder))
        data.append(preprocessor.ordersent2idx(item[1], sent2_postorder))
        data.append(preprocessor.order2sentidx(sent1_preorder))
        data.append(preprocessor.order2sentidx(sent1_postorder))
        data.append(preprocessor.order2sentidx(sent2_preorder))
        data.append(preprocessor.order2sentidx(sent2_postorder))
        data.append([len(sent1_preorder)])
        data.append([len(sent2_preorder)])
        data.append(preprocessor.labelonehots(item[2]))
        dev_datas.append(data)

    # start training: build the GRURNN model
    model = GRURNN(max_text_length, args.output_size, args.cell_size,
                   args.batch_size, len(voc), args.embedding_size, args.lr)
    saver = tf.train.Saver(max_to_keep=0)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("logs/", sess.graph)
    sess.run(tf.global_variables_initializer())
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    dev_total_acc = 0
    dev_total_loss = 0
    dev_total_best_acc = 0
    dev_total_best_loss = 0
    stopping_steps = 0
    logging.warning("Starting training ...")
    for epoch in range(args.epochs):
        logging.warning('epoch : {} / {}'.format(epoch + 1, args.epochs))
        train_datas_idx = random.sample(range(len(train_datas)),
                                        len(train_datas))
        for idx, batch in enumerate(
                batchnize(train_datas_idx, train_datas, args.batch_size)):
            # initialize data
            feed_dict = {
                model.premises_normal: batch[0],
                model.premises_preorder: batch[1],
                model.premises_postorder: batch[2],
                model.hypotheses_normal: batch[3],
                model.hypotheses_preorder: batch[4],
                model.hypotheses_postorder: batch[5],
                model.premises_preordersentidx: batch[6],
                model.premises_postordersentidx: batch[7],
                model.hypotheses_preordersentidx: batch[8],
                model.hypotheses_postordersentidx: batch[9],
                model.premises_len: batch[10],
                model.hypotheses_len: batch[11],
                model.labels: batch[12]
            }
            # training
            train_rs, _, train_loss, train_acc, train_logits = sess.run(
                [
                    merged, model.train_op, model.cross_entropy, model.acc,
                    model.logits
                ],
                feed_dict=feed_dict)
            writer.add_summary(train_rs, idx)
            # cal_num += 1
            # total_loss += loss
            # print("{}, {}".format(np.argmax(train_datas[data_idx][12]), np.argmax(logits)))
            # if np.argmax(train_datas[data_idx][12]) == np.argmax(logits):
            #     acc += 1
            # print(logits)
            # print loss
            if (idx + 1) % args.display_interval == 0:
                logging.warning(
                    "idx : {} , cross entropy = {} , acc = {}".format(
                        idx + 1, train_loss, train_acc))
                # cal_num = 0
                # total_loss = 0
                # acc = 0

        # evaluate on the development set after each epoch
        dev_datas_idx = list(range(len(dev_datas)))
        for idx, batch in enumerate(
                batchnize(dev_datas_idx, dev_datas, args.batch_size)):
            feed_dict = {
                model.premises_normal: batch[0],
                model.premises_preorder: batch[1],
                model.premises_postorder: batch[2],
                model.hypotheses_normal: batch[3],
                model.hypotheses_preorder: batch[4],
                model.hypotheses_postorder: batch[5],
                model.premises_preordersentidx: batch[6],
                model.premises_postordersentidx: batch[7],
                model.hypotheses_preordersentidx: batch[8],
                model.hypotheses_postordersentidx: batch[9],
                model.premises_len: batch[10],
                model.hypotheses_len: batch[11],
                model.labels: batch[12]
            }
            dev_loss, dev_acc, dev_logits = sess.run(
                [model.cross_entropy, model.acc, model.logits],
                feed_dict=feed_dict)
            dev_total_acc += dev_acc * len(batch[0])
            # logging.warning('dev_total_acc = {} , dev_acc = {} , len(batch[0]) = {}'.format(dev_total_acc, dev_acc, len(batch[0])))

        dev_total_acc /= len(dev_datas)
        logging.warning('epoch : {}, dev_total_acc = {}'.format(
            epoch + 1, dev_total_acc))
        # early stopping on development-set accuracy
        if dev_total_acc > dev_total_best_acc:
            stopping_steps = 0
            dev_total_best_acc = dev_total_acc
            save_path = saver.save(sess,
                                   "model/net.ckpt",
                                   global_step=epoch + 1)
        else:
            stopping_steps += 1
            if stopping_steps >= args.early_stopping_steps:
                logging.warning(
                    'Early stopping is triggered at epoch : {}, best development set accuracy : {}'
                    .format(epoch + 1, dev_total_best_acc))
                break
        dev_total_acc = 0
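# ------------------------------------------------------------------------
# NOTE (editorial sketch): train() only receives an `args` namespace and the
# snippet never shows how it is built. The argparse wiring below is a minimal
# assumption: the flag names mirror the args attributes used above, but every
# default value is invented for illustration and not taken from the source.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='data/')
    parser.add_argument('--train_data_file_name', default='train.txt')
    parser.add_argument('--dev_data_file_name', default='dev.txt')
    parser.add_argument('--test_data_file_name', default='test.txt')
    parser.add_argument('--display_interval', type=int, default=100)
    parser.add_argument('--output_size', type=int, default=3)
    parser.add_argument('--cell_size', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--embedding_size', type=int, default=300)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--early_stopping_steps', type=int, default=5)
    return parser.parse_args()


if __name__ == '__main__':
    train(parse_args())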
from __future__ import unicode_literals

import os

import lxml.html
import requests
import torch  # assumed explicit import; torch.cuda / torch.device are used below
from googlesearch import search
from natasha import NamesExtractor, AddressExtractor

from Classes import *
from Proc import Proc
from Voc import Voc

corpus_name = "train"
corpus = os.path.join("Data", corpus_name)

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

voc = Voc(corpus)
# voc.load()
proc = Proc(10, 3)

datafile = os.path.join(corpus, "di_all.txt")
save_dir = os.path.join("Data", "save")

# model configuration
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
checkpoint_iter = 10000

loadFilename = os.path.join(
    save_dir, model_name, corpus_name,
# Imports inferred from the usage below.
import pickle
import re

import spacy
import torch
from nltk.tokenize import word_tokenize

from Voc import Voc

nlp = spacy.load("en_core_web_sm")

MAX_LENGTH = 15
PAD_token = 0  # padding token
SOS_token = 1  # start-of-sentence token
EOS_token = 2  # end-of-sentence token


def normalizeString(s):
    # Keep only lowercase letters, hyphens, and basic punctuation, then re-tokenize.
    s = re.sub('[^a-z\\-.,!?]', ' ', s.lower())
    s = word_tokenize(s)
    return ' '.join(s)


# Load the vocabulary and sentence pairs saved by the training script.
voc = Voc(None)
with open("model/voc.pkl", 'rb') as file:
    voc = pickle.loads(file.read())
with open("model/pairs.pkl", 'rb') as file:
    pairs = pickle.loads(file.read())

"""
transform to numerical sentence
"""


def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
def prepareData(file, filters=None, max_length=None, reverse=False):
    pairs = read_file(file, reverse)
    print(f"We have {len(pairs)} sentence pairs")

    if filters is not None:
        assert max_length is not None
        pairs = [filterPairs(pair) for pair in pairs]
        print(f"Filtered down to {len(pairs)} sentence pairs")

    # Reverse pairs and create Voc instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Voc('luis')
        output_lang = Voc('ana')
    else:
        input_lang = Voc('ana')
        output_lang = Voc('luis')

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
        # add <eos> token
        pair[0] += " EOS"
        pair[1] += " EOS"

    print("Vocabulary sizes:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
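# ------------------------------------------------------------------------
# NOTE (editorial sketch): a hypothetical call to prepareData(), only to show
# the return signature. The file path and filter settings are illustrative,
# and read_file/filterPairs are assumed to be defined in the same module.
input_lang, output_lang, pairs = prepareData("data/ana-luis.txt",
                                             filters=True, max_length=10,
                                             reverse=True)
print(len(pairs))          # number of pairs, each sentence ending in " EOS"
print(input_lang.n_words)  # input-side vocabulary size ('luis' when reverse=True)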
Usage:
    Run this file to train the model (a general dialogue chatbot) and save it
    to the local directory.
"""
# Imports inferred from the usage below.
import os
import pickle

import torch

from Voc import Voc

corpus_name = "cornell movie-dialogs corpus"
device = torch.device("cpu")
datafile = os.path.join(corpus_name, 'formatted_movie_lines.txt')

MIN_COUNT = 3
MAX_LENGTH = 15
PAD_token = 0  # padding token
SOS_token = 1  # start tag of sentence
EOS_token = 2  # end tag of sentence

"""
Loading voc object and pairs variable from local directory.
"""
voc = Voc(None)
with open("model/voc.pkl", 'rb') as file:
    voc = pickle.loads(file.read())
with open("model/pairs.pkl", 'rb') as file:
    pairs = pickle.loads(file.read())

"""
transform to numerical sentence
"""


def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


"""
padding
"""
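# ------------------------------------------------------------------------
# NOTE (editorial sketch): the snippet ends at the "padding" heading before
# showing the helper itself. The two functions below are the usual way such
# batches are padded and masked; the names zeroPadding and binaryMatrix are
# an assumption, not the file's actual code.
import itertools


def zeroPadding(l, fillvalue=PAD_token):
    # Transpose a batch of index lists into time-major order, filling shorter
    # sentences with PAD_token.
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))


def binaryMatrix(l, value=PAD_token):
    # Build a mask over the padded batch: 1 for real tokens, 0 for padding.
    m = []
    for seq in l:
        m.append([0 if token == value else 1 for token in seq])
    return m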
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iter),
                                         iter, iter / n_iter * 100,
                                         print_loss_avg))
        # if iter % plot_every == 0:
        #     plot_loss_avg = plot_loss_total / plot_every
        #     plot_losses.append(plot_loss_avg)
        #     plot_loss_total = 0
    # showPlot(plot_losses)


voc_path = current_dir + "/data/eng-fra.txt"
print(voc_path)
vocEng = Voc("eng")
vocFra = Voc("fra")
engVoc = vocEng.naiveInitVoc(voc_path, 0)
fraVoc = vocFra.naiveInitVoc(voc_path, 1)
word2glove = []
# pairs = prepareData(abs_file_path,"concate")
pairs = naivePrepare(voc_path)
print(pairs)

hidden_size = 100
encoder1 = EncoderRNN(vocEng.num_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, vocFra.num_words,
                               dropout=0.1).to(device)
def readVocs(datafile, corpus_name):
    lines = open(datafile, encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
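# ------------------------------------------------------------------------
# NOTE (editorial sketch): every snippet above depends on a Voc class that is
# never shown. The class below is an assumed sketch of the common core
# (word2index, index2word, addSentence); the real implementations differ per
# project (n_words vs. num_words, plus extras such as initVoc, naiveInitVoc,
# and build_idx2tok), so treat this only as orientation.
PAD_token, SOS_token, EOS_token = 0, 1, 2  # special token ids assumed above


class Voc:
    """Minimal vocabulary: maps words to indices and keeps word counts."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # counts PAD, SOS, EOS

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.word2count[word] = 1
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def addSentence(self, sentence):
        # Sentences are assumed to be whitespace-tokenized strings.
        for word in sentence.split(' '):
            self.addWord(word)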