Example no. 1
    def readVocs(self, datafile, corpus_name):
        print("Reading lines...")
        # Read the file and split into lines
        with open(datafile, encoding='utf-8') as f:
            lines = f.read().strip().split('\n')
        # Split every line into pairs and normalize
        pairs = [[self.normalizeString(s) for s in l.split('\t')]
                 for l in lines]
        voc = Voc(corpus_name)
        return voc, pairs
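Every example on this page builds on a Voc vocabulary class that is not itself shown. The following is only a minimal sketch consistent with how the snippets use it (word2index, index2word, addSentence, num_words), modeled on the PyTorch chatbot tutorial's Voc; the original class may differ, and Example no. 6 calls the word counter n_words instead of num_words.

# Minimal Voc sketch (an approximation; the original class is not shown on this page).
PAD_token = 0  # padding token
SOS_token = 1  # start-of-sentence token
EOS_token = 2  # end-of-sentence token


class Voc:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # counts PAD, SOS, EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1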
Example no. 2
# For words shared across genes
import json
import os
from Voc import Voc
from util.datautil import *
from collections import defaultdict
import matplotlib.pyplot as plt
from collections import Counter
current_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_path)
abs_file_path = os.path.dirname(current_dir)
voc_path = abs_file_path + "/data/data_clean_lower.txt"
voc = Voc("total")
voc.initVoc(voc_path)
trainpairs, testpairs = prepareData(abs_file_path,"context")
trainpairs.extend(testpairs)
print(trainpairs[0][0])
print(trainpairs[0][1])
print(trainpairs[0][2])
describe = [pair[2] for pair in trainpairs]
bigsentence = "|".join(sentence for sentences in describe for sentence in sentences)
print(bigsentence)
data = range(1000)
# new_dict = json.dumps({'a': 'Runoob', 'b': 7})

# countdict = {sentences:bigsentence.count(sentences,0,len(bigsentence)) for i in range(len(trainpairs)) for sentences in trainpairs[i][2]}
# print(countdict)
# with open(current_dir + "/record.json", "w") as f:
#     json.dump(countdict, f)
#     print("finished writing to file ...")
with open(current_dir + "/record.json", 'r') as load_f:
    load_dict = json.load(load_f)
Example no. 3
def train(args):
    logging.basicConfig(filename='./log.txt',
                        filemode='w',
                        level=logging.WARNING,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    dataset = DataLoader(args.data_dir)
    dpTree = DependencyParsing()

    ces = CES()
    hes = HES()
    voc = Voc()

    max_text_length = 0
    for data_file_name in [
            args.train_data_file_name, args.dev_data_file_name,
            args.test_data_file_name
    ]:
        datas = dataset.load(data_file_name)
        pairs_label = dataset.sent2pairs_label(datas)
        max_text_length = max(
            [CalculateLength(item[0], dpTree) for item in pairs_label] +
            [CalculateLength(item[1], dpTree)
             for item in pairs_label] + [max_text_length])

    print('max_text_length = %d' % (max_text_length))
    # preparing training set

    datas = dataset.load(args.train_data_file_name)
    pairs_label = dataset.sent2pairs_label(datas)

    sent1_all_preorder = []
    sent1_all_postorder = []
    sent2_all_preorder = []
    sent2_all_postorder = []
    logging.warning("Starting dependencyparsing and tokenizing ...")
    for idx, item in enumerate(pairs_label):
        dependency1 = dpTree.dependencyparsing(item[0])
        sent1_all_preorder.append(ces(dependency1))
        sent1_all_postorder.append(hes(dependency1))

        dependency2 = dpTree.dependencyparsing(item[1])
        sent2_all_preorder.append(ces(dependency2))
        sent2_all_postorder.append(hes(dependency2))

        sent1_tokens = dpTree.tokenize(item[0])
        sent2_tokens = dpTree.tokenize(item[1])

        voc.build_idx2tok(sent1_tokens)
        voc.build_idx2tok(sent2_tokens)

        if (idx + 1) % args.display_interval == 0:
            logging.warning('{}/{} is over...'.format(idx + 1,
                                                      len(pairs_label)))

    voc.build_tok2idx()
    print('voc size = %d' % (len(voc)))
    # pdb.set_trace()  # debugging breakpoint (uncomment to inspect the built vocabulary)
    # preprocess into model-ready inputs
    preprocessor = Preprocessor(dpTree, voc, max_text_length)
    train_datas = []

    logging.warning("Starting preprocessing ...")
    for item, sent1_preorder, sent1_postorder, sent2_preorder, sent2_postorder in zip(
            pairs_label, sent1_all_preorder, sent1_all_postorder,
            sent2_all_preorder, sent2_all_postorder):
        data = []
        data.append(preprocessor.sent2idx(item[0]))
        data.append(preprocessor.ordersent2idx(item[0], sent1_preorder))
        data.append(preprocessor.ordersent2idx(item[0], sent1_postorder))

        data.append(preprocessor.sent2idx(item[1]))
        data.append(preprocessor.ordersent2idx(item[1], sent2_preorder))
        data.append(preprocessor.ordersent2idx(item[1], sent2_postorder))

        data.append(preprocessor.order2sentidx(sent1_preorder))
        data.append(preprocessor.order2sentidx(sent1_postorder))
        data.append(preprocessor.order2sentidx(sent2_preorder))
        data.append(preprocessor.order2sentidx(sent2_postorder))

        data.append([len(sent1_preorder)])
        data.append([len(sent2_preorder)])

        data.append(preprocessor.labelonehots(item[2]))

        train_datas.append(data)

    # preparing development set
    datas = dataset.load(args.dev_data_file_name)
    pairs_label = dataset.sent2pairs_label(datas)

    sent1_all_preorder = []
    sent1_all_postorder = []
    sent2_all_preorder = []
    sent2_all_postorder = []
    logging.warning("Starting dependencyparsing and tokenizing ...")
    for idx, item in enumerate(pairs_label):
        dependency1 = dpTree.dependencyparsing(item[0])
        sent1_all_preorder.append(ces(dependency1))
        sent1_all_postorder.append(hes(dependency1))

        dependency2 = dpTree.dependencyparsing(item[1])
        sent2_all_preorder.append(ces(dependency2))
        sent2_all_postorder.append(hes(dependency2))

        if (idx + 1) % args.display_interval == 0:
            logging.warning('{}/{} is over...'.format(idx + 1,
                                                      len(pairs_label)))

    dev_datas = []
    logging.warning("Starting preprocessing ...")
    for item, sent1_preorder, sent1_postorder, sent2_preorder, sent2_postorder in zip(
            pairs_label, sent1_all_preorder, sent1_all_postorder,
            sent2_all_preorder, sent2_all_postorder):
        data = []
        data.append(preprocessor.sent2idx(item[0]))
        data.append(preprocessor.ordersent2idx(item[0], sent1_preorder))
        data.append(preprocessor.ordersent2idx(item[0], sent1_postorder))

        data.append(preprocessor.sent2idx(item[1]))
        data.append(preprocessor.ordersent2idx(item[1], sent2_preorder))
        data.append(preprocessor.ordersent2idx(item[1], sent2_postorder))

        data.append(preprocessor.order2sentidx(sent1_preorder))
        data.append(preprocessor.order2sentidx(sent1_postorder))
        data.append(preprocessor.order2sentidx(sent2_preorder))
        data.append(preprocessor.order2sentidx(sent2_postorder))

        data.append([len(sent1_preorder)])
        data.append([len(sent2_preorder)])

        data.append(preprocessor.labelonehots(item[2]))

        dev_datas.append(data)

    # start training
    # build GRURNN

    model = GRURNN(max_text_length, args.output_size, args.cell_size,
                   args.batch_size, len(voc), args.embedding_size, args.lr)
    saver = tf.train.Saver(max_to_keep=0)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)

    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("logs/", sess.graph)
    sess.run(tf.global_variables_initializer())
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)

    dev_total_acc = 0
    dev_total_loss = 0
    dev_total_best_acc = 0
    dev_total_best_loss = 0
    stopping_steps = 0

    logging.warning("Starting training ...")
    for epoch in range(args.epochs):
        logging.warning('epoch : {} / {}'.format(epoch + 1, args.epochs))
        train_datas_idx = random.sample(range(len(train_datas)),
                                        len(train_datas))
        for idx, batch in enumerate(
                batchnize(train_datas_idx, train_datas, args.batch_size)):

            # initialize data
            feed_dict = {
                model.premises_normal: batch[0],
                model.premises_preorder: batch[1],
                model.premises_postorder: batch[2],
                model.hypotheses_normal: batch[3],
                model.hypotheses_preorder: batch[4],
                model.hypotheses_postorder: batch[5],
                model.premises_preordersentidx: batch[6],
                model.premises_postordersentidx: batch[7],
                model.hypotheses_preordersentidx: batch[8],
                model.hypotheses_postordersentidx: batch[9],
                model.premises_len: batch[10],
                model.hypotheses_len: batch[11],
                model.labels: batch[12]
            }
            # training
            train_rs, _, train_loss, train_acc, train_logits = sess.run(
                [
                    merged, model.train_op, model.cross_entropy, model.acc,
                    model.logits
                ],
                feed_dict=feed_dict)
            writer.add_summary(train_rs, idx)

            # cal_num += 1
            # total_loss += loss

            # print("{}, {}".format(np.argmax(train_datas[data_idx][12]), np.argmax(logits)))
            # if np.argmax(train_datas[data_idx][12]) == np.argmax(logits):
            #     acc += 1

            # print(logits)
            # print loss
            if (idx + 1) % args.display_interval == 0:
                logging.warning(
                    "idx : {} , cross entropy = {} , acc = {}".format(
                        idx + 1, train_loss, train_acc))
                # cal_num = 0
                # total_loss = 0
                # acc = 0

        dev_datas_idx = list(range(len(dev_datas)))
        for idx, batch in enumerate(
                batchnize(dev_datas_idx, dev_datas, args.batch_size)):
            feed_dict = {
                model.premises_normal: batch[0],
                model.premises_preorder: batch[1],
                model.premises_postorder: batch[2],
                model.hypotheses_normal: batch[3],
                model.hypotheses_preorder: batch[4],
                model.hypotheses_postorder: batch[5],
                model.premises_preordersentidx: batch[6],
                model.premises_postordersentidx: batch[7],
                model.hypotheses_preordersentidx: batch[8],
                model.hypotheses_postordersentidx: batch[9],
                model.premises_len: batch[10],
                model.hypotheses_len: batch[11],
                model.labels: batch[12]
            }
            dev_loss, dev_acc, dev_logits = sess.run(
                [model.cross_entropy, model.acc, model.logits],
                feed_dict=feed_dict)

            dev_total_acc += dev_acc * len(batch[0])
            # logging.warning('dev_total_acc = {} , dev_acc = {} , len(batch[0]) = {}'.format(dev_total_acc, dev_acc, len(batch[0])))
        dev_total_acc /= len(dev_datas)
        logging.warning('epoch : {}, dev_total_acc = {}'.format(
            epoch + 1, dev_total_acc))
        if dev_total_acc > dev_total_best_acc:
            stopping_steps = 0
            dev_total_best_acc = dev_total_acc
            save_path = saver.save(sess,
                                   "model/net.ckpt",
                                   global_step=epoch + 1)
        else:
            stopping_steps += 1

        if stopping_steps >= args.early_stopping_steps:
            logging.warning(
                'Early stopping is triggered at epoch : {}, best development set accuracy : {}'
                .format(epoch + 1, dev_total_best_acc))
            break

        dev_total_acc = 0
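The train() function above also depends on a batchnize helper that is not part of this excerpt. Judging from how batch[0] through batch[12] are fed into the model, it appears to regroup the 13 per-example fields across each mini-batch; the sketch below is only a guess at such a helper, not the original implementation.

# Hypothetical batchnize helper (an assumption inferred from its call sites above).
def batchnize(data_idx, datas, batch_size):
    # datas[i] is a list of 13 fields for example i; yield, per batch, a list
    # whose element f stacks field f across the examples in that batch.
    # Note: the original helper may handle the final partial batch differently
    # (e.g. drop it to keep a fixed batch size).
    for start in range(0, len(data_idx), batch_size):
        examples = [datas[i] for i in data_idx[start:start + batch_size]]
        num_fields = len(examples[0])
        yield [[example[f] for example in examples] for f in range(num_fields)]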
Example no. 4
from __future__ import unicode_literals

from Classes import *
from Proc import Proc
from Voc import Voc
from natasha import NamesExtractor, AddressExtractor
import requests
import os
import torch
import lxml.html
from googlesearch import search

corpus_name = "train"
corpus = os.path.join("Data", corpus_name)
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
voc = Voc(corpus)
# voc.load()
proc = Proc(10, 3)
datafile = os.path.join(corpus, "di_all.txt")
save_dir = os.path.join("Data", "save")
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

checkpoint_iter = 10000
loadFilename = os.path.join(
    save_dir, model_name, corpus_name,
Example no. 5
import pickle
import re

import spacy
import torch
from nltk.tokenize import word_tokenize
from Voc import Voc  # Voc import path as used in the other examples on this page

nlp = spacy.load("en_core_web_sm")

MAX_LENGTH = 15

PAD_token = 0  # padding
SOS_token = 1  # start of sentence
EOS_token = 2  # end of sentence


def normalizeString(s):
    s = re.sub('[^a-z\\-.,!?]', ' ', s.lower())
    s = word_tokenize(s)
    return ' '.join(s)


voc = Voc(None)
with open("model/voc.pkl", 'rb') as file:
    voc = pickle.loads(file.read())

with open("model/pairs.pkl", 'rb') as file:
    pairs = pickle.loads(file.read())
""" transform to numerical sentence  """


def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
Example no. 6
def prepareData(file, filters=None, max_length=None, reverse=False):

    pairs = read_file(file, reverse)
    print(f"Tenemos {len(pairs)} pares de frases")

    if filters is not None:
        assert max_length is not None
        pairs = [filterPairs(pair) for pair in pairs]
        print(f"Filtramos a {len(pairs)} pares de frases")

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Voc('luis')
        output_lang = Voc('ana')
    else:
        input_lang = Voc('ana')
        output_lang = Voc('luis')

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

        # add <eos> token
        pair[0] += " EOS"
        pair[1] += " EOS"

    print("Longitud vocabularios:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)

    return input_lang, output_lang, pairs
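For reference, a possible call to the prepareData variant above; the file path is made up for illustration, and filters is just a truthy placeholder since the function only checks that it is not None.

# Illustrative call; "data/ana-luis.txt" is a hypothetical path.
input_lang, output_lang, pairs = prepareData(
    "data/ana-luis.txt",
    filters=True,      # any non-None value enables filtering
    max_length=10,     # must be provided whenever filters is not None
    reverse=True,      # swap source/target so 'luis' becomes the input language
)
print(pairs[0])        # one (input, target) pair, each ending in " EOS"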
Example no. 7
"""
Usage: run this file to train the model (a general dialogue chatbot) and save it to the local directory.
"""
import os
import pickle

import torch
from Voc import Voc  # Voc import path as used in the other examples on this page

corpus_name = "cornell movie-dialogs corpus"
device = torch.device("cpu")
datafile = os.path.join(corpus_name, 'formatted_movie_lines.txt')
MIN_COUNT = 3
MAX_LENGTH = 15

PAD_token = 0  # padding
SOS_token = 1  # start tag of sentence
EOS_token = 2  # end tag of sentence
"""
Load the voc object and the pairs list from the local directory.
"""
voc = Voc(None)
with open("model/voc.pkl", 'rb') as file:
    voc = pickle.loads(file.read())

with open("model/pairs.pkl", 'rb') as file:
    pairs = pickle.loads(file.read())
""" transform to numerical sentence  """


def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


""" padding """

Example no. 8
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iter), iter,
                                         iter / n_iter * 100, print_loss_avg))
    #     if iter % plot_every == 0:
    #         plot_loss_avg = plot_loss_total / plot_every
    #         plot_losses.append(plot_loss_avg)
    #         plot_loss_total = 0
    # showPlot(plot_losses)


voc_path = current_dir + "/data/eng-fra.txt"
print(voc_path)
vocEng = Voc("eng")
vocFra = Voc("fra")

engVoc = vocEng.naiveInitVoc(voc_path, 0)
fraVoc = vocFra.naiveInitVoc(voc_path, 1)
word2glove = []
# pairs = prepareData(abs_file_path,"concate")
pairs = naivePrepare(voc_path)
print(pairs)
hidden_size = 100

encoder1 = EncoderRNN(vocEng.num_words, hidden_size).to(device)

attn_decoder1 = AttnDecoderRNN(hidden_size, vocFra.num_words,
                               dropout=0.1).to(device)
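The training fragment in this example calls a timeSince helper that is not shown. It matches the helper from the PyTorch seq2seq translation tutorial, reproduced below as an assumption about what the original file contains.

# timeSince helper as defined in the PyTorch seq2seq tutorial (assumed to match
# the original file, which is not shown in this excerpt).
import math
import time


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / percent       # estimated total time
    rs = es - s            # estimated remaining time
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))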
Example no. 9
def readVocs(datafile, corpus_name):
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
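A short illustrative follow-up to readVocs, assuming a Voc class like the sketch under Example no. 1 and a tab-separated data file; the file and corpus names below are placeholders borrowed from Example no. 7, not values from the original project.

# Hypothetical usage; the path and corpus name are placeholders.
voc, pairs = readVocs("formatted_movie_lines.txt", "cornell movie-dialogs corpus")
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)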