Example #1
def main():

    import argparse
    import os  # needed for os.path.join below

    hlp = "Compute some stats about alignments."

    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True, 
        help="Path to Cheng&Lapata data.")
    parser.add_argument("--alignments-path", required=True,
        help="Path to token alignments.")
    parser.add_argument("--output-vocab", required=True,
        help="Location to write output vocab.")

    args = parser.parse_args()
    #vocab_in = read_vocab(args.input_vocab)
    vocab_out = read_vocab(args.output_vocab) 

    for split in ["training", "validation", "test"]:
        print("Split: {}".format(split))
        data_path = os.path.join(args.data_path, args.corpus, split)
        alignments_path = os.path.join(
            args.alignments_path, args.corpus, split)
        
        collect_split_stats(data_path, alignments_path, vocab_out)
Example #2
    def __init__(self,
                 train_input_file,
                 train_target_file,
                 test_input_file,
                 test_target_file,
                 vocab_file,
                 num_units,
                 layers,
                 dropout,
                 batch_size,
                 learning_rate,
                 output_dir,
                 save_step=100,
                 eval_step=1000,
                 param_histogram=False,
                 restore_model=False,
                 init_train=True,
                 init_infer=False):
        self.num_units = num_units
        self.layers = layers
        self.dropout = dropout
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.save_step = save_step
        self.eval_step = eval_step
        self.param_histogram = param_histogram
        self.restore_model = restore_model
        self.init_train = init_train
        self.init_infer = init_infer

        if init_train:
            self.train_reader = data.SeqReader(train_input_file,
                                               train_target_file, vocab_file,
                                               batch_size)  # training data reader
            self.train_reader.start()
            self.train_data = self.train_reader.read()
            self.eval_reader = data.SeqReader(test_input_file,
                                              test_target_file, vocab_file,
                                              batch_size)  # evaluation data reader

            self.eval_reader.start()
            self.eval_data = self.eval_reader.read()

        self.model_file = path.join(output_dir, 'model.ckpl')
        self.log_writter = tf.summary.FileWriter(output_dir)

        if init_train:
            self._init_train()
            self._init_eval()

        if init_infer:
            self.infer_vocabs = data.read_vocab(vocab_file)
            self.infer_vocab_indices = dict(
                (c, i) for i, c in enumerate(self.infer_vocabs))
            self._init_infer()
            self.reload_infer_model()
Example #3
def evaluate_line():
    config_path = os.path.join(FLAGS.config_path, 'config')
    test_config = load_config(config_path)

    _, word_to_id = read_vocab(test_config['vocab_file'])
    categorys, cat_to_id = read_category()
    contents, labels = read_file('data/cnews.val2.txt')
    model = Model(test_config)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # restore the trained model from the latest checkpoint
    checkpoint_path = os.path.join(FLAGS.checkpoints_path)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_path)
    saver.restore(session, checkpoint_file)

    while True:
        line = input("Enter a test sentence: ")
        x_input = [[word_to_id[x] for x in line if x in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(x_input, 600)
        predict = model.evaluate(session, x_pad)
        print(categorys[predict[0][0]])
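
The read_vocab helper itself is not shown in this listing. Examples #3, #4, and #7 unpack its return value as (words, word_to_id), so a minimal sketch consistent with that interface could look as follows; the one-token-per-line file format and the UTF-8 encoding are assumptions, not confirmed by the source.

def read_vocab(vocab_file):
    # Assumed format: one vocabulary token per line.
    with open(vocab_file, encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    word_to_id = {word: i for i, word in enumerate(words)}
    return words, word_to_id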
Example #4
def get_train_config():
    train_contents, train_labels = read_file(FLAGS.train_file)
    # 1. Build the vocabulary from the training data (or load it if it already exists)
    if not os.path.exists(FLAGS.vocab_file):
        words = build_vocab(train_contents, FLAGS.vocab_file)
    else:
        words, _ = read_vocab(FLAGS.vocab_file)
    # 2. Get the category data, build the category-to-id mapping, and save it to file
    categories, cat_to_id = read_category()
    # 3. Generate the training configuration file
    vocab_size = len(words)
    num_classes = len(categories)
    # Using the true maximum content length would run out of memory,
    # so cap the sequence length at 600.
    # seq_len = max([len(content) for content in train_contents])
    seq_len = 600
    filter_sizes = [int(i) for i in FLAGS.filter_sizes.split(',')]
    # Create the required output directories
    make_path(FLAGS)
    config_path = os.path.join(FLAGS.config_path, 'config')
    if not os.path.isfile(config_path):
        train_config = config_model(seq_len, vocab_size, num_classes,
                                    filter_sizes)
        save_config(train_config, config_path)
    else:
        # Config already exists: load it so train_config is always defined.
        train_config = load_config(config_path)
    return train_config
Example #5
from itertools import chain

import nltk
import numpy as np

import data_utils
vocab = data_utils.read_vocab(data_utils.vocab_path)

X, Y = data_utils.prepare_data(data_utils.pos_train, vocab)

# pos = nltk.pos_tag()
# print pos

# X_indexes = []
# for window in X:
#     window_indexes = []
#     for word in window:
#         if(word in vocab):
#             window_indexes.append(vocab[word])
#         else:
#             window_indexes.append(vocab['UUUNKKK'])
#     X_indexes.append(window_indexes)
#
print(Y[:1])
print(X[:1])
# print X_indexes[0]
# embeddings = data_utils.read_embeddings(data_utils.wv_path)
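
Examples #5 and #6 call data_utils.read_vocab instead, and the commented-out lookup code above treats the result as a token-to-index mapping (word in vocab, vocab[word], len(vocab)). A minimal sketch consistent with that usage, again assuming a plain one-token-per-line vocabulary file, could be:

def read_vocab(vocab_path):
    # Assumed data_utils variant: map each token (one per line) to an integer index.
    vocab = {}
    with open(vocab_path, encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            if token and token not in vocab:
                vocab[token] = len(vocab)
    return vocab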

Example #6
def main():
    PRETRAIN_EMBEDDINGS = False
    USE_SUBWORDS = True
    if len(sys.argv) > 1:
        # sys.argv values are strings, so parse them into booleans explicitly
        PRETRAIN_EMBEDDINGS = sys.argv[1].lower() in ('1', 'true', 'yes')
    if len(sys.argv) > 2:
        USE_SUBWORDS = sys.argv[2].lower() in ('1', 'true', 'yes')

    print('pre-trained embeddings is set to %s' % PRETRAIN_EMBEDDINGS)
    print('sub-words embeddings is set to %s' % USE_SUBWORDS)

    EPOCHS_TO_TRAIN = 100
    CONTEXT_SIZE = 5
    BATCH_SIZE = 1000
    WORKERS = 2

    vocab = data_utils.read_vocab(data_utils.vocab_path)
    embeddings = np.random.randn(len(vocab), 50) / np.sqrt(len(vocab))
    EMBEDDING_DIM = len(embeddings[0])
    vocab_reversed = {}
    if PRETRAIN_EMBEDDINGS:
        embeddings = data_utils.read_embeddings(data_utils.wv_path)
    if USE_SUBWORDS:
        embeddings, vocab, vocab_reversed = data_utils.generate_embeddings_with_prefixes(
            embeddings, vocab, EMBEDDING_DIM)

    print('Starting execution...')
    print('using EMBEDDING_DIM of %s' % EMBEDDING_DIM)
    print('using CONTEXT_SIZE of %s' % CONTEXT_SIZE)
    print('using BATCH_SIZE of %s' % BATCH_SIZE)
    print('using EPOCHS_TO_TRAIN of %s' % EPOCHS_TO_TRAIN)

    class Net(nn.Module):
        def __init__(self, vocab_size, embed_dim, context_size,
                     pretrained_embeddings):
            super(Net, self).__init__()
            self.embeddings = nn.Embedding(vocab_size, embed_dim)
            self.embeddings.weight.data.copy_(
                torch.from_numpy(pretrained_embeddings))
            # if PRETRAIN_EMBEDDINGS:
            #     self.embeddings.weight.requires_grad = False
            self.linear1 = nn.Linear(context_size * embed_dim, 128)
            self.linear2 = nn.Linear(128, len(data_utils.POS_TAGS))

        def forward(self, x):
            embeds = self.embeddings(x).view(
                (-1, CONTEXT_SIZE * EMBEDDING_DIM))

            if USE_SUBWORDS:
                prefixes = get_prefixes_embeddings(x, vocab, vocab_reversed)
                suffixes = get_suffixes_embeddings(x, vocab, vocab_reversed)

                prefixes_tensor = Variable(
                    (torch.from_numpy(prefixes)).type(torch.LongTensor))
                suffixes_tensor = Variable(
                    (torch.from_numpy(suffixes)).type(torch.LongTensor))

                prefixes_embeds = self.embeddings(prefixes_tensor).view(
                    (-1, CONTEXT_SIZE * EMBEDDING_DIM))
                suffixes_embeds = self.embeddings(suffixes_tensor).view(
                    (-1, CONTEXT_SIZE * EMBEDDING_DIM))

                embeds = embeds + prefixes_embeds + suffixes_embeds

            out = F.tanh(self.linear1(embeds))
            out = F.log_softmax(self.linear2(out))
            return out

    net = Net(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, embeddings)

    print('Preparing train/test/dev sets')
    train_data_loader = data_utils.prepare_tensor_dataset(
        data_utils.pos_train, vocab, WORKERS, BATCH_SIZE)
    dev_data_loader = data_utils.prepare_tensor_dataset(
        data_utils.pos_dev, vocab, WORKERS, BATCH_SIZE)
    test_data_loader = data_utils.prepare_tensor_dataset(data_utils.pos_test,
                                                         vocab,
                                                         WORKERS,
                                                         BATCH_SIZE,
                                                         include_y=False)

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.01)
    dev_losses = []
    train_losses = []
    acceptances = []
    iterations = []
    print("Starting training loop")
    for idx in range(0, EPOCHS_TO_TRAIN):
        for iteration, batch in enumerate(train_data_loader, 1):
            x, y = Variable(batch[0]), Variable(batch[1])
            optimizer.zero_grad()
            output = net(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

        if idx % 1 == 0:
            # calculate accuracy on validation set
            dev_loss = 0
            net.eval()
            correct = 0.0
            total = 0.0
            for dev_batch_idx, dev_batch in enumerate(dev_data_loader):
                x, y = Variable(dev_batch[0]), Variable(dev_batch[1])
                output = net(x)
                dev_loss = criterion(output, y)
                _, predicted = torch.max(output.data, 1)
                total += dev_batch[1].size(0)
                correct += (predicted == dev_batch[1]).sum()

            acc = correct / total
            net.train()  # switch back to training mode for the next epoch

            acceptances.append(acc)
            train_losses.append(loss.data[0])
            dev_losses.append(dev_loss.data[0])
            iterations.append(idx)
            print(
                "Epoch {: >8}     TRAIN_LOSS: {: >8}      DEV_LOSS: {: >8}     ACC: {}"
                .format(idx, loss.data[0], dev_loss.data[0], acc))

    print("Predicting the test file")
    net.eval()

    test_file = open(os.path.join(data_utils.POS_DIR, "test_results.txt"), 'w')

    for test_batch_idx, test_batch in enumerate(test_data_loader):
        # the test loader was built with include_y=False, so there are no labels to unpack
        x = Variable(test_batch[0])
        output = net(x)
        predictions = torch.max(output.data, 1)[1].numpy()
        for pos_index in predictions:
            test_file.write(data_utils.get_pos_name_by_index(pos_index) + "\n")

    test_file.close()
    print('Finished execution!')

    print('Plotting graphs..')
    fig = plt.figure()
    fig.suptitle("POS - random word vectors with prefix/suffix", fontsize=14)
    ax = plt.subplot(311)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Train loss')
    ax.plot(iterations, train_losses, 'k')

    ax = plt.subplot(312)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Dev loss')
    ax.plot(iterations, dev_losses, 'k')

    ax = plt.subplot(313)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Acc')
    ax.plot(iterations, acceptances, 'k')
    plt.show()
Example #7
def train():
    train_config = get_train_config()
    _, word_to_id = read_vocab(train_config['vocab_file'])
    _, cat_to_id = read_category()
    # set up the logger
    logger = get_logger(os.path.join(FLAGS.log_path, 'train.log'))
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    with tf.Session(config=config) as sess:
        # create the model
        model = Model(train_config)
        # load the training data
        x_train_data, y_train_data = process_file(train_config['train_file'],
                                                  word_to_id, cat_to_id,
                                                  train_config['seq_len'])
        # load the validation data
        x_val_data, y_val_data = process_file(train_config['val_file'],
                                              word_to_id, cat_to_id,
                                              train_config['seq_len'])
        # initialize variables
        sess.run(tf.global_variables_initializer())

        len_data = len(y_train_data)  # number of training samples
        start_time = time.time()
        total_batch = 0  # total batch counter
        best_acc_val = 0.0  # best validation accuracy so far
        last_improved = 0  # batch at which the last improvement occurred
        require_improvement = 1000  # stop early if there is no improvement for 1000 batches
        flag = False  # early-stopping flag
        # num_epochs: repeat over the samples so that features learned earlier are not lost
        for i in range(train_config['num_epochs']):
            for x_input, y_output in batch_iter(x_train_data, y_train_data,
                                                train_config['batch_size']):
                total_batch += 1

                step, acc, loss = model.run_step(sess, x_input, y_output)
                # evaluate the model every FLAGS.evaluate_every batches
                if (total_batch % FLAGS.evaluate_every == 0):
                    time_dif = get_time_dif(start_time)
                    logger.info(
                        "train: iterator{}: step:{}/{} acc:{} loss:{} time:{}".
                        format(i + 1, step % len_data, len_data, acc, loss,
                               time_dif))
                    val_acc, text_los = evaluate(sess, model, x_val_data,
                                                 y_val_data)
                    logger.info("test: acc:{} loss:{} ".format(
                        val_acc, text_los))
                    # save the model if validation accuracy improved
                    if acc > 0.5 and val_acc > 0.5 and val_acc > best_acc_val:
                        last_improved = total_batch
                        best_acc_val = val_acc
                        checkpoint_path = os.path.join(FLAGS.checkpoints_path,
                                                       'checkpoints')
                        saver = tf.train.Saver(
                            tf.global_variables(),
                            max_to_keep=FLAGS.num_checkpoints)
                        saver.save(sess, checkpoint_path, global_step=step)
                if total_batch - last_improved > require_improvement:
                    # validation accuracy has not improved for a long time; stop training early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # exit the batch loop

            if flag:
                time_dif = get_time_dif(start_time)
                logger.info('Training finished: {}'.format(time_dif))
                break
                # OK... time to start recklessly adding things here
                # (this block sits after the break above, so it is never executed)
                # Read all the files and count token frequencies.
                word_collection = []
                for text, _ in read_text(data_path, args.max_input_len, args.max_output_len):
                    word_collection.extend([i for i in text])
                # counter_vocab = Counter(word_collection).most_common(config.new_vocab_size)

                counter_vocab = [word[0] for word in list(Counter(word_collection).items()) if word[1] > args.min_count]

                # build new_vocab
                _new_vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
                _new_vocab.extend(counter_vocab)

                # Now that we have the new vocabulary, we need a mapping between it and the
                # old vocabulary, as well as its own token-to-id dictionary,
                # so read in the old vocabulary first.
                read = read_vocab(vocab_path)
                old_vocab = {}
                for n, i in enumerate(read):
                    old_vocab[i] = n

                # Check whether every word in new_vocab exists in old_vocab; drop the ones
                # that do not, and record each kept word's id in old_vocab.
                new_vocab = []
                vocab_dict = {}
                token_num = []
                for n, word in enumerate(_new_vocab):
                    if word in old_vocab:
                        new_vocab.append(word)
                        vocab_dict[word] = len(vocab_dict)
                        token_num.append(old_vocab[word])

                # Modify the saved weights accordingly and save them as a new model.