Example #1
def main(rnn_type="rnn"):
    from data import loop_data, build_vocabulary, batchify

    np.random.seed(11)

    batch_size = 32
    n_steps = 20
    lr = 0.01
    lr_decay = 0.5

    train_text, valid_text = loop_data()

    vocab, rev_vocab = build_vocabulary(train_text)
    vocab_size = len(vocab)
    print "vocab size:", vocab_size

    model = RNNModel(vocab_size, n_steps=n_steps, rnn_type=rnn_type)

    # TODO: sample decoded sentence
    with tf.Session() as sess:
        tf.initialize_all_variables().run()

        prev_epoch_cost = 9999999  # arbitrarily large number
        for epoch in range(5):
            print "epoch", epoch
            print "learning rate", lr

            list_of_costs = []
            model.assign_lr(sess, lr)

            for idx, (x, y) in tqdm(
                    enumerate(batchify(train_text, vocab, batch_size,
                                       n_steps))):
                list_of_costs.append(model.step(sess, x, y, is_train=True))
                if idx % 100 == 0:
                    print "cost", 2**np.mean(list_of_costs)
                    list_of_costs = []

            epoch_cost = np.mean(list_of_costs)
            print "train cost", 2**epoch_cost

            list_of_costs = []
            for idx, (x, y) in tqdm(
                    enumerate(batchify(valid_text, vocab, batch_size,
                                       n_steps))):
                list_of_costs.append(model.step(sess, x, y, is_train=False))

            epoch_cost = np.mean(list_of_costs)
            print "valid cost", 2**epoch_cost

            if epoch_cost > prev_epoch_cost:
                lr *= lr_decay
            prev_epoch_cost = epoch_cost
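
The batchify imported from data above is not shown on this page; a minimal sketch of a generator compatible with this call site, yielding (x, y) index matrices of shape (batch_size, n_steps) with y shifted one token ahead, might look like the following (the body below is an assumption for illustration, not the repository's actual helper):

import numpy as np

def batchify(text, vocab, batch_size, n_steps):
    # Hypothetical sketch: map the text to vocabulary ids, split the id
    # stream into batch_size parallel streams, then yield (x, y) windows
    # of n_steps tokens where y is x shifted one position ahead.
    ids = np.array([vocab[w] for w in text.split() if w in vocab])
    stream_len = len(ids) // batch_size
    ids = ids[:stream_len * batch_size].reshape(batch_size, stream_len)
    for start in range(0, stream_len - n_steps, n_steps):
        yield ids[:, start:start + n_steps], ids[:, start + 1:start + n_steps + 1]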
Example #2
def _query_skeletons_to_responses(query, skeletons):
    all_d = []
    for skeleton in skeletons:
        all_d.append([query, query, skeleton, skeleton])

    batch_dict = batchify(all_d, vocab_src, vocab_tgt, set([]), None)
    hyps_batch = model.work(batch_dict, beam_size, max_time_step)

    responses = []
    for hyps in hyps_batch:
        hyps.sort(key=lambda x: x.score / ((1 + len(x.seq))**0.6),
                  reverse=True)
        best_hyp = hyps[0]
        predicted_tgt = [token.raw for token in best_hyp.seq]
        predicted_tgt = predicted_tgt[1:-1]
        response = ''.join(predicted_tgt)
        responses.append(response)
    return responses
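
The sort key above is a length-normalized score in the style of the GNMT length penalty, score / (1 + |seq|) ** 0.6, so longer hypotheses are not penalized simply for summing more log-probability terms. A standalone toy illustration of the re-ranking (the Hyp tuple and the scores are made up for this sketch):

from collections import namedtuple

Hyp = namedtuple("Hyp", ["score", "seq"])
hyps = [Hyp(score=-4.0, seq=list("abcd")), Hyp(score=-3.5, seq=list("ab"))]
# Dividing by the length penalty lets the longer hypothesis win even though
# its raw (summed log-probability) score is lower.
hyps.sort(key=lambda h: h.score / ((1 + len(h.seq)) ** 0.6), reverse=True)
print([''.join(h.seq) for h in hyps])  # ['abcd', 'ab']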
Example #3
corpus = data.Corpus(args.data, max_seq_len=args.prefix_len)
embeddings = None
if args.glove is not None:
    embeddings = read_glove(args.glove, corpus.dictionary)

if embeddings is not None:
    first_tok = next(iter(embeddings))
    if len(embeddings[first_tok]) != args.emsize:
        print(
            "ERROR: Embedding size (--emsize) %d is not the same as pre-trained embedding size %d"
            % (len(embeddings[first_tok]), args.emsize))
        sys.exit(-1)

eval_batch_size = 100
device = torch.device("cuda" if args.cuda else "cpu")
train_data = batchify(corpus.train, args.batch_size, device)
val_data = batchify(corpus.valid, eval_batch_size, device)
test_data = batchify(corpus.test, eval_batch_size, device)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.load is None:
    model = model.RNNModel(args.model,
                           ntokens,
                           args.emsize,
                           args.nhid,
                           args.nlayers,
                           args.dropout,
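
The batchify calls above follow the PyTorch word_language_model recipe: each split's flat token tensor is reshaped into batch_size parallel columns and moved to the target device. A minimal sketch of such a helper (this repository's own implementation is not shown here, so treat this as an illustration):

import torch

def batchify(data, bsz, device):
    # data is a 1-D LongTensor holding the token ids of an entire split.
    nbatch = data.size(0) // bsz            # length of each of the bsz streams
    data = data.narrow(0, 0, nbatch * bsz)  # trim off the remainder
    # Lay the ids out as bsz independent streams in the columns,
    # giving a (nbatch, bsz) tensor.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)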
Example #4
BPE encoder
"""
encoder = json.load(open(encoder_path))
encoder['_pad_'] = len(encoder)
encoder['_start_'] = len(encoder)
encoder['_end_'] = len(encoder)
encoder['_unk_'] = len(encoder)
n_special = 4
"""
DATA
"""
train, valid, test = get_data(encoder, data_dir, prefix, params.cut_down_len,
                              label_size, params.ratio)
max_len = 0.
if params.corpus == 'sage':
    train['text'] = batchify(np.array(train['text'][0]), params.batch_size)
    valid['text'] = batchify(np.array(valid['text'][0]), params.batch_size)
    test['text'] = batchify(np.array(test['text'][0]), params.batch_size)
"""
Params
"""
if params.init_emb:
    word_embeddings = np.concatenate(
        [
            np.load(wordvec_path).astype(np.float32),
            np.zeros((1, params.d_model), np.float32),  # pad, zero-value!
            (np.random.randn(n_special - 1, params.d_model) * 0.02).astype(
                np.float32)
        ],
        0)
else:
Example #5
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


if __name__ == "__main__":
    # prepare data
    np_data, np_labels, np_vdata, np_vlabels = prepareData()
    batch_size = args.batch_size  # TODO: batch_size and seq_len are the issues to be addressed
    n_epoches = args.max_epochs

    batches = batchify(np_data, batch_size, np_labels)
    vbatches = batchify(np_vdata, batch_size, np_vlabels)

    device = torch.device("cuda")

    # setup model
    from model import RNN, NaiveRNN
    input_size = 2
    hidden_size = args.hidden_size
    output_size = 2

    rnn = RNN(input_size, hidden_size, output_size, batch_size).to(device)
    #rnn = NaiveRNN(input_size, hidden_size, output_size, batch_size).to(device)

    # define loss
    criterion = nn.NLLLoss(reduction='none')
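
Here batchify is called with the data array, a batch size, and a parallel label array, so it presumably returns aligned (data, label) mini-batches; a minimal sketch under that assumption (the body below is a guess for illustration, not the repository's code):

import numpy as np

def batchify(data, batch_size, labels):
    # Hypothetical: drop the remainder and pair up data/label slices of
    # exactly batch_size elements each.
    n_batches = len(data) // batch_size
    return [(data[i * batch_size:(i + 1) * batch_size],
             labels[i * batch_size:(i + 1) * batch_size])
            for i in range(n_batches)]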
Example #6
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the target token
        total_p += math.log(p)  # accumulate natural log (base e) probability
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)
    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens,
                                       config.batch_size)  # train directly on the batchified data
    train(model, batch_source=train_batch_source)

    #test
    evaluate(model, test_dataset, dict)
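
For reference, evaluate_iter above returns per-sentence perplexity, exp(-(1/N) * sum_i log p_i), i.e. the inverse geometric mean of the predicted token probabilities. A standalone sanity check with made-up probabilities:

import math

probs = [0.25, 0.5, 0.125]  # made-up per-token probabilities
ppl = math.exp(-sum(math.log(p) for p in probs) / len(probs))
print(ppl)  # 4.0 == 1 / (0.25 * 0.5 * 0.125) ** (1 / 3)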
Example #7
print "getting data..."
corpus = data.Corpus(args.data)

eval_batch_size = 10

print "batching..."

stops = [
    i for i in range(len(corpus.train))
    if corpus.train[i] == corpus.dictionary.word2idx["<eos>"]
]

last = stops[args.nsentences - 1]
corpus.train = corpus.train[:last]

train_data = data.batchify(corpus.train, args.batch_size, args.cuda)
valid_data = data.batchify(corpus.valid, eval_batch_size, args.cuda)
test_data = data.batchify(corpus.test, eval_batch_size, args.cuda)

print "getting model..."

ntokens = len(corpus.dictionary)
lm = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                    args.dropout, args.tied)

if args.cuda:
    lm.cuda()

criterion = nn.CrossEntropyLoss()

Example #8
            for file in files:
                prefix = file.split('_')[0]
                if prefix == 'train':
                    train_files.append(file)
                if prefix == 'valid':
                    valid_files.append(file)
            print('Start training!!!')
            for epoch in range(1, args.epochs+1):
                valid_fname = random.choice(valid_files)
                for train_fname in train_files:
                    train_fname = random.choice(train_files)
                    corpus = data.SentenceCorpus(args.bptt, args.lm_data, args.tag_data, 
                                                 word2idx, tag2idx, idx2word, idx2tag,
                                                 train_fname, valid_fname, None, testflag=args.test)
    
                    train_lm_data = batchify(corpus.train_lm, args.batch_size)
                    train_masking = batchify(corpus.train_maksing, args.batch_size)
                    train_ccg_data = batchify(corpus.train_tag, args.batch_size)
                    
                    epoch_start_time = time.time()
                    train(args, model, train_lm_data, train_masking, train_ccg_data, criterion, optimizer)

                    val_lm_data = batchify(corpus.valid_lm, args.batch_size)
                    val_masking = batchify(corpus.valid_maksing, args.batch_size)
                    val_ccg_data = batchify(corpus.valid_tag, args.batch_size)
                    val_loss = evaluate(args, model, val_lm_data, val_masking, val_ccg_data)
                    print('-' * 80)
                    print('| end of {} | time: {:5.2f}s | valid loss {:5.4f} '.format(train_fname, 
                          (time.time() - epoch_start_time), val_loss))
                    print('-' * 80)
                    # Save the model if the validation loss is the best we've seen so far.
Example #9
def run(args):

    device = torch.device("cuda" if args.cuda else "cpu")

    dir_path = os.path.dirname(os.path.realpath(__file__))
    debug_msg = (
        "\n\nFirst download the PTB dataset and dump it to sandbox/data/penn"
        "\nSee: https://github.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/tree/master/data"
    )
    assert (Path(dir_path) / "data/penn").exists(), debug_msg
    for f in ["train.txt", "test.txt", "valid.txt"]:
        assert (Path(dir_path) / f"data/penn/{f}").exists()

    eval_batch_size = 20
    corpus = Corpus("sandbox/data/penn")
    train_data = batchify(corpus.train, args.batch_size, device)
    val_data = batchify(corpus.valid, eval_batch_size, device)
    test_data = batchify(corpus.test, eval_batch_size, device)
    rev_test_data = batchify(
        corpus.test[
            torch.arange(corpus.test.shape[0] - 1, -1, step=-1).to(corpus.test.device)
        ],
        eval_batch_size,
        device,
    )

    ntokens = len(corpus.dictionary)
    model = LanguageModel(ntokens, args.hidden_size, args.num_layers).to(device)

    criterion = nn.CrossEntropyLoss(reduction="sum")

    # Loop over epochs.
    lr = args.lr
    best_val_loss = 1E9

    # At any point you can hit Ctrl + C to break out of training early.
    for epoch in range(0, args.epochs):
        epoch_start_time = time.time()
        train_epoch(model, criterion, corpus, train_data, epoch, lr)
        val_loss, val_entropy = evaluate(
            model, criterion, corpus, val_data, eval_batch_size
        )
        print("-" * 89)
        print(
            "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | "
            "valid ppl {:8.2f} | valid entropy {:8.2f}".format(
                epoch,
                (time.time() - epoch_start_time),
                val_loss,
                math.exp(val_loss),
                val_entropy,
            )
        )
        print("-" * 89)
        # Learning rate annealing
        if epoch >= 19:
            lr = lr * args.lr_decay

        print("=" * 89)
        if val_loss < best_val_loss:
            best_val_loss = val_loss  # keep track of the best checkpoint seen so far
            torch.save(model.state_dict(), "bayesian_scratch_scaled.pt")

    # Run on test data.
    test_loss, test_entropy = evaluate(
        model, criterion, corpus, test_data, eval_batch_size
    )
    _, rev_test_entropy = evaluate(
        model, criterion, corpus, rev_test_data, eval_batch_size
    )
    print("=" * 89)
    print(
        "| End of training | test loss {:5.2f} | test ppl {:8.2f} |"
        " test entropy {:8.2f} | delta entropy {:8.2f}".format(
            test_loss,
            math.exp(test_loss),
            test_entropy,
            rev_test_entropy - test_entropy,
        )
    )
Example #10
def main(model=None):

    print(f'readying model & data @ {now()}')

    data = load_data()
    if not data:
        save_data(preprocess())
        data = load_data()

    if not model:
        if not config.fresh_model:
            model = load_model()
        if not model:
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ',end='')
        else: print('loaded ',end='')
        print(f'model: {describe_model(model)}')

    print(f'total files: {len(data)}, ',end='')

    data, data_dev = split_dataset(data)

    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)

    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')

    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ',end='\n')

    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped//config.hm_bars_slide
    if config.ckp_save_epochs == -1: config.ckp_save_epochs = range(config.hm_epochs)

    data_losss, dev_losss = [], []

    if config.initialize_loss:

        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model,data))
        dev_losss.append(dev_loss(model,data_dev))
        print(f'initial losses: {data_losss, dev_losss}')

    print(f'training started @ {now()}', flush=True)

    for ep in range(config.hm_epochs):

        loss = 0

        if config.train_parallel and config.train_combined:
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model, ep, batch_size=batch_size)

        else:
            for i,batch in enumerate(batchify(data)):

                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}', end='', flush=True)

                batch_size = sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in batch)

                if config.train_parallel:
                    l,g = process_batch_parallel(model,batch)
                    loss += l
                    give_grads(model,g)

                elif config.train_combined:
                    loss += process_batch_combined(model, batch)

                else:
                    for j,datapoint in enumerate(batch):
                        states = None
                        for k,(inp,lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl,out)

                sgd(model,batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model,ep,batch_size=batch_size)

                if config.disp_batches:
                    print(f', completed @ {now()}' ,flush=True)

        loss /= sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in data)

        data_losss.append(loss)
        dev_losss.append(dev_loss(model,data_dev))
        
        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)

        if ep in config.ckp_save_epochs:
            save_model(model,f'{config.model_save_path}_ckp{ep}')

    data_losss.append(dev_loss(model,data))
    dev_losss.append(dev_loss(model,data_dev))

    print(f'final losses: {[data_losss[-1],dev_losss[-1]]}')

    print(f'training ended @ {now()}', flush=True)

    plot(data_losss)
    show()
    plot(dev_losss)
    show()

    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(),config.model_save_path+'_prev')
        save_model(model)

    return model, [data_losss, dev_losss]
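
In this example batchify(data) is called with a list of datapoints and no explicit size, so it presumably chunks the list according to the module-level config.batch_size; a minimal sketch under that assumption (the body below is a guess for illustration, not the repository's code):

def batchify(data):
    # Hypothetical: yield consecutive chunks of config.batch_size
    # datapoints; the final chunk may be smaller. config is assumed to be
    # the same module-level settings object used throughout the example.
    for i in range(0, len(data), config.batch_size):
        yield data[i:i + config.batch_size]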