Example #1
def train(args):
    config = load_config(args.model_dir)

    train_dataset = LMDataset(config["train_file"],
                              vocab_file=config["vocab_file"])

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")
    with open(vocab_dump_path, 'wb') as fp:
        pickle.dump(train_dataset.vocab, fp)

    valid_dataset = LMDataset(config["valid_file"], vocab_dump=vocab_dump_path)

    config["vocab_size"] = len(train_dataset.vocab)
    model = LM(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        model.load_model(args.model_dir, args.epoch)

    model.train(epochs=config["train_epochs"],
                batch_size=config["batch_size"],
                data_engine=train_dataset,
                valid_data_engine=valid_dataset,
                train_decoder_epochs=config.get("train_decoder_epochs", 0),
                max_iter_per_epoch=config.get("max_iter_per_epoch", 100000))
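
A minimal invocation sketch for the train() above, assuming it sits in a module with a matching argparse setup; the flag names --model_dir and --epoch mirror the attributes the function reads, everything else is hypothetical:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, required=True,
                        help="directory containing the config and checkpoints")
    parser.add_argument("--epoch", type=int, default=None,
                        help="checkpoint epoch to resume from (optional)")
    args = parser.parse_args()
    train(args)  # the train() defined above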
Example #2
def k_experiment(file, n):
	output = ''

	lmd = LMDataset(file)
	split = int(len(lmd.raw_data)*0.8)
	train, test = lmd.raw_data[:split], lmd.raw_data[split:]

	X = []
	Y = []

	for k in np.arange(0.1, 1.1, step=0.1):
		lm = NgramLM(data=train, n=n)
		lm.train()
		tune = lm.generate()
		acc, pp = lm.test(data=test, s='add-k', k=k)
		X.append(k)
		Y.append(pp)
		op = 'n: {}\nk: {}\n{}\nAccuracy: {}\nPerplexity: {}\n\n'.format(
			str(n), str(k), tune, str(acc), str(pp))
		print(op)
		output += op

	with open('data/lm_k_exp', 'w') as f:
		f.write(output)

	plt.plot(X, Y, '-')
	plt.xlabel('k of add-k smoothing')
	plt.ylabel('Perplexity')
	plt.show()
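
A usage sketch with a hypothetical data path (k_experiment sweeps add-k smoothing's k from 0.1 to 1.0 at a fixed n and plots the resulting perplexity):

# hypothetical call: sweep k for a trigram model over a tunes file
k_experiment('data/tunes', n=3)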
Example #3
def evaluate(model, sentences, vocab, reverse_vocab, hy, writer, device):
    dataset = LMDataset(sentences, vocab, reverse_vocab, hy.window_size)
    loader = DataLoader(dataset, batch_size=hy.batch_size, shuffle=True, drop_last=True)
    vocab_size = len(vocab.keys())
    print("Loaded vocab of size {} for evaluation".format(vocab_size))

    perplexity = compute_model_accuracy(model, loader, device, writer)

    return perplexity
Example #4
def ngram_lm(file, n, gen, test, smoothing, k):
    lmd = LMDataset(file)
    split = int(len(lmd.raw_data) * 0.8)
    # use distinct names for the split so the `test` flag parameter is not shadowed
    train_data, test_data = lmd.raw_data[:split], lmd.raw_data[split:]
    lm = NgramLM(data=train_data, n=n)
    lm.train()
    if gen:
        print('Generated Tune: \n{}'.format(lm.generate()))
    if test:
        acc, pp = lm.test(data=test_data, s=smoothing, k=k)
        print('Accuracy: {}\nPerplexity: {}'.format(acc, pp))
Example #5
def train(args):
    args.save_dir += "_" + args.model_type + "_lm" if not args.seq2seq else "_seq2seq"
    os.makedirs(args.save_dir, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel

    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, collate_fn=train_set.collate_fn, shuffle=True)

    evaluate(model, valid_set)
    for epoch in range(args.num_epoch):
        model.train()
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:

                optimizer.zero_grad()
                loss = model.get_loss(**samples)
                loss.backward()
                optimizer.step()
                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f" % (epoch + 1, np.mean(losses), optimizer.param_groups[0]['lr']))

        if epoch % args.save_interval == 0:
            torch.save(model, args.save_dir + "/{}_{}.pt".format(args.model_type, epoch + 1))
        evaluate(model, valid_set)
Example #6
def test(args):
    config = load_config(args.model_dir)

    vocab_dump_path = os.path.join(args.model_dir, "vocab.pkl")

    test_file = config["test_file"] if len(
        args.test_file) == 0 else args.test_file
    test_dataset = LMDataset(test_file, vocab_dump=vocab_dump_path)

    config["vocab_size"] = len(test_dataset.vocab)
    model = LM(config, args.model_dir)

    if args.epoch is not None:
        print_time_info("Loading checkpoint {} from model_dir".format(
            args.epoch))
        epoch = model.load_model(args.model_dir, args.epoch)
    else:
        print_time_info("Loading last checkpoint from model_dir")
        epoch = model.load_model(args.model_dir)

    loss = model.test(batch_size=config["batch_size"],
                      data_engine=test_dataset)
Example #7
def n_experiment(file, smoothing, k):
	output = ''

	lmd = LMDataset(file)
	split = int(len(lmd.raw_data)*0.8)
	train, test = lmd.raw_data[:split], lmd.raw_data[split:]

	X = []
	Y = []
	Y_acc = []

	for n in range(2, 21):
		lm = NgramLM(data=train, n=n)
		lm.train()
		tune = lm.generate()
		acc, pp = lm.test(data=test, s=smoothing, k=k)
		X.append(n)
		Y.append(pp)
		Y_acc.append(acc)
		op = 'n: {}\nk: {}\n{}\nAccuracy: {}\nPerplexity: {}\n\n'.format(
			str(n), str(k), tune, str(acc), str(pp))
		print(op)
		output += op

	with open('data/lm_n_exp_t', 'w') as f:
		f.write(output)

	plt.plot(X, Y, '-')
	plt.xlabel('n of n-gram model')
	plt.ylabel('Perplexity')
	plt.savefig('data/lm_n_exp_pp_t')
	plt.clf()
	plt.plot(X, Y_acc, '-')
	plt.xlabel('n of n-gram model')
	plt.ylabel('Accuracy')
	plt.savefig('data/lm_n_exp_acc_t')
Example #8
def train(args):
    if args.logdir is None:
        args.logdir = "Models-{}".format(time.strftime("%Y%m%d-%H%M%S"))
    task = "lm" if not args.seq2seq else "seq2seq"
    args.logdir += "_" + args.model_type + "_" + task
    os.makedirs(args.logdir, exist_ok=True)
    os.makedirs(os.path.join(args.logdir, "models"), exist_ok=True)
    print("Experiment dir : {}".format(args.logdir))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.logdir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    device = "cuda:" + str(args.gpuid) if torch.cuda.is_available() else "cpu"

    mem_crammer = []

    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel

    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    warmup_epoch = args.num_epoch * 0.1
    scheduler = ExponentialLR(optimizer,
                              0.1**(1 / (args.num_epoch - warmup_epoch)))
    iter_per_epoch = (len(train_set) + args.batch_size - 1) // args.batch_size
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)

    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn,
                              shuffle=True)

    bestppl = 1e9
    for epoch in range(args.num_epoch):
        model.train()

        if args.cram:
            # Deliberately fill GPU memory with junk tensors until allocation
            # fails, then clear the cache; this stresses the OOM-retry loop below.
            while True:
                try:
                    junk = torch.rand((9999, 9999), dtype=torch.float64, device=device)
                except RuntimeError:
                    with torch.cuda.device(device):
                        torch.cuda.empty_cache()
                    break
                mem_crammer.append(junk)

        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                if epoch < warmup_epoch:
                    warmup_scheduler.step()
                optimizer.zero_grad()

                # Retry the step, releasing one crammed tensor after each CUDA
                # out-of-memory error until the batch fits.
                while True:
                    success = True
                    try:
                        loss = model.get_loss(**samples)
                        loss.backward()
                        optimizer.step()
                    except RuntimeError:
                        if mem_crammer:
                            del mem_crammer[-1]
                        with torch.cuda.device(device):
                            torch.cuda.empty_cache()
                        success = False
                        optimizer.zero_grad()
                    if success:
                        break

                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                                     (epoch + 1, np.mean(losses),
                                      optimizer.param_groups[0]['lr']))

            logging.info(
                "Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                (epoch + 1, np.mean(losses), optimizer.param_groups[0]['lr']))

        if epoch % args.save_interval == 0:
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type,
                                                      epoch + 1))
            torch.save(model, savepath)
            logging.info("Saving to {}".format(savepath))

        if task == "lm":
            print("好    -->", model.generate("好", beam_size=3, device=device))
            print("秋水  -->", model.generate("秋水", beam_size=3, device=device))
            print("寒烟翠-->", model.generate("寒烟翠", beam_size=3, device=device))
        elif task == "seq2seq":
            print("改革春风吹满地-->",
                  model.generate("改革春风吹满地", beam_size=2, device=device))
            print("牛津大学聪明人不及蟾蜍一半-->",
                  model.generate("牛津大学聪明人不及蟾蜍一半", beam_size=2, device=device))
            print("一支穿云箭,青天白日重新现-->",
                  model.generate("一支穿云箭,青天白日重新现", beam_size=2, device=device))

        loss, ppl = evaluate(model, valid_set, False)
        logging.info("Valid, Loss: %0.8f, ppl: %0.8f" % (loss, ppl))

        if ppl < bestppl:
            bestppl = ppl
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, task))
            torch.save(model, savepath)
            logging.info("Best ppl! Saving to {}".format(savepath))

        if epoch >= warmup_epoch:
            scheduler.step()
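
WarmUpLR is referenced above but not defined in the snippet. A minimal linear warm-up scheduler sketch that would be compatible with how it is constructed and stepped here (an assumption, not the repository's actual implementation):

from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    """Scale the learning rate linearly from ~0 up to the base LR over total_iters steps."""

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts calls to step(); the small epsilon avoids division by zero
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]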
Example #9
def tunes_test(file, n):
	lmd = LMDataset(file, byKey=True)
	for key, tunes in lmd.key_date.items():
		lm = NgramLM(tunes, n)
		lm.train()
		print(key, lm.generate())
Example #10
def tunes_parse(file):
	lmd = LMDataset(file)
	return lmd.get_data()
Example #11
# check cuda
if opt.cuda and not torch.cuda.is_available():
    raise RuntimeError('Cannot train on GPU because cuda is not available')

device = 'cuda' if opt.cuda else 'cpu'
torch.manual_seed(opt.seed)
if opt.cuda:
    torch.cuda.manual_seed(opt.seed)

# Initialize all except model
# vocab = torchtext.vocab.GloVe(name='840B', dim='300', cache='/media/data/nlp/wv/glove')
# vocab = pickle.load(open(opt.vocab_path, 'rb'))
lmdataset = LMDataset(vocab_path=opt.vocab_path,
                      corpus_path=opt.data_path,
                      bptt=opt.bptt,
                      device=device,
                      min_counts=opt.min_counts)
opt.vocab_size = len(lmdataset.vocab)
opt.device = device
lmloader = DataLoader(lmdataset, batch_size=opt.batch_size, shuffle=False)

# prefix is added to model name and to tensorboard scalar name
start_time = str(datetime.datetime.now()).replace(' ', '_').replace(':',
                                                                    '_')[:-10]
prefix = 'vocab_{}.emb_{}.hidden_{}.lr_{}.start_time_{}'.format(
    opt.vocab_size, opt.embedding_size, opt.hidden_size, opt.learning_rate,
    start_time)

if opt.tensorboard:
    from tensorboardX import SummaryWriter
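    # Continuation sketch (not part of the original snippet): build a writer tagged
    # with the prefix above; the comment-based run naming is an assumption.
    writer = SummaryWriter(comment='_' + prefix)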
Example #12
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='sampler.py')
    opts.model_opts(parser)
    opts.model_io_opts(parser)
    opts.data_opts(parser)
    opts.sample_opts(parser)
    opt = parser.parse_args()

    if opt.cuda and not torch.cuda.is_available():
        raise RuntimeError(
            'Cannot sample on GPU because cuda is not available')

    device = 'cuda' if opt.cuda else 'cpu'
    model = torch.load(opt.checkpoint)
    model.device = device
    model.to(device)

    lmdataset = LMDataset(
        vocab_path=opt.vocab_path,
        corpus_path=opt.data_path,
        bptt=opt.length,
        device=device,
        min_counts=0  # TODO: make sure it works
    )
    sampler = Sampler(model, lmdataset)

    sampler.sample(opt.batch_size,
                   strategy=opt.sampling_strategy,
                   temperature=opt.temperature,
                   n_sampled=opt.length)
Example #13
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("-conf", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument(
        "--gpu",
        type=str,
        default=None,
        help="binary mask of GPUs to use, e.g. '10100000' selects device_id 0 and 2")

    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read(args.conf)

    hidden_size = int(config["model"]["hidden_size"])
    num_hidden_layers = int(config["model"]["num_hidden_layers"])
    num_attention_heads = int(config["model"]["num_attention_heads"])
    intermediate_size = int(config["model"]["intermediate_size"])
    max_position_embeddings = int(config["model"]["max_position_embeddings"])
    #
    vocab_size = int(config["vocab"]["vocab_size"])
    mask_id = int(config["vocab"]["mask_id"])
    #
    log_path = config["log"]["log_path"]
    log_dir = os.path.dirname(log_path)
    os.makedirs(log_dir, exist_ok=True)
    log_step = int(config["log"]["log_step"])
    #
    train_size = int(config["data"]["train_size"])
    #
    save_prefix = config["save"]["save_prefix"]
    save_dir = os.path.dirname(save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    save_epoch = int(config["save"]["save_epoch"])
    #
    batch_size = int(config["train"]["batch_size"])
    if args.debug:
        batch_size = 10
    num_epochs = int(config["train"]["num_epochs"])
    learning_rate = float(config["train"]["learning_rate"])
    warmup_proportion = float(config["train"]["warmup_proportion"])
    weight_decay = float(config["train"]["weight_decay"])
    #
    num_to_mask = int(config["mask"]["num_to_mask"])
    max_seq_len = int(config["mask"]["max_seq_len"])

    if args.debug:
        logging.basicConfig(format="%(asctime)s %(message)s",
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename=log_path,
                            format="%(asctime)s %(message)s",
                            level=logging.DEBUG)

    bertconfig = modeling_bert.BertConfig(
        vocab_size_or_config_json_file=vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        max_position_embeddings=max_position_embeddings)
    model = BertForMaskedLM(config=bertconfig)
    total_params = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)

    if args.gpu is not None:
        device_ids = []
        for device_id, flag in enumerate(args.gpu):
            if flag == "1":
                device_ids.append(device_id)
        multi_gpu = True
        device = torch.device("cuda:{}".format(device_ids[0]))
    else:
        multi_gpu = False
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logging.info(f"device: {device}")
    if "model_path" in config["train"]:
        model_path = config["train"]["model_path"]
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        logging.info(f"load model from {model_path}")
    model.to(device)
    if multi_gpu:
        logging.info(f"GPU: device_id={device_ids}")
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.train()

    # optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = (train_size // batch_size) * num_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         weight_decay=weight_decay,
                         t_total=t_total)
    logging.info("start training...")

    for epoch in range(num_epochs):
        if "train_dir" in config["data"]:
            train_dir = config["data"]["train_dir"]
            datpaths = os.listdir(train_dir)
            random.shuffle(datpaths)
            for step_ds, path in enumerate(datpaths):
                path = os.path.join(train_dir, path)
                dataset = LMDataset(path)
                num_steps = (len(dataset) // batch_size) + 1
                logging.info(f"dataset from: {path}")
                loss_ds = train_dataset(dataset=dataset,
                                        model=model,
                                        optimizer=optimizer,
                                        multi_gpu=multi_gpu,
                                        device=device,
                                        epoch=epoch,
                                        batch_size=batch_size,
                                        num_steps=num_steps,
                                        log_step=log_step,
                                        num_to_mask=num_to_mask,
                                        mask_id=mask_id,
                                        max_seq_len=max_seq_len)
                logging.info(
                    f"step {step_ds + 1} / {len(datpaths)}: {(loss_ds / num_steps):.6f}"
                )
        else:
            train_path = config["data"]["train_path"]
            dataset = LMDataset(train_path)
            num_steps = (len(dataset) // batch_size) + 1
            loss_epoch = train_dataset(dataset=dataset,
                                       model=model,
                                       optimizer=optimizer,
                                       multi_gpu=multi_gpu,
                                       device=device,
                                       epoch=epoch,
                                       batch_size=batch_size,
                                       num_steps=num_steps,
                                       log_step=log_step,
                                       num_to_mask=num_to_mask,
                                       mask_id=mask_id,
                                       max_seq_len=max_seq_len)
            logging.info(
                f"epoch {epoch + 1} / {num_epochs} : {(loss_epoch / num_steps):.6f}"
            )

        if (epoch + 1) % save_epoch == 0:
            save_path = f"{save_prefix}.network.epoch{(epoch + 1):d}"
            optimizer_save_path = f"{save_prefix}.optimizer.epoch{(epoch + 1):d}"
            if multi_gpu:
                # save_path already embeds the epoch via the f-string above,
                # so no extra .format() call is needed
                torch.save(model.module.state_dict(), save_path)
            else:
                torch.save(model.state_dict(), save_path)
            logging.info(f"model saved: {save_path}")
            torch.save(optimizer.state_dict(), optimizer_save_path)
            logging.info(f"optimizer saved: {optimizer_save_path}")