Example #1
    def __init__(self,
                 train_batch_size=20,
                 eval_batch_size=10,
                 pred_batch_size=1,
                 bptt=35):
        # number of lines per split: 'train': 36718, 'valid': 3760, 'test': 4358

        self.bptt = bptt
        train_iter = WikiText2(split='train')
        self.tokenizer = get_tokenizer('basic_english')
        counter = Counter()
        txtline = []
        for line in train_iter:
            txtline.append(line)
            counter.update(self.tokenizer(line))
        self.vocab = Vocab(counter)
        train_iter, val_iter, test_iter = WikiText2()

        train_data = self.data_process(train_iter)
        val_data = self.data_process(val_iter)
        test_data = self.data_process(test_iter)
        pred_data = train_data

        self.train_data = self.batchify(train_data, train_batch_size)
        self.val_data = self.batchify(val_data, eval_batch_size)
        self.test_data = self.batchify(test_data, eval_batch_size)
        self.pred_data = self.batchify(pred_data, pred_batch_size)  # used for single-line prediction
        self.text = txtline
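
A sketch of the two helper methods this constructor assumes (self.data_process and self.batchify are not shown in this excerpt); it is modeled on the standalone helpers in the later examples and is an assumption, not taken from the original class:

    # Assumed helpers for Examples #1 and #8 (hypothetical, mirroring the
    # module-level data_process/batchify functions used elsewhere on this page).
    def data_process(self, raw_text_iter):
        """Tokenize and numericalize raw lines into one flat LongTensor."""
        data = [torch.tensor([self.vocab[token] for token in self.tokenizer(line)],
                             dtype=torch.long)
                for line in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    def batchify(self, data, bsz):
        """Trim the flat tensor and reshape it into bsz independent columns."""
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]
        return data.view(bsz, seq_len).t().contiguous()  # shape [seq_len, bsz]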
Example #2
def get_data():

    train_iter = WikiText2(split='train')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)

    train_iter, val_iter, test_iter = WikiText2()
    '''
    i = 0
    for item in train_iter:
        print(item)
        if i == 5:
            break
        i += 1
    '''
    train_data = data_process(train_iter, vocab)
    val_data = data_process(val_iter, vocab)
    test_data = data_process(test_iter, vocab)

    train_data = batchify(train_data, batch_size)
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)
    return train_data, val_data, test_data, vocab
Example #3
    def __init__(self, device):

        train_iter, val_iter, test_iter = WikiText2()
        self.device = device
        self.tokenizer = get_tokenizer('basic_english')
        self.counter = Counter()

        self.counter.update(self.tokenizer('<sos>'))

        for line in train_iter:
            self.counter.update(self.tokenizer(line))

        for line in val_iter:
            self.counter.update(self.tokenizer(line))

        for line in test_iter:
            self.counter.update(self.tokenizer(line))

        self.vocab = Vocab(self.counter)

        train_iter, val_iter, test_iter = WikiText2()

        self.train = self.data_process(train_iter).to(self.device)
        self.val = self.data_process(val_iter).to(self.device)
        self.test = self.data_process(test_iter).to(self.device)
Example #4
def get_wiki2(conf):
    """
    Return WikiText-2 train/test/valid loaders and the vocabulary
    """
    # raw data
    train_iter, test_iter, valid_iter = WikiText2(split=('train', 'test', 'valid'))
    train_iter_copy, test_iter_copy, valid_iter_copy = WikiText2(split=('train', 'test', 'valid'))
    # loader
    train, test, valid, vocab = load_dataset(train_iter, test_iter, valid_iter, train_iter_copy, test_iter_copy, valid_iter_copy, conf)
    return train, test, valid, vocab
Example #5
def get_accuracy(ps_rref, data_dir, test_batch_size, job_name, target_loss):
    logger = Logger(
        job_name=job_name,
        file_dir=f'./measurement/logs/{job_name}_tester.log').logger

    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    bptt = 35

    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    val_data = data_process(val_iter, vocab, tokenizer)
    val_data = batchify(val_data, test_batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    t0 = time.time()
    logger.info("Start!")
    init = t0
    while True:
        t1 = time.time()
        if t1 - t0 > 20:
            t0 = t1
            m = ps_rref.rpc_sync().get_model().to(device)

            test_loss = 0.

            with torch.no_grad():
                hidden = m.init_hidden(test_batch_size)
                for batch_idx, i in enumerate(
                        range(0,
                              val_data.size(0) - 1, bptt)):
                    data, targets = get_batch(val_data, i, bptt)
                    data, targets = data.to(device), targets.to(device)
                    hidden = repackage_hidden(hidden)
                    output, hidden = m(data, hidden)
                    loss = criterion(output, targets)
                    test_loss += len(data) * loss.item()

            test_loss /= (len(val_data) - 1)

            logger.info("Test Loss: {:7.3f} | Time: {:7.2f} seconds".format(
                test_loss, (t1 - init)))

            if test_loss < target_loss:
                ps_rref.rpc_sync().stop()
                break
Example #6
    def test_wikitext2(self):
        from torchtext.experimental.datasets import WikiText2
        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
        conditional_remove(cachedir)
        cachefile = os.path.join(self.project_root, ".data",
                                 "wikitext-2-v1.zip")
        conditional_remove(cachefile)

        train_dataset, valid_dataset, test_dataset = WikiText2()
        train_data = torch.cat(
            tuple(filter(lambda t: t.numel() > 0, train_dataset)))
        valid_data = torch.cat(
            tuple(filter(lambda t: t.numel() > 0, valid_dataset)))
        test_data = torch.cat(
            tuple(filter(lambda t: t.numel() > 0, test_dataset)))
        self._helper_test_func(len(train_data), 2049990, train_data[20:25],
                               [5024, 89, 21, 3, 1838])
        self._helper_test_func(len(test_data), 241859, test_data[30:35],
                               [914, 4, 36, 11, 569])
        self._helper_test_func(len(valid_data), 214417, valid_data[40:45],
                               [925, 8, 2, 150, 8575])

        vocab = train_dataset.get_vocab()
        tokens_ids = [
            vocab[token] for token in 'the player characters rest'.split()
        ]
        self.assertEqual(tokens_ids, [2, 286, 503, 700])

        # Add test for the subset of the standard datasets
        train_iter, valid_iter, test_iter = torchtext.experimental.datasets.raw.WikiText2(
            data_select=('train', 'valid', 'test'))
        self._helper_test_func(len(train_iter), 36718, next(iter(train_iter)),
                               ' \n')
        self._helper_test_func(len(valid_iter), 3760, next(iter(valid_iter)),
                               ' \n')
        self._helper_test_func(len(test_iter), 4358, next(iter(test_iter)),
                               ' \n')
        del train_iter, valid_iter, test_iter
        train_dataset, test_dataset = WikiText2(data_select=('train', 'test'))
        train_data = torch.cat(
            tuple(filter(lambda t: t.numel() > 0, train_dataset)))
        test_data = torch.cat(
            tuple(filter(lambda t: t.numel() > 0, test_dataset)))
        self._helper_test_func(len(train_data), 2049990, train_data[20:25],
                               [5024, 89, 21, 3, 1838])
        self._helper_test_func(len(test_data), 241859, test_data[30:35],
                               [914, 4, 36, 11, 569])

        conditional_remove(cachedir)
        conditional_remove(cachefile)
Example #7
    def test_wikitext2(self):
        from torchtext.experimental.datasets import WikiText2
        # smoke test to ensure wikitext2 works properly

        # NOTE
        # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility.
        # Keeping one's cache make the other fail. So we need to clean up the cache dir
        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
        conditional_remove(cachedir)
        cachefile = os.path.join(self.project_root, ".data",
                                 "wikitext-2-v1.zip")
        conditional_remove(cachefile)

        train_dataset, test_dataset, valid_dataset = WikiText2()
        self.assertEqual(len(train_dataset), 2049990)
        self.assertEqual(len(test_dataset), 241859)
        self.assertEqual(len(valid_dataset), 214417)

        vocab = train_dataset.get_vocab()
        tokens_ids = [
            vocab[token] for token in 'the player characters rest'.split()
        ]
        self.assertEqual(tokens_ids, [2, 286, 503, 700])

        conditional_remove(cachedir)
        conditional_remove(cachefile)
Example #8
    def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
        self.bptt = bptt
        train_iter = WikiText2(split='train')
        self.tokenizer = get_tokenizer('basic_english')
        counter = Counter()
        for line in train_iter:
            counter.update(self.tokenizer(line))
        self.vocab = Vocab(counter)
        train_iter, val_iter, test_iter = WikiText2()
        train_data = self.data_process(train_iter)
        val_data = self.data_process(val_iter)
        test_data = self.data_process(test_iter)

        self.train_data = self.batchify(train_data, train_batch_size)
        self.val_data = self.batchify(val_data, eval_batch_size)
        self.test_data = self.batchify(test_data, eval_batch_size)
Example #9
def gen_tokenizer_and_vocab():
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = RetiredVocab(counter)
    return tokenizer, vocab
Example #10
    def create_datasets(self):
        field = Field(tokenize=list)
        train, val, test = WikiText2.splits(field, root='wikitext2_data')
        field.build_vocab(train, vectors=None)
        trains, vals, _ = BPTTIterator.splits((train, val, test),
                                              batch_size=self.args.batch,
                                              bptt_len=self.args.bptt_len,
                                              device=torch.device('cpu'))
        return trains, vals
Example #11
def get_data():
    train_iter = WikiText2(split='train')  # download the train iterator
    counter = Counter()                    # instantiate a Counter instance
    # update the counter with the tokens (kind of a dictionary)
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)                 # create a Vocab from the counter

    train_iter, val_iter, test_iter = WikiText2()

    train_data = preprocess(train_iter, vocab)
    val_data = preprocess(val_iter, vocab)
    test_data = preprocess(test_iter, vocab)

    train_data = batchify(train_data, batch_size)
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)

    return train_data, val_data, test_data, vocab
Example #12
File: dataloader.py Project: hyunbool/TIL
def load_data(device):
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter, tokenizer, vocab)
    val_data = data_process(val_iter, tokenizer, vocab)
    test_data = data_process(test_iter, tokenizer, vocab)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchfy(train_data, batch_size, device)
    val_data = batchfy(val_data, eval_batch_size, device)
    test_data = batchfy(test_data, eval_batch_size, device)

    return vocab, train_data, val_data, test_data
Example #13
def WikiTexts(batch_size=32, bptt=30, vectors="glove.6B.100d"):
    my_tok = spacy.load('en')
    #my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
    #my_tok.tokenizer.add_special_case('<bos>', [{ORTH: '<bos>'}])
    #my_tok.tokenizer.add_special_case('<unk>', [{ORTH: '<unk>'}])
    TEXT = data.Field(lower=True, tokenize=spacy_tok)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, vectors=vectors)
    train_loader, val_loader, test_loader = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt,  # this is where we specify the sequence length
        #device=(0 if USE_GPU else -1),
        repeat=False)

    return train_loader, val_loader, test_loader, TEXT
Example #14
    def get_data(self):
        '''
        Retrieves data in a format that can
        be used in training by loading in batches.

        Returns
        -------
            edict
                Dictionary holding the train/valid batch loaders, the
                Torchtext train iterator, the vocab size, and the
                vocabulary taken from the Torchtext Field.
        '''
        TEXT = Field(tokenize=self.tokenizer, lower=True)

        train, valid, test = WikiText2.splits(TEXT)

        TEXT.build_vocab(train)  # build the vocabulary from the training split
        vocab_size = len(TEXT.vocab)

        train_iter, valid_iter = BPTTIterator.splits(
            (train, valid),
            batch_size=self.config.batch_size,
            bptt_len=8,
            device=self.device,
            repeat=False)

        train_loader = Batch(dl=train_iter, x_var='text')
        valid_loader = Batch(dl=valid_iter, x_var='text')

        print(len(train_loader))

        data_dict = edict({
            'train_loader': train_loader,
            'valid_loader': valid_loader,
            'train_iter': train_iter,
            'vocab_size': vocab_size,
            'vocab': TEXT.vocab
        })

        return data_dict
Example #15
    def test_wikitext2(self):
        from torchtext.experimental.datasets import WikiText2
        # smoke test to ensure wikitext2 works properly
        train_dataset, test_dataset, valid_dataset = WikiText2()
        self.assertEqual(len(train_dataset), 2049990)
        self.assertEqual(len(test_dataset), 241859)
        self.assertEqual(len(valid_dataset), 214417)

        vocab = train_dataset.get_vocab()
        tokens_ids = [
            vocab[token] for token in 'the player characters rest'.split()
        ]
        self.assertEqual(tokens_ids, [2, 286, 503, 700])

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "wikitext-2")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "wikitext-2-v1.zip")
        conditional_remove(datafile)
Example #16
def evaluate_lm(model_path):
    """
    Evaluate a language model against WikiText-2
    Arguments
    ---------
    model_path: string
        Path to the saved model to load
    """


    device = "cuda" if torch.cuda.is_available() else "cpu"

    model, TEXT = load_model(model_path, device)


    train, valid, test = WikiText2.splits(TEXT)


    BATCH_SIZE = 32
    BPTT_LEN = 30

    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=BATCH_SIZE,
        bptt_len=BPTT_LEN, # this is where we specify the sequence length
        device=device,
        repeat=False)

    criterion = nn.CrossEntropyLoss()

    model.eval()

    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)


    print(f"Valid loss      : {valid_loss:.3f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")

    print(f"Test loss      : {test_loss:.3f}")
    print(f"Test perplexity: {test_perplexity:.2f}")
Example #17
    for epoch in range(20):
        train(model)
        val_loss = evaluate(model, val_data)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        scheduler.step()

    best_val_ppl = math.exp(best_val_loss)
    nni.report_final_result(
        best_val_ppl
    )  # reports best validation ppl to nni as final result of one trial


if __name__ == "__main__":

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])

    n_token = len(vocab)
    base_model = Transformer(n_token)

    evaluator = FunctionalEvaluator(fit)
    exp = RetiariiExperiment(base_model, evaluator, [], strategy.Random())
    exp_config = RetiariiExeConfig('local')
    exp_config.experiment_name = 'transformer tuning'
    exp_config.trial_concurrency = 3  # please change configurations accordingly
    exp_config.max_trial_number = 25
    exp_config.trial_gpu_number = 1
Example #18
LABELS.build_vocab(train)


a = next(iter(data.BPTTIterator(train, 20, 20)))


train_iter, dev_iter, test_iter = data.BPTTIterator.splits(
    ([i.text for i in train], dev, test),
    bptt_len=13,
    batch_size=7,
    sort_key=lambda x: len(x.text),
    device='cpu')




# https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
from torchtext.datasets import WikiText2
train, valid, test = WikiText2.splits(TEXT) # loading custom datasets
len(train)


data.Example?







Example #19
def main(args):
    if args.device:
        device = args.device
    else:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    text_field = data.Field(tokenize=list)
    datasets = WikiText2.splits(text_field)
    text_field.build_vocab(datasets[0])

    train_iter, test_iter, val_iter = data.BPTTIterator.splits(datasets,
                                                               batch_size=32,
                                                               bptt_len=512,
                                                               device=device)

    vocab = text_field.vocab

    print(f'Vocab size: {len(vocab)}')

    model_args = dict(rnn_type='lstm',
                      ntoken=args.num_latents,
                      ninp=256,
                      nhid=1024,
                      nlayers=2)
    if args.model_args:
        model_args.update(dict(eval(args.model_args)))

    model = SHARNN(**model_args).to(device)
    model.train()

    criterion = nn.NLLLoss()

    #optim = torch.optim.SGD(model.parameters(), lr=5.0)
    optim = torch.optim.Adam(model.parameters(), lr=2e-3)

    for epoch in range(10):
        hidden = None
        mems = None

        total_loss = 0

        for step, batch in enumerate(train_iter):
            optim.zero_grad()

            if hidden is not None:
                hidden = repackage_hidden(hidden)
            if mems is not None:
                mems = repackage_hidden(mems)

            output, hidden, mems, attn_outs, _ = model(batch.text,
                                                       hidden,
                                                       return_h=True,
                                                       mems=mems)

            logits = model.decoder(output)
            logits = F.log_softmax(logits, dim=-1)

            assert logits.size(1) == batch.target.size(1)

            loss = criterion(logits.view(-1, logits.size(-1)),
                             batch.target.view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

            optim.step()

            total_loss += loss.data

            if step % args.log_interval == 0 and step > 0:
                cur_loss = total_loss / args.log_interval
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, step, len(train_iter),
                          optim.param_groups[0]['lr'], cur_loss,
                          math.exp(cur_loss), cur_loss / math.log(2)))
                total_loss = 0
Example #20
def main():
    args = parser.parse_args()
    tqdm.monitor_interval = 0
    tmp = os.environ.get('SLURM_TMPDIR')
    scratch = os.environ.get('SCRATCH')
    project = os.environ.get('project')

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(1)

    if args.dataset == 'WikiText103':
        train_iter = WikiText103(root=tmp, split='train')
        print(f'dataset {args.dataset}')
    elif args.dataset == 'WikiText2':
        train_iter = WikiText2(root=tmp, split='train')
        print(f'dataset {args.dataset}')
    else:
        print('dataset not implemented!')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])
    if args.dataset == 'WikiText103':
        train_iter, val_iter, test_iter = torchtext.datasets.WikiText103(
            root=tmp, split=('train', 'valid', 'test'))
    elif args.dataset == 'WikiText2':
        train_iter, val_iter, test_iter = torchtext.datasets.WikiText2(
            root=tmp, split=('train', 'valid', 'test'))
    else:
        print('dataset not implemented!')
    path = Path.cwd()
    if args.dataset == 'WikiText103':
        pathLog = path / 'logs/wikitext103'
        pathSaved = path / 'saved'
    else:
        pathLog = path / 'logs/wikitext2'
        pathSaved = path / 'saved/wikitext2'

    def data_process(raw_text_iter):
        """Converts raw text into a flat Tensor."""
        data = [
            torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter
        ]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def batchify(data, bsz):
        """Divides the data into bsz separate sequences, removing extra elements
        that wouldn't cleanly fit.

        Args:
            data: Tensor, shape [N]
            bsz: int, batch size

        Returns:
            Tensor of shape [N // bsz, bsz]
        """
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]
        data = data.view(bsz, seq_len).t().contiguous()
        return data

    batch_size = args.batch_size
    eval_batch_size = int(args.batch_size // 2)
    train_data = batchify(train_data,
                          batch_size)  # shape [seq_len, batch_size]
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)

    bptt = args.bptt

    def get_batch(source, i):
        """
        Args:
            source: Tensor, shape [full_seq_len, batch_size]
            i: int

        Returns:
            tuple (data, target), where data has shape [seq_len, batch_size] and
            target has shape [seq_len * batch_size]
        """
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].reshape(-1)
        return data, target

    ntokens = len(vocab)  # size of vocabulary
    emsize = args.emsize  # embedding dimension
    d_hid = args.d_hid  # dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = args.nlayers  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = args.nhead  # number of heads in nn.MultiheadAttention
    dropout = args.dropout  # dropout probability
    model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers,
                             dropout).to(device)

    criterion = nn.CrossEntropyLoss()
    n_gpus = torch.cuda.device_count()
    print(
        f'batch size: {batch_size}, bptt: {bptt}, seed: {args.seed}, ngpus: {n_gpus}'
    )
    print(f'len vocab: {ntokens}, embeddingSize: {emsize}, hiddenDim: {d_hid}')
    print(f'nlayers: {nlayers}, nAttentionHead: {nhead}, dropout: {dropout}')

    def train(model, train_data, bptt):
        model.train()  # turn on train mode
        total_loss = 0.
        count = 0
        log_interval = 5000
        # start_time = time.time()
        src_mask = generate_square_subsequent_mask(bptt).to(device)

        num_batches = len(range(0, train_data.size(0) - 1, bptt))
        progress = tqdm(total=num_batches)
        for batch, i in enumerate((range(0, train_data.size(0) - 1, bptt))):
            # if batch<140000: continue
            data, targets = get_batch(train_data, i)
            data = data.to(device)
            targets = targets.to(device)
            batch_size = data.size(0)
            if batch_size != bptt:  # only on last batch
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            loss = criterion(output.view(-1, ntokens), targets)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.detach() * batch_size
            count += 1

            if batch % log_interval == 0 and batch != 0:
                progress.update(log_interval)

        return total_loss / (len(train_data) - 1), count

    def evaluate(model, eval_data, bptt):
        model.eval()  # turn on evaluation mode
        total_loss = 0.
        src_mask = generate_square_subsequent_mask(bptt).to(device)

        with torch.no_grad():
            for batch, i in enumerate(range(0, eval_data.size(0) - 1, bptt)):
                data, targets = get_batch(eval_data, i)
                data = data.to(device)
                targets = targets.to(device)
                batch_size = data.size(0)
                if batch_size != bptt:
                    src_mask = src_mask[:batch_size, :batch_size]
                output = model(data, src_mask)
                output_flat = output.view(-1, ntokens)
                total_loss += batch_size * criterion(output_flat, targets)

        return total_loss / (len(eval_data) - 1)

    stepSize = len(range(0, train_data.size(0) - 1, bptt))
    lr = args.lr
    if args.optim == 'AdamW':
        optimizer = O.AdamW(model.parameters(), lr, weight_decay=args.wd)
    elif args.optim == 'Cons':
        optimizer = ConsciousLR(model.parameters(),
                                stepSize,
                                lr,
                                weight_decay=args.wd)
    elif args.optim == 'Agg':
        optimizer = ConsciousLR(model.parameters(),
                                stepSize,
                                lr,
                                weight_decay=args.wd,
                                lrHigh=2.,
                                lrLow=.5)
    elif args.optim == 'RAdamCons':
        optimizer = RAdamConsciousLR(model.parameters(),
                                     stepSize,
                                     lr,
                                     weight_decay=args.wd)
    elif args.optim == 'RAdamAgg':
        optimizer = RAdamConsciousLR(model.parameters(),
                                     stepSize,
                                     lr,
                                     weight_decay=args.wd,
                                     lrHigh=2.,
                                     lrLow=.5)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr, weight_decay=args.wd)
    elif args.optim == 'AdaBelief':
        optimizer = AdaBelief(model.parameters(), lr, weight_decay=args.wd)
    else:
        print('optimizer not implemented!!!')
    print(optimizer)

    best_test_loss = float('inf')
    epochs = args.max_epochs
    train_losses = []
    val_ppls = []
    test_ppls = []
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        trainLoss, count = train(model, train_data, bptt)
        trainLoss = trainLoss.item()
        train_losses.append(trainLoss)

        val_loss = evaluate(model, val_data, bptt)
        val_loss = val_loss.item()
        val_ppl = math.exp(val_loss)
        val_ppls.append(val_ppl)

        test_loss = evaluate(model, test_data, bptt)
        test_loss = test_loss.item()
        test_ppl = math.exp(test_loss)
        test_ppls.append(test_ppl)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(
            f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | trainLoss: {trainLoss:5.2f}'
            f' | valid ppl {val_ppl:8.2f}| test ppl {test_ppl:8.2f} |')
        print('-' * 89)

        if test_loss < best_test_loss:
            dic = {
                'model': model,
                'epoch': epoch,
                'val_loss': val_loss,
                'val_ppl': val_ppl,
                'train_loss': trainLoss,
                'test_ppl': test_ppl
            }
            if args.dataset == 'WikiText103':
                torch.save(dic,
                           pathSaved / f'{args.optim}_{args.lr}_103model.pt')
            else:
                torch.save(dic,
                           pathSaved / f'{args.optim}_{args.lr}_2model.pt')

            best_val_loss = val_loss
            best_val_ppl = val_ppl
            best_epoch = epoch
            best_train_loss = trainLoss
            best_test_loss = test_loss
            best_test_ppl = test_ppl
        log = {
            'train_losses': train_losses,
            "val_ppls": val_ppls,
            'test_ppls': test_ppls,
            'best_epoch': best_epoch,
            'best_val_ppl': best_val_ppl,
            'best_val_loss': best_val_loss,
            'best_train_loss': best_train_loss,
            'best_test_ppl': best_test_ppl,
            'best_test_loss': best_test_loss
        }
        if args.dataset == 'WikiText103':
            with open(pathLog / f'{args.optim}_{args.lr}_103.json', 'w') as fp:
                json.dump(log, fp)
        else:
            with open(pathLog / f'{args.optim}_{args.lr}_2.json', 'w') as fp:
                json.dump(log, fp)

    print(f'best test ppl: {best_test_ppl}')
    print(log)
Example #21
#   \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
#   \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
#   \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
#   \end{bmatrix}
#
# These columns are treated as independent by the model, which means that
# the dependence of ``G`` and ``F`` can not be learned, but allows more
# efficient batch processing.
#

import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                  specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def data_process(raw_text_iter):
    data = [
        torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        for item in raw_text_iter
    ]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))


train_iter, val_iter, test_iter = WikiText2()
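
This excerpt stops after recreating the three raw iterators; the comment above about independent columns refers to a batchify step that is not shown here. A minimal sketch of that step, assumed and modeled on the batchify helpers in the other examples on this page (the batch sizes are illustrative):

def batchify(data, bsz):
    # Arrange the flat token tensor into bsz independent columns,
    # trimming tokens that would not fit evenly (as described above).
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    return data.view(bsz, seq_len).t().contiguous()  # shape [seq_len, bsz]

train_data = batchify(data_process(train_iter), 20)  # batch_size = 20 (illustrative)
val_data = batchify(data_process(val_iter), 10)      # eval_batch_size = 10 (illustrative)
test_data = batchify(data_process(test_iter), 10)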
Example #22
def segment(doc):
    """
    Tokenize with the spaCy tokenizer, splitting a document into individual tokens.
    """

    tokenizer = tokenize.tokenizer
    return [token.text for token in tokenizer(doc)]


# Define the text Field: represent a piece of text, tokenized by the rule above and lowercased.
TEXT = data.Field(lower=True, tokenize=segment)

# torchtext.datasets ships some ready-made datasets, such as WikiText2 below. This
# call also creates a .data directory under the project root and downloads the
# data (4.4 MB); to spare the reader any confusion, a copy is kept in the data folder.
train_set, valid_set, test_set = WikiText2.splits(TEXT)

# Check how many entries each of train/valid/test contains (not yet tokenized).
print(len(train_set), len(valid_set), len(test_set), end="\n\n")

# Pretrained word vectors could also be attached while building the vocab; commented out here.
TEXT.build_vocab(train_set)  # vectors="data/glove.6B.200d"

# The core of language modeling here is the Iterator, which has the subclass
# BPTTIterator. Its special feature is cutting the text into consecutive,
# equal-length segments and batching them (called bptt), for example:
#
#   "Machine learning is a field of computer science
#    that gives computers the ability to learn without
#    being explicitly programmed"
#
# With a segment length of 5, the text above would produce the following list:
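
The excerpt ends before the promised list and the iterator appear; the continuation below is an assumed sketch (not from the original file), spelling out the length-5 segments and a minimal BPTTIterator over the splits built above:

#   [["Machine", "learning", "is", "a", "field"],
#    ["of", "computer", "science", "that", "gives"],
#    ["computers", "the", "ability", "to", "learn"],
#    ["without", "being", "explicitly", "programmed"]]   # last segment has only 4 tokens
#
# Illustrative batch size and bptt_len (not taken from the original file):
train_loader, valid_loader, test_loader = data.BPTTIterator.splits(
    (train_set, valid_set, test_set),
    batch_size=32,
    bptt_len=5,
    repeat=False)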
Example #23
def run_worker(rank, world_size):

    ######################################################################
    # Load and batch data
    # -------------------
    #

    ######################################################################
    # The training process uses Wikitext-2 dataset from ``torchtext``. The
    # vocab object is built based on the train dataset and is used to numericalize
    # tokens into tensors. Starting from sequential data, the ``batchify()``
    # function arranges the dataset into columns, trimming off any tokens remaining
    # after the data has been divided into batches of size ``batch_size``.
    # For instance, with the alphabet as the sequence (total length of 26)
    # and a batch size of 4, we would divide the alphabet into 4 sequences of
    # length 6:
    #
    # .. math::
    #   \begin{bmatrix}
    #   \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
    #   \end{bmatrix}
    #   \Rightarrow
    #   \begin{bmatrix}
    #   \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
    #   \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
    #   \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
    #   \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
    #   \end{bmatrix}
    #
    # These columns are treated as independent by the model, which means that
    # the dependence of ``G`` and ``F`` can not be learned, but allows more
    # efficient batch processing.
    #

    # In 'run_worker'
    def print_with_rank(msg):
        print('[RANK {}]: {}'.format(rank, msg))

    from torchtext.datasets import WikiText2
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    def data_process(raw_text_iter):
        data = [
            torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter
        ]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)

    device = torch.device(2 * rank)

    def batchify(data, bsz, rank, world_size, is_train=False):
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        # Divide the data across the ranks only for training data.
        if is_train:
            data_per_rank = data.size(0) // world_size
            data = data[rank * data_per_rank:(rank + 1) * data_per_rank]
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, rank, world_size, True)
    val_data = batchify(val_data, eval_batch_size, rank, world_size)
    test_data = batchify(test_data, eval_batch_size, rank, world_size)

    ######################################################################
    # Functions to generate input and target sequence
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #

    ######################################################################
    # ``get_batch()`` function generates the input and target sequence for
    # the transformer model. It subdivides the source data into chunks of
    # length ``bptt``. For the language modeling task, the model needs the
    # following words as ``Target``. For example, with a ``bptt`` value of 2,
    # we’d get the following two Variables for ``i`` = 0:
    #
    # .. image:: ../_static/img/transformer_input_target.png
    #
    # It should be noted that the chunks are along dimension 0, consistent
    # with the ``S`` dimension in the Transformer model. The batch dimension
    # ``N`` is along dimension 1.
    #

    # In 'run_worker'
    bptt = 35

    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        # Need batch dimension first for pipeline parallelism.
        return data.t(), target

######################################################################
# Model scale and Pipe initialization
# -----------------------------------
#

######################################################################
# To demonstrate training large Transformer models using pipeline parallelism,
# we scale up the Transformer layers appropriately. We use an embedding
# dimension of 4096, hidden size of 4096, 16 attention heads and 8 total
# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with
# **~1 billion** parameters.
#
# We need to initialize the `RPC Framework <https://pytorch.org/docs/stable/rpc.html>`__
# since Pipe depends on the RPC framework via `RRef <https://pytorch.org/docs/stable/rpc.html#rref>`__
# which allows for future expansion to cross host pipelining. We need to
# initialize the RPC framework with only a single worker since we're using a
# single process to drive multiple GPUs.
#
# The pipeline is then initialized with 8 transformer layers on one GPU and 8
# transformer layers on the other GPU. One pipe is set up across GPUs 0 and 1 and
# another across GPUs 2 and 3. Both pipes are then replicated using DistributedDataParallel.

# In 'run_worker'

    ntokens = len(vocab)  # the size of vocabulary
    emsize = 4096  # embedding dimension
    nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 8  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 16  # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value

    from torch.distributed import rpc
    tmpfile = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(tmpfile.name),
            # Specifying _transports and _channels is a workaround and we no longer
            # will have to specify _transports and _channels for PyTorch
            # versions >= 1.8.1
            _transports=["ibv", "uv"],
            _channels=["cuda_ipc", "cuda_basic"],
        ))

    # Num gpus for model parallelism.
    num_gpus = 2
    partition_len = ((nlayers - 1) // num_gpus) + 1

    # Add encoder in the beginning.
    tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)]
    module_list = []

    # Add all the necessary transformer blocks.
    for i in range(nlayers):
        transformer_block = TransformerEncoderLayer(emsize, nhead, nhid,
                                                    dropout)
        if i != 0 and i % (partition_len) == 0:
            module_list.append(nn.Sequential(*tmp_list))
            tmp_list = []
        device = i // (partition_len)
        tmp_list.append(transformer_block.to(2 * rank + device))

    # Add decoder in the end.
    tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1))
    module_list.append(nn.Sequential(*tmp_list))

    # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
    # doesn't work with DDP.
    from torch.distributed.pipeline.sync import Pipe
    chunks = 8
    model = Pipe(torch.nn.Sequential(*module_list),
                 chunks=chunks,
                 checkpoint="never")

    # Initialize process group and wrap model in DDP.
    from torch.nn.parallel import DistributedDataParallel
    import torch.distributed as dist
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    model = DistributedDataParallel(model)

    def get_total_params(module: torch.nn.Module):
        total_params = 0
        for param in module.parameters():
            total_params += param.numel()
        return total_params

    print_with_rank('Total parameters in model: {:,}'.format(
        get_total_params(model)))

    ######################################################################
    # Run the model
    # -------------
    #

    ######################################################################
    # `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
    # is applied to track the loss and
    # `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
    # implements stochastic gradient descent method as the optimizer. The initial
    # learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
    # applied to adjust the learn rate through epochs. During the
    # training, we use
    # `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
    # function to scale all the gradients together to prevent them from exploding.
    #

    # In 'run_worker'
    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    import time

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(vocab)

        # Train only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, train_data.size(0) - 1)

        for batch, i in enumerate(range(0, nbatches, bptt)):
            data, targets = get_batch(train_data, i)
            optimizer.zero_grad()
            # Since the Pipe is only within a single host and process, the ``RRef``
            # returned by the forward method is local to this node and can simply be
            # retrieved via ``RRef.local_value()``.
            output = model(data).local_value()
            # Need to move targets to the device where the output of the
            # pipeline resides.
            loss = criterion(output.view(-1, ntokens),
                             targets.cuda(2 * rank + 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 10
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | '
                                'lr {:02.2f} | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, batch, nbatches // bptt,
                                    scheduler.get_last_lr()[0],
                                    elapsed * 1000 / log_interval, cur_loss,
                                    math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    def evaluate(eval_model, data_source):
        eval_model.eval()  # Turn on the evaluation mode
        total_loss = 0.
        ntokens = len(vocab)
        # Evaluate only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, data_source.size(0) - 1)
        with torch.no_grad():
            for i in range(0, nbatches, bptt):
                data, targets = get_batch(data_source, i)
                output = eval_model(data).local_value()
                output_flat = output.view(-1, ntokens)
                # Need to move targets to the device where the output of the
                # pipeline resides.
                total_loss += len(data) * criterion(
                    output_flat, targets.cuda(2 * rank + 1)).item()
        return total_loss / (len(data_source) - 1)

######################################################################
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.

# In 'run_worker'

    best_val_loss = float("inf")
    epochs = 3  # The number of epochs
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(model, val_data)
        print_with_rank('-' * 89)
        print_with_rank(
            '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
        print_with_rank('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()

######################################################################
# Evaluate the model with the test dataset
# ----------------------------------------
#
# Apply the best model to check the result with the test dataset.

# In 'run_worker'
    test_loss = evaluate(best_model, test_data)
    print_with_rank('=' * 89)
    print_with_rank(
        '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
            test_loss, math.exp(test_loss)))
    print_with_rank('=' * 89)
Example #24
from src.model.train_evaluate import train, evaluate
from src.model.model_utils import data_process, batchify, gen_tokenizer_and_vocab
from src.settings import DIR_MODELS

"""
Note: the ipynb has a modified version of the code below; this should be functionalized and integrated
"""


if __name__ == '__main__':

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer, vocab = gen_tokenizer_and_vocab()

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter, vocab, tokenizer)
    val_data = data_process(val_iter, vocab, tokenizer)
    test_data = data_process(test_iter, vocab, tokenizer)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, device)
    val_data = batchify(val_data, eval_batch_size, device)
    test_data = batchify(test_data, eval_batch_size, device)

    ntokens = len(vocab.stoi)  # the size of vocabulary
    emsize = 200  # embedding dimension
    nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
Example #25
import spacy

from spacy.symbols import ORTH


def spacy_tok(x):
    return [tok.lower() for tok in x]


TEXT = data.Field(lower=True, tokenize=spacy_tok)

from torchtext.datasets import WikiText2

train, valid, test = WikiText2.splits(
    TEXT
)  # loading custom datasets requires passing in the field, but nothing else.

TEXT.build_vocab(train, vectors="glove.6B.200d")
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=30,  # this is where we specify the sequence length
    device=(0 if USE_GPU else -1),
    repeat=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
Example #26
    def forward(self, x):
        # add pe[:x.size(0), :] (shape [x.size(0), 1, d_model]) to x element-wise via broadcasting
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


# %% [markdown]
# # 1️⃣Load and batch data
import os
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, vocab

train_iter = WikiText2('data', split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                  specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def data_process(raw_text_iter):
    # tokenizer segments the text, and vocab converts tokens to integer ids
    data = [
        torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        for item in raw_text_iter
    ]
    # discard empty lines and concatenate the rest; numel() returns the number of elements in a tensor
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
Example #27
def train_lm(
    model_name, output_path, epochs=5, batch_size=32, bptt_len=35,
    lr=1e-3, optimizer="adam", min_freq=5, model_args={},
    scheduler_patience=5, scheduler_threshold=1e-4, early_stopping_tolerance=5):
    """
    Train and save a language model
    Arguments
    ---------
    model_name: string
        Can be "RNN", "QRNN"

    output_path: a path
        Where to save the model

    lr: float
        Learning rate, default = 1e-3

    model_args: dict
        Arguments to be passed to the created model


    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    TEXT = data.Field(
        tokenizer_language='en',
        lower=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )


    train, valid, test = WikiText2.splits(TEXT)

    TEXT.build_vocab(train, min_freq=min_freq)

    print(f"We have {len(TEXT.vocab)} tokens in our vocabulary")

    device = "cuda" if torch.cuda.is_available() else "cpu"


    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt_len, # this is where we specify the sequence length
        device=device,
        repeat=False
    )

    model = create_model(model_name, TEXT, model_args=model_args)
    if "awd" in model_name:
        optimizer = "asgd"
    optimizer = create_optimizer(model, optimizer, lr)
    criterion = nn.CrossEntropyLoss()

    print(f"Using LR Scheduler with patience {scheduler_patience} and threshold {scheduler_threshold}")
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=scheduler_patience, threshold=scheduler_threshold
    )

    model = model.to(device)
    criterion = criterion.to(device)

    model_path = output_path

    training_cycle(
        epochs=epochs,
        model=model, train_iter=train_iter, valid_iter=valid_iter,
        optimizer=optimizer, criterion=criterion, scheduler=lr_scheduler,
        model_path=model_path, early_stopping_tolerance=early_stopping_tolerance
    )

    model.load_state_dict(torch.load(model_path))
    model.eval()

    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)


    print(f"Valid loss      : {valid_loss:.2f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")

    print(f"Test loss      : {test_loss:.2f}")
    print(f"Test perplexity: {test_perplexity:.2f}")


    save_model(model, TEXT, output_path)
Example #28
def fit(model_cls):

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])

    def process_data(raw_text_iter):
        """Converts raw text into a flat Tensor."""
        data = [
            torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter
        ]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, _ = WikiText2()
    train_data = process_data(train_iter)
    val_data = process_data(val_iter)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def generate_batches(data, bsz):
        """Divides the data into bsz separate sequences."""
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]
        data = data.view(bsz, seq_len).t().contiguous()
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = generate_batches(train_data, batch_size)
    val_data = generate_batches(val_data, eval_batch_size)

    seq_len = 35

    def get_seq(source, i):
        """
        Args:
            source: Tensor, with size [full_seq_len, batch_size]
            i: int
            
        Returns:
            tuple (data, target): data has size [seq_len, batch_size]
            and target has size [seq_len * batch_size]
        """
        part_len = min(seq_len, len(source) - 1 - i)
        data = source[i:i + part_len]
        target = source[i + 1:i + 1 + part_len].reshape(-1)
        return data, target

    def generate_square_subsequent_mask(sz):
        """Generates an upper-triangular matrix of -inf, with zeros on diag."""
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

    model = model_cls().to(device)
    lr = 5.0
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    def train(model):
        model.train()
        src_mask = generate_square_subsequent_mask(seq_len).to(device)
        for i in range(0, train_data.size(0) - 1, seq_len):
            data, target = get_seq(train_data, i)
            part_len = data.size(0)
            if part_len != seq_len:
                src_mask = src_mask[:part_len, :part_len]
            output = model(data, src_mask)
            loss = F.cross_entropy(output.view(-1, output.size(-1)), target)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

    def evaluate(model, eval_data):
        model.eval()
        src_mask = generate_square_subsequent_mask(seq_len).to(device)
        total_loss = 0.
        with torch.no_grad():
            for i in range(0, eval_data.size(0) - 1, seq_len):
                data, target = get_seq(eval_data, i)
                part_len = data.size(0)
                if part_len != seq_len:
                    src_mask = src_mask[:part_len, :part_len]
                output = model(data, src_mask)
                output_flat = output.view(-1, output.size(-1))
                total_loss += part_len * F.cross_entropy(output_flat,
                                                         target).item()
        return total_loss / (len(eval_data) - 1)

    best_val_loss = float('inf')

    for epoch in range(20):
        train(model)
        val_loss = evaluate(model, val_data)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        scheduler.step()

    best_val_ppl = math.exp(best_val_loss)
    nni.report_final_result(
        best_val_ppl
    )  # reports best validation ppl to nni as final result of one trial
Example #29
def run_worker(ps_rref, data_dir, batch_size, num_epochs, worker, job_name):
    worker_rank = int(worker[-1])
    info_socketm = znet.SocketMsger.tcp_connect(DORKER0_IP, INFO_PORT)
    info_socketm.send("WORKER")
    info_socketm.send(f"1.0\n/home/ubuntu/measurement/logs/{job_name}_info{worker_rank}.log\n{job_name}")

    logger = Logger(job_name=job_name, file_dir=f"./measurement/logs/{job_name}_{worker}.log").logger

    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    ntokens = len(vocab)
    bptt = 35

    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    train_data = data_process(train_iter, vocab, tokenizer)
    train_data = batchify(train_data, batch_size)

    device_id = 0
    device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
    name = rpc.get_worker_info().name

    ps_rref.rpc_sync().set_ps_launched_to_true()

    m = ps_rref.rpc_sync().get_model().to(device)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    stop_flag = False

    info_socketm.send("START")
    if info_socketm.recv() != "CONFIRM":
        return

    cm_t1_end = time.time()
    tt0 = time.time()

    for epoch in range(num_epochs):
        for batch_idx, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
            data, target = get_batch(train_data, i, bptt)
            data, target = data.to(device), target.to(device)
            output = m(data)
            output = output.view(-1, ntokens)
            loss = criterion(output, target)
            loss.backward()

            cm_t0_start = time.time()
            cp_t = 1000 * (cm_t0_start - cm_t1_end)

            logger.info("{:8s} | Epoch: {:3d} | Batch: {:3d} | Loss: {:6.2f} | Computation Time: {:7.2f} ms"
                        .format(name, (epoch + 1), (batch_idx + 1), loss.item(), cp_t))

            m, stop_flag = rpc.rpc_sync(
                to=ps_rref.owner(),
                func=ParameterServer.update_and_fetch_model,
                args=(ps_rref, [p.grad for p in m.cpu().parameters()], name, epoch, batch_idx, cm_t0_start, cm_t1_end))
            m.to(device)

            cm_t1_end = time.time()

            if stop_flag:
                break

        if stop_flag:
            break

    tt1 = time.time()

    info_socketm.send("END")

    logger.info("Time: {:.2f} seconds".format((tt1 - tt0)))
Example #30
            loss = criterion(outs.view(-1, outs.size(-1)), targets.view(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(devLoader)


###############################################################################
# Load data
###############################################################################
configfile = open('./config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
device = torch.device(args.device)

# ? include lengths
TEXT = Field(lower=True, include_lengths=False, batch_first=False)
# TEXT: split string into tokens
trainSet, devSet, testSet = WikiText2.splits(text_field=TEXT,
                                             root=config.data.data_root)
if config.model.rnn.pretrained_embedding:
    vec = torchtext.vocab.FastText(language='en',
                                   cache=config.data.fasttext_root)
    assert vec.dim == config.model.rnn.nemd
else:
    vec = None
TEXT.build_vocab(trainSet, vectors=vec)
# TEXT: numericalize, pad, add init_token and eos_token
trainLoader, devLoader, testLoader = BPTTIterator.splits(
    (trainSet, devSet, testSet),
    batch_size=config.data.BSZ,
    bptt_len=config.data.bptt_len,
    device=device)
assert len(TEXT.vocab) == config.data.vocabSize