def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                        devices=None, evaluator=None, **kwargs):
    """Train a rating-prediction model and report loss, RMSE and throughput.

    Args:
        net: Model invoked as ``net(*features)`` on every device shard.
        train_iter: Iterable of training batches. A batch is either a list
            of arrays (all but the last are features, the last is the label)
            or a single array, which is then used as both feature and label.
        test_iter: Data handed unchanged to ``evaluator``.
        loss: Callable ``loss(pred, label)`` returning a per-shard loss array.
        trainer: Optimizer wrapper; ``trainer.step(batch_size)`` is called
            once per batch.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices to shard each batch across. ``None`` (the default)
            resolves to ``d2l.try_all_gpus()`` when the function is called.
        evaluator: Callable returning the test RMSE. Required in practice
            even though it defaults to ``None``. It is called as
            ``evaluator(net, test_iter, devices)``, or as
            ``evaluator(net, test_iter, kwargs['inter_mat'], devices)`` when
            any extra keyword argument is supplied.
        **kwargs: Optional extras; when non-empty, ``'inter_mat'`` must be
            present (a missing key raises ``KeyError``).
    """
    # Resolve the default lazily: a call in the signature would run once at
    # definition time and freeze whatever devices were visible back then.
    if devices is None:
        devices = d2l.try_all_gpus()
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    for epoch in tqdm.tqdm(range(num_epochs)):
        # metric holds: summed per-example loss, no. of examples, no. of
        # elements in the first input array. epoch_loss sums the per-batch
        # mean losses for the curve plotted by the animator.
        metric, epoch_loss = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            timer.start()
            values = values if isinstance(values, list) else [values]
            input_data = [gluon.utils.split_and_load(v, devices)
                          for v in values]
            train_feat = input_data[0:-1] if len(values) > 1 else input_data
            train_label = input_data[-1]
            with autograd.record():
                preds = [net(*t) for t in zip(*train_feat)]
                ls = [loss(p, s) for p, s in zip(preds, train_label)]
            for shard_loss in ls:
                shard_loss.backward()
            batch_size = values[0].shape[0]
            batch_loss = (sum([shard_loss.asnumpy() for shard_loss in ls])
                          .mean() / len(devices))
            epoch_loss += batch_loss
            trainer.step(batch_size)
            # Accumulate this batch's own loss, weighted by its size, so that
            # metric[0] / metric[1] is a per-example mean. The previous code
            # added the running total on every batch instead.
            metric.add(batch_loss * batch_size, batch_size, values[0].size)
            timer.stop()
        if kwargs:  # It will be used in section AutoRec
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  devices)
        else:
            test_rmse = evaluator(net, test_iter, devices)
        train_l = epoch_loss / (i + 1)
        animator.add(epoch + 1, (train_l, test_rmse))
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test RMSE {test_rmse:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
# NOTE(review): this line is a whitespace-collapsed fragment and is kept
# verbatim. It opens with the tail of a training loop whose header is not
# visible here (metric update, evaluator call, animator update and the two
# summary prints). After the first `# %%` marker it loads MovieLens-100k via
# `d2l.split_and_load_ml100k` (test_ratio 0.1, batch size 512), builds
# `MF(30, num_users, num_items)`, initialises it with Normal(0.01) on all
# devices and trains it for 20 epochs with Adam (lr 0.002, wd 1e-5) and L2
# loss through `train_recsys_rating`. After the second `#%%` marker it calls
# the net on the index pair ([20], [30]) -- presumably a user id and an item
# id; confirm against `MF.forward`.
# NOTE(review): `evaluator` and `MF` are used but not defined in this
# fragment -- confirm they are bound earlier in the file.
metric.add(l, values[0].shape[0], values[0].size) timer.stop() if len(kwargs) > 0: # It will be used in section AutoRec test_rmse = evaluator(net, test_iter, kwargs['inter_mat'], devices) else: test_rmse = evaluator(net, test_iter, devices) train_l = l / (i + 1) animator.add(epoch + 1, (train_l, test_rmse)) print(f'train loss {metric[0] / metric[1]:.3f}, ' f'test RMSE {test_rmse:.3f}') print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(devices)}') # %% devices = d2l.try_all_gpus() num_users, num_items, train_iter, test_iter = d2l.split_and_load_ml100k( test_ratio=0.1, batch_size=512) net = MF(30, num_users, num_items) net.initialize(ctx=devices, force_reinit=True, init=mx.init.Normal(0.01)) lr, num_epochs, wd, optimizer = 0.002, 20, 1e-5, 'adam' loss = gluon.loss.L2Loss() trainer = gluon.Trainer(net.collect_params(), optimizer, { "learning_rate": lr, "wd": wd }) train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs, devices, evaluator) #%% scores = net(np.array([20], dtype='int', ctx=devices[0]), np.array([30], dtype='int', ctx=devices[0]))
""" from d2l import mxnet as d2l from mxnet import autograd, gluon, init, np, npx batch_size, max_len = 512, 64 train_iter, vocab = d2l.load_data_wiki(batch_size, max_len) net = d2l.BERTModel(len(vocab), num_hiddens=128, ffn_num_hiddens=256, num_heads=2, num_layers=2, dropout=0.2) ctx = d2l.try_all_gpus() net.initialize(init.Xavier(), ctx=ctx) loss = gluon.loss.SoftmaxCELoss() #Get two loss functions def _get_batch_loss_bert(net, loss, vocab_size, tokens_X_shards, segments_X_shards, valid_lens_x_shards, pred_positions_X_shards, mlm_weights_X_shards, mlm_Y_shards, nsp_y_shards): mlm_ls, nsp_ls, ls = [], [], [] for (tokens_X_shard, segments_X_shard, valid_lens_x_shard, pred_positions_X_shard, mlm_weights_X_shard, mlm_Y_shard, nsp_y_shard) in zip(tokens_X_shards, segments_X_shards, valid_lens_x_shards, pred_positions_X_shards, mlm_weights_X_shards, mlm_Y_shards, nsp_y_shards):
# Run the transposed convolution and move the channel axis last so the
# result can be shown next to the input image.
Y = conv_trans(X)
out_img = Y[0].transpose(1, 2, 0)
d2l.set_figsize()
print('input image shape:', img.shape)
d2l.plt.imshow(img.asnumpy())
print('output image shape:', out_img.shape)
d2l.plt.imshow(out_img.asnumpy())

# The last layer starts from a bilinear kernel; the layer before it uses
# Xavier initialisation.
W = bilinear_kernel(num_classes, num_classes, 64)
net[-1].initialize(init.Constant(W))
net[-2].initialize(init=init.Xavier())

# Pascal VOC data, cropped to a fixed size.
batch_size = 32
crop_size = (320, 480)
train_iter, test_iter = d2l.load_data_voc(batch_size, crop_size)

# Training setup: SGD with weight decay on every available device.
devices = d2l.try_all_gpus()
num_epochs = 5
lr = 0.1
wd = 1e-3
loss = gluon.loss.SoftmaxCrossEntropyLoss(axis=1)
net.collect_params().reset_ctx(devices)
sgd_options = {'learning_rate': lr, 'wd': wd}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_options)
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)


def predict(img):
    """Return the per-pixel argmax of the network output for one image.

    The image is normalised with the test dataset's `normalize_image`,
    transposed with (2, 0, 1), given a leading axis of size 1 and evaluated
    on the first device. The argmax over axis 1 is reshaped to the last two
    dimensions of the prediction.
    """
    normalized = test_iter._dataset.normalize_image(img)
    batch = np.expand_dims(normalized.transpose(2, 0, 1), axis=0)
    scores = net(batch.as_in_ctx(devices[0]))
    pred = scores.argmax(axis=1)
    rows, cols = pred.shape[1], pred.shape[2]
    return pred.reshape(rows, cols)
# NOTE(review): this line is a whitespace-collapsed fragment and is kept
# verbatim. It opens with the tail of a `forward` method whose header is not
# visible here: the encoder output at the first and last time step is
# concatenated along axis 1 and passed to the decoder. It then builds a
# `BiRNN` (embed_size 100, num_hiddens 100, num_layers 2), initialises it
# with Xavier on all devices and loads a GloVe token embedding.
# NOTE(review): `embed_size` is 100 and the accompanying text says
# "100-dimensional", but the embedding requested is 'glove.6b.300d' --
# confirm the intended embedding name before these vectors are copied into
# the model's embedding layer.
# rnn.LSTM, it only returns the hidden states of the last hidden layer # at different time step (outputs). The shape of `outputs` is # (no. of words, batch size, 2 * no. of hidden units). outputs = self.encoder(embeddings) # Concatenate the hidden states of the initial time step and final # time step to use as the input of the fully connected layer. Its # shape is (batch size, 4 * no. of hidden units) encoding = np.concatenate((outputs[0], outputs[-1]), axis=1) outs = self.decoder(encoding) return outs """Create a bidirectional recurrent neural network with two hidden layers. """ embed_size, num_hiddens, num_layers, devices = 100, 100, 2, d2l.try_all_gpus() net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers) net.initialize(init.Xavier(), ctx=devices) """### Loading Pre-trained Word Vectors Because the training dataset for sentiment classification is not very large, in order to deal with overfitting, we will directly use word vectors pre-trained on a larger corpus as the feature vectors of all words. Here, we load a 100-dimensional GloVe word vector for each word in the dictionary `vocab`. """ glove_embedding = d2l.TokenEmbedding('glove.6b.300d') """Query the word vectors that in our vocabulary. """
# NOTE(review): this line is a whitespace-collapsed fragment and is kept
# verbatim. It opens with the tail of a training function whose header is
# not visible here (validation point added to the animator, then the final
# loss and throughput prints). It then sets the hyperparameters, builds and
# hybridizes a net from `get_net(devices)` and trains it with validation
# data; finally it builds a fresh net and retrains it on
# `train_valid_iter` with no validation iterator.
# NOTE(review): the first `train` call passes the literal 100 as its fourth
# positional argument, while `num_epochs` is set to 5 just above and is what
# the second call passes in that position -- confirm whether 100 is intended.
animator.add(epoch + 1, (None, valid_loss)) if valid_iter is not None: print(f'train loss {metric[0] / metric[1]:.3f}, ' f'valid loss {valid_loss:.3f}') else: print(f'train loss {metric[0] / metric[1]:.3f}') print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(devices)}') """## Training and Validating the Model Now, we can train and validate the model. The following hyperparameters can be tuned. For example, we can increase the number of epochs. Because `lr_period` and `lr_decay` are set to 10 and 0.1 respectively, the learning rate of the optimization algorithm will be multiplied by 0.1 after every 10 epochs. """ devices, num_epochs, lr, wd = d2l.try_all_gpus(), 5, 0.01, 1e-4 lr_period, lr_decay, net = 10, 0.1, get_net(devices) net.hybridize() train(net, train_iter, valid_iter, 100, lr, wd, devices, lr_period, lr_decay) """## Classifying the Testing Set and Submitting Results on Kaggle After obtaining a satisfactory model design and hyperparameters, we use all training datasets (including validation sets) to retrain the model and then classify the testing set. Note that predictions are made by the output network we just trained. """ net = get_net(devices) net.hybridize() train(net, train_valid_iter, None, num_epochs, lr, wd, devices, lr_period, lr_decay)
# NOTE(review): this line is a whitespace-collapsed fragment and is kept
# verbatim. It opens with the tail of a class whose header is not visible
# here: an `Aggregate(num_hiddens, 3)` layer is created and `forward` embeds
# the premises and hypotheses, then applies attend, compare and aggregate.
# The script part loads SNLI (batch size 256, 50 steps), builds a
# `DecomposableAttention` net, initialises it with Xavier and overwrites the
# embedding weights with GloVe vectors looked up for `vocab.idx_to_token`.
# `split_batch_multi_inputs` shards every feature in `X` and the labels `y`
# across `devices` with `even_split=False`, and regroups the feature shards
# per device with `zip`. The line ends by setting `lr` and `num_epochs`.
# NOTE(review): `load_data_snli` is called unqualified (no `d2l.` prefix) and
# the embedding name is 'glove.6B.100d.txt' -- confirm both resolve in this
# file's environment.
# There are 3 possible outputs: entailment, contradiction, and neutral self.aggregate = Aggregate(num_hiddens, 3) def forward(self, X): premises, hypotheses = X A = self.embedding(premises) B = self.embedding(hypotheses) beta, alpha = self.attend(A, B) V_A, V_B = self.compare(A, B, beta, alpha) Y_hat = self.aggregate(V_A, V_B) return Y_hat batch_size, num_steps = 256, 50 train_iter, test_iter, vocab = load_data_snli(batch_size, num_steps) embed_size, num_hiddens, devices = 100, 200, d2l.try_all_gpus() net = DecomposableAttention(vocab, embed_size, num_hiddens) net.initialize(init.Xavier(), ctx=devices) #download glove.6B.100d before running below line glove_embedding = d2l.TokenEmbedding('glove.6B.100d.txt') embeds = glove_embedding[vocab.idx_to_token] net.embedding.weight.set_data(embeds) #@save def split_batch_multi_inputs(X, y, devices): """Split multi-input `X` and `y` into multiple devices.""" X = list(zip(*[gluon.utils.split_and_load( feature, devices, even_split=False) for feature in X])) return (X, gluon.utils.split_and_load(y, devices, even_split=False)) lr, num_epochs = 0.001, 4