def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = utils.data_iter_random
    else:
        data_iter_fn = utils.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # With consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:  # With random sampling, re-initialize the hidden state before each minibatch update
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Otherwise, detach the hidden state from the computation graph so that
                # the gradient computation only depends on the minibatch read in this
                # iteration (keeping the backward pass from growing ever more expensive)
                for s in state:
                    s.detach_()

            inputs = to_onehot(X, vocab_size)
            # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation, the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose it and flatten it into a
            # vector of length num_steps * batch_size so it lines up row-for-row with outputs
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            # Compute the average classification error with the cross-entropy loss
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            utils.sgd(params, lr, 1)  # the loss is already averaged, so no further gradient averaging is needed
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, device, idx_to_char,
                                char_to_idx))
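
The `grad_clipping` and `utils.sgd` helpers are called above but not defined in this excerpt. A minimal PyTorch sketch of what they might look like (an assumption, not necessarily the exact utilities this snippet imports): clip all parameter gradients by their global L2 norm, then take one plain SGD step.

import torch

def grad_clipping(params, theta, device):
    # Clip gradients by their global L2 norm to mitigate exploding gradients.
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)

def sgd(params, lr, batch_size):
    # Plain minibatch SGD; the training loop passes batch_size=1 because the loss is already averaged.
    for param in params:
        param.data -= lr * param.grad / batch_size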
Example #2
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):  # 250
        if not is_random_iter:  # With consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:  # [32, 35], [32, 35]
            if is_random_iter:  # With random sampling, re-initialize the hidden state before each minibatch update
                state = init_rnn_state(batch_size, num_hiddens,
                                       device)  # [32, 256]
            else:  # Otherwise, detach the hidden state from the computation graph
                for s in state:
                    s.detach_()
            # inputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)  # [35, 32, 1027]
            # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state,
                                   params)  # [35, 32, 1027], [1, 32, 256]
            # After concatenation, the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)  # [1120, 1027]
            # Y has shape (batch_size, num_steps); transpose it and flatten it into a
            # vector of shape (num_steps * batch_size,) so it lines up row-for-row with outputs
            y = torch.flatten(Y.t())  # [1120,]
            # Compute the average classification error with the cross-entropy loss
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already averaged, so no further gradient averaging is needed
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, device, idx_to_char,
                                char_to_idx))
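
Both versions rely on a `to_onehot` helper that is not shown. A minimal PyTorch sketch, assuming it turns a (batch_size, num_steps) index tensor into a list of num_steps one-hot matrices of shape (batch_size, vocab_size):

import torch

def one_hot(x, n_class, dtype=torch.float32):
    # x: 1-D tensor of character indices -> (len(x), n_class) one-hot matrix
    x = x.long()
    res = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    res.scatter_(1, x.view(-1, 1), 1)
    return res

def to_onehot(X, n_class):
    # X: (batch_size, num_steps) -> list of num_steps tensors of shape (batch_size, n_class)
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]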
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = us.data_iter_random
    else:
        data_iter_fn = us.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):  # just the number of training epochs, independent of how many minibatches the data is split into
        if not is_random_iter:  # With consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        # Produce minibatches of lyrics from all of the training data for minibatch SGD
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        # Each minibatch yields a set of lyric-segment index sequences X (each a multi-time-step
        # input to the RNN) and the corresponding next-character label sequences Y
        for X, Y in data_iter:
            if is_random_iter:  # With random sampling, re-initialize the hidden state before each minibatch update
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # Otherwise, detach the hidden state from the computation graph
                '''
                When adjacent minibatches are chained together by carrying the hidden state
                forward, the gradient computation for the model parameters depends on every
                minibatch in the chain, so within one epoch the cost of the backward pass keeps
                growing as iterations accumulate. To make the gradient computation depend only
                on the minibatch read in the current iteration, detach the hidden state from
                the computation graph before reading each minibatch.
                With consecutive sampling, each row (lyric segment) stays semantically contiguous
                across minibatches, which is what lets the RNN carry the hidden state between
                them and strengthens continuity during training.
                '''
                for s in state:
                    s.detach()
            with autograd.record():
                # Convert each lyric segment in the minibatch to one-hot vectors
                inputs = us.to_onehot(X, vocab_size)
                # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # After concatenation, the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose it and flatten it into a
                # vector of length num_steps * batch_size. Since X and Y were aligned to begin
                # with (each lyric sequence paired with its next-character labels), the rows
                # still correspond one-to-one with the stacked outputs.
                y = Y.T.reshape((-1,))
                # Cross-entropy per character (one one-hot row each), then averaged
                l = loss(outputs, y).mean()
            l.backward()
            us.grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            # The minibatch cross-entropy over all characters was already averaged above,
            # so take one SGD step on it without further averaging
            us.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size  # running total loss over all minibatches
            n += y.size  # running total character count over all minibatches

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))  # exponentiate the average per-character loss to report perplexity
            # Every pred_period epochs (each a full pass over all minibatches), print the
            # perplexity and a few predicted lyric segments
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
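
The MXNet variant leans on `us.to_onehot`, `us.grad_clipping`, and `us.sgd`. A minimal sketch of what those helpers might look like (assumed, following the shapes and call signatures used above):

from mxnet import nd

def to_onehot(X, size):
    # X: (batch_size, num_steps) -> list of num_steps arrays of shape (batch_size, size)
    return [nd.one_hot(x, size) for x in X.T]

def grad_clipping(params, theta, ctx):
    # Clip gradients by their global L2 norm.
    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

def sgd(params, lr, batch_size):
    # Plain minibatch SGD; batch_size=1 above because the loss was already averaged.
    for param in params:
        param[:] = param - lr * param.grad / batch_size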
Example #4
def train(options, train_data, valid_data, test_data):
    np.random.seed(12345)

    if not os.path.exists(options['saveto']):
        os.makedirs(options['saveto'])

    print 'Building the model...'
    params = init_params(options)
    users_id, items_id, bow, y, y_pred, bow_pred, mse, nll, cost = build_model(options, params)

    print 'Computing gradients...'
    lrt = sharedX(options['lr'])
    grads = T.grad(cost, params.values())
    updates = sgd(params.values(), grads, lrt)

    print 'Compiling theano functions...'
    eval_fn = theano.function([users_id, items_id, y], mse)
    train_fn = theano.function([users_id, items_id, bow, y], [cost, mse, nll],
                               updates=updates)

    print "Training..."
    train_iter = MultiFixDimIterator(*train_data, batch_size=options['batch_size'],
                                     shuffle=True)
    valid_iter = MultiFixDimIterator(*valid_data, batch_size=100)
    test_iter  = MultiFixDimIterator(*test_data,  batch_size=100)
    best_valid = float('inf')
    best_test  = float('inf')

    n_batches = np.ceil(train_data[0].shape[0]*1./options['batch_size']).astype('int')
    disp_str = ['Train COST', 'Train MSE', 'Train NLL']

    for eidx in range(options['n_epochs']):
        accum_cost, accum_mse, accum_nll = 0., 0., 0.
        for batch in train_iter:
            batch = prepare_batch_data(options, batch)
            b_cost, b_mse, b_nll = train_fn(*batch)
            accum_cost += b_cost
            accum_mse  += b_mse
            accum_nll  += b_nll

        disp_val = [val/n_batches for val in [accum_cost, accum_mse, accum_nll]]
        res_str = ('[%d] ' % eidx) + ", ".join("%s: %.4f" %(s,v) for s,v in
                                               zip(disp_str, disp_val))
        print res_str

        if (eidx+1) % options['valid_freq'] == 0:
            disp_val = [np.mean([eval_fn(*vbatch) for vbatch in valid_iter]),
                        np.mean([eval_fn(*tbatch) for tbatch in test_iter])]
            res_str = ", ".join("%s: %.4f" %(s,v) for s,v in
                                zip(['Valid MSE', 'Test MSE'], disp_val))
            print res_str

            if best_valid > disp_val[0]:
                best_valid, best_test = disp_val
                dump_params(options['saveto'], eidx, "best_params", params)

    print "Done training..."
    print "Best Valid MSE: %.4f and Test MSE: %.4f" % best_test
Example #5
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l = l.sum()

            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            utils.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b), train_labels).mean().item())
        test_ls.append(loss(net(test_features, w, b), test_labels).mean().item())
    utils.semilogy(range(1, num_epochs+1), train_ls, 'epochs', 'loss',
                   range(1, num_epochs+1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
def fit_and_plot(lambd):
    w = nd.random.normal(scale=1, shape=true_w.shape)
    b = nd.zeros(shape=(1, ))
    w.attach_grad()
    b.attach_grad()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            utils.sgd([w, b], learning_rate, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    utils.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                   range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print("L2 norm of w:", w.norm().asscalar())
Example #7
 def fit(self, X, y, eta=0.1, epochs=10000):
     """Fits multiclass SVM
     
     :param X: array-like, shape = [num_samples,num_inFeatures], input data
     :param y: array-like, shape = [num_samples,], input classes
     :param eta: learning rate for SGD
     :param epochs: maximum number of training epochs
     :return: self
     """
     self.coef_ = sgd(X, y, self.n_out, self.subgradient, eta, epochs)
     self.is_fit = True
     return self
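
Here `sgd` is a generic subgradient-descent driver and `self.subgradient` supplies the multiclass-SVM subgradient. A rough sketch under the assumption that `subgradient(W, x, y)` returns a matrix of the same shape as the weight matrix W (hypothetical signature, not taken from the source):

import numpy as np

def sgd(X, y, n_out, subgradient, eta, epochs):
    # One randomly chosen sample per update step.
    n_samples, n_features = X.shape
    W = np.zeros((n_out, n_features))
    for _ in range(epochs):
        i = np.random.randint(n_samples)
        W -= eta * subgradient(W, X[i], y[i])
    return W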
Example #8
    def configure(self, flags):

        for name, para in self.network.named_parameters():
            print(name, para.size())

        self.optimizer = sgd(model=self.network,
                             parameters=self.network.parameters(),
                             lr=flags.lr,
                             weight_decay=flags.weight_decay,
                             momentum=flags.momentum)

        self.scheduler = lr_scheduler.StepLR(optimizer=self.optimizer, step_size=flags.step_size, gamma=0.1)
        self.loss_fn = crossentropyloss()
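
The lowercase `sgd` and `crossentropyloss` factories used by configure() are not shown; they are presumably thin wrappers over the standard PyTorch objects. A minimal sketch of that assumption:

import torch.nn as nn
import torch.optim as optim

def sgd(model, parameters, lr, weight_decay, momentum):
    # `model` is accepted only for interface compatibility; the optimizer needs the parameters.
    return optim.SGD(parameters, lr=lr, weight_decay=weight_decay, momentum=momentum)

def crossentropyloss():
    return nn.CrossEntropyLoss()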
def train(rnn, get_params, init_rnn_state, num_hiddens, vocabulary_size,
          context, indices, index_to_char, char_to_index, is_random_iter,
          num_epochs, num_steps, learning_rate, clipping_theta, batch_size,
          predict_period, predict_length, prefixes):
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        l_sum = 0.0
        n = 0
        start = time.time()
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, context)
        for X, y in data_iter(indices, batch_size, num_steps, is_random_iter,
                              context):
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, context)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(nd.array(X), vocabulary_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = nd.array(y).T.reshape((-1, ))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, context)
            utils.sgd(params, learning_rate, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % predict_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    '-',
                    predict(prefix, predict_length, rnn, params,
                            init_rnn_state, num_hiddens, vocabulary_size,
                            context, index_to_char, char_to_index))
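
This variant folds both sampling modes into a single `data_iter` helper. A minimal MXNet sketch of what it might look like, assuming the samplers follow the shapes used above (a hypothetical reconstruction, not the snippet's own utility module):

import random
from mxnet import nd

def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    # Random sampling: each example is an arbitrary num_steps slice of the corpus,
    # so the hidden state is re-initialized for every minibatch.
    num_examples = (len(corpus_indices) - 1) // num_steps
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)
    def take(pos):
        return corpus_indices[pos:pos + num_steps]
    for i in range(num_examples // batch_size):
        batch = example_indices[i * batch_size:(i + 1) * batch_size]
        X = [take(j * num_steps) for j in batch]
        Y = [take(j * num_steps + 1) for j in batch]
        yield nd.array(X, ctx), nd.array(Y, ctx)

def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    # Consecutive sampling: adjacent minibatches are contiguous in the corpus,
    # which is what allows the hidden state to be carried across minibatches.
    corpus = nd.array(corpus_indices, ctx=ctx)
    batch_len = len(corpus_indices) // batch_size
    indices = corpus[:batch_size * batch_len].reshape((batch_size, batch_len))
    for i in range((batch_len - 1) // num_steps):
        j = i * num_steps
        yield indices[:, j:j + num_steps], indices[:, j + 1:j + num_steps + 1]

def data_iter(indices, batch_size, num_steps, is_random_iter, ctx):
    # Dispatcher matching the call in train() above.
    sampler = data_iter_random if is_random_iter else data_iter_consecutive
    return sampler(indices, batch_size, num_steps, ctx)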
    def fit_and_plot(self, lambd):
        [w, b] = self.init_weights()
        train_iter, train_features, test_features, train_labels, test_labels = self.load_dataset(
        )
        train_ls, test_ls = [], []
        for _ in range(self.n_epochs):
            for X, y in train_iter:
                l = self.loss()(self.net()(X, w, b),
                                y) + lambd * self.l2_penalty(w)
                l = l.sum()

                if w.grad is not None:
                    w.grad.data.zero_()
                    b.grad.data.zero_()
                l.backward()
                utils.sgd([w, b], self.lr, self.batch_size)
            train_ls.append(self.loss()(self.net()(train_features, w, b),
                                        train_labels).mean().item())
            test_ls.append(self.loss()(self.net()(test_features, w, b),
                                       test_labels).mean().item())
        utils.semilogy(range(1, self.n_epochs + 1), train_ls, 'epoch', 'loss',
                       range(1, self.n_epochs + 1), test_ls, ['train', 'test'])
        print('L2 norm of w:', w.norm().item())
Example #11
 def fit_sgd(self, Y, R):
     n_jokes = Y.shape[0]
     n_users = Y.shape[1]
     X, Theta = utils.init_par(n_users, n_jokes, self.n_features)
     start = time.time()
     for i in range(self.n_iter):
         
         X, Theta = utils.sgd(X, Theta, Y, self.lamb, R, init_learning_rate=self.learning_rate, max_iter=8)
         J = utils.cost(X, Theta, Y, self.lamb, R)
         print('cost: ' + str(J),', n_iter: '+str(i))
         if J < 200:
             break
     self.features = X
     self.coef = Theta
     self.cost = utils.cost(X, Theta, Y, self.lamb, R)
     end = time.time()
     self.train_time = end-start
     print('final cost: '+ str(self.cost),'\n'
           'train time: '+str(self.train_time))
     return
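
`utils.sgd` and `utils.cost` are not shown; given the shapes above (Y is jokes x users, X holds joke features, Theta holds user features, R masks observed ratings), the cost is presumably a masked, regularized squared reconstruction error. A sketch of that assumption:

import numpy as np

def cost(X, Theta, Y, lamb, R):
    # Squared error on observed entries (R == 1) of the factorization Y ~ X @ Theta.T,
    # plus an L2 penalty on both factor matrices.
    err = (X @ Theta.T - Y) * R
    return 0.5 * np.sum(err ** 2) + lamb / 2 * (np.sum(X ** 2) + np.sum(Theta ** 2))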
# insert 1 in every row for intercept b
X.insert(loc=len(X.columns), column='intercept', value=1)

# split data into train and test set
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# train
print("Training started...")
W, lossHistory = sgd(X_train, y_train, learning_rate, regularization_strength,
                     max_epochs)

print("Training finished.")

# testing
print("Testing...")
y_train_predicted = np.array([])
for i in range(X_train.shape[0]):
    yp = np.sign(np.dot(X_train[i], W))
    y_train_predicted = np.append(y_train_predicted, yp)

y_test_predicted = np.array([])
for i in range(X_test.shape[0]):
    yp = np.sign(np.dot(X_test[i], W))
    y_test_predicted = np.append(y_test_predicted, yp)
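
The `sgd` routine trained here returns a weight vector and a loss history for a linear SVM; given that predictions use np.sign and the intercept was folded in as a constant feature, it is presumably hinge-loss SGD. A rough sketch under those assumptions (hypothetical, with regularization_strength used as the hinge-term weight C):

import numpy as np

def sgd(features, labels, learning_rate, regularization_strength, max_epochs):
    labels = np.asarray(labels)  # labels are assumed to be +/-1
    weights = np.zeros(features.shape[1])
    loss_history = []
    for _ in range(max_epochs):
        for i in np.random.permutation(features.shape[0]):
            margin = labels[i] * np.dot(features[i], weights)
            if margin >= 1:
                grad = weights  # only the L2 regularizer contributes
            else:
                grad = weights - regularization_strength * labels[i] * features[i]
            weights = weights - learning_rate * grad
        hinge = np.maximum(0, 1 - labels * (features @ weights))
        loss_history.append(0.5 * np.dot(weights, weights)
                            + regularization_strength * hinge.mean())
    return weights, loss_history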
Example #13
def net(X):
    X = X.reshape((-1, 784))
    H = relu(nd.dot(X, W1) + b1)
    H1 = relu(nd.dot(H, W2) + b2)
    return nd.dot(H1, W3) + b3


#train
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
lr = 0.03
#trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':0.03})
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = 0., 0.
    for data, label in train_data:
        with ag.record():
            out = net(data)
            loss = loss_func(out, label)
        loss.backward()
        #trainer.step(batch_size)
        utils.sgd(params, lr, batch_size)

        label = label.astype('float32')
        train_loss += nd.mean(loss).asscalar()
        train_acc += nd.mean(out.argmax(axis=1) == label).asscalar()

    test_acc = utils.evaluate_accuracy(test_data, net)
    print('epoch %d. Loss: %f, Train acc %f, Test acc %f' %
          (epoch, train_loss / len(train_data), train_acc / len(train_data),
           test_acc))
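
`utils.evaluate_accuracy` is used but not defined in this excerpt. A minimal MXNet sketch of the assumed behavior: the fraction of correctly classified examples over the whole test iterator.

from mxnet import nd

def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for data, label in data_iter:
        output = net(data)
        acc_sum += (output.argmax(axis=1) == label.astype('float32')).sum().asscalar()
        n += label.size
    return acc_sum / n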
Example #14
# print(softmax(x).sum(axis=1))


def cross_entropy(yhat, y):
    return -nd.pick(nd.log(yhat), y)


def net(X):
    return softmax(nd.dot(X.reshape((-1, num_inputs)), W) + b)


epochs = 10
base_rate = 0.001

for epoch in range(epochs):
    train_loss = .0
    train_acc = .0
    for data, label in train_iter:
        with autograd.record():
            output = net(data)
            loss = cross_entropy(output, label)
        loss.backward()
        learning_rate = base_rate / (epoch + 1)
        sgd(params, learning_rate)
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)
    test_acc = evaluate_accuracy(test_iter, net)
    print('Epoch %d. Loss: %f, Train acc: %f, Test acc:%f' %
          (epoch, train_loss / len(train_iter), train_acc / len(train_iter),
           test_acc))
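
This example assumes `softmax`, `accuracy`, and `evaluate_accuracy` helpers defined earlier in the same script. A minimal sketch of the first two (assumed; note the naive exponentiation can overflow for large logits):

from mxnet import nd

def softmax(X):
    # Row-wise softmax over the class dimension.
    exp = nd.exp(X)
    return exp / exp.sum(axis=1, keepdims=True)

def accuracy(output, label):
    return nd.mean(output.argmax(axis=1) == label.astype('float32')).asscalar()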
Example #15
def relu(X):
    return nd.maximum(X, 0)


def net(data):
    h1 = nd.dot(data.reshape((-1, num_inputs)), w1) + b1
    h1 = relu(h1)
    output = nd.dot(h1, w2) + b2
    return output


learning_rate = 0.1

softmax_cross_loss = gluon.loss.SoftmaxCrossEntropyLoss()

epochs = 5
for epoch in range(epochs):
    total_loss = .0
    total_acc = .0
    for data, label in train_iter:
        with autograd.record():
            output = net(data)
            loss = softmax_cross_loss(output, label)
        loss.backward()
        sgd(params, learning_rate / batch_size)

        total_loss += nd.mean(loss).asscalar()
        total_acc += accuracy(output, label)
    test_acc = evaluate_accuracy(test_iter, net)
    print('Epoch %d, Train Loss: %f, Train Acc: %f, Test Acc: %f' %
          (epoch, total_loss / len(train_iter), total_acc / len(train_iter),
           test_acc))
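
The `params`, weights, and biases used here come from setup code that was cut off. A plausible sketch, assuming a single 256-unit hidden layer over flattened 28x28 Fashion-MNIST images (shapes and batch size are assumptions, not taken from the source):

from mxnet import nd

num_inputs, num_hiddens, num_outputs = 784, 256, 10
batch_size = 256  # assumed minibatch size
w1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens))
b1 = nd.zeros(num_hiddens)
w2 = nd.random.normal(scale=0.01, shape=(num_hiddens, num_outputs))
b2 = nd.zeros(num_outputs)
params = [w1, b1, w2, b2]
for param in params:
    param.attach_grad()  # allocate gradient buffers for autograd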