def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train an RNN model and predict the next item in the sequence."""
    # Pick the sampling scheme according to is_random_iter.
    data_iter_fn = data_iter_random if is_random_iter else data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state once per epoch.
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before every minibatch.
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise detach the hidden state from the previous computation graph.
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is a list of num_steps matrices of shape (batch_size, vocab_size).
                (outputs, state) = rnn(inputs, state, params)
                # After concatenation the shape is (num_steps * batch_size, vocab_size).
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and flatten it so the
                # labels line up with the rows of outputs.
                y = Y.T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            sgd(params, lr, 1)  # The loss is already averaged, so no extra scaling here.
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
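# The loop above assumes to_onehot, grad_clipping and sgd helpers that are not
# defined in this section. A minimal sketch consistent with how they are called
# (signatures and bodies are assumptions, not the original definitions):
from mxnet import nd

def to_onehot(X, size):
    # X: (batch_size, num_steps) of token ids -> list of num_steps
    # (batch_size, size) one-hot matrices, one per time step.
    return [nd.one_hot(x, size) for x in X.T]

def grad_clipping(params, theta, ctx):
    # Rescale all gradients so their global L2 norm is at most theta.
    norm = nd.array([0.0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

def sgd(params, lr, batch_size):
    # Plain minibatch SGD update on parameters with attached gradients.
    for param in params:
        param[:] = param - lr * param.grad / batch_size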
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state, num_hiddens,
                vocab_size, ctx, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # Use the output of the previous time step as the current input.
        X = ld.to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        # Compute the output and update the hidden state.
        (Y, state) = rnn(X, state, params)
        # The next input is the next character of the prefix while it lasts,
        # then the current best prediction.
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])
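# predict_rnn and the training loops expect rnn(inputs, state, params),
# init_rnn_state(...) and get_params() with the calling convention used above.
# A minimal sketch of a compatible vanilla RNN cell, assuming params holds
# (W_xh, W_hh, b_h, W_hq, b_q) and that ctx, vocab_size and num_hiddens are
# defined globally; the real model may differ:
from mxnet import nd

def get_params():
    def _normal(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
    W_xh = _normal((vocab_size, num_hiddens))
    W_hh = _normal((num_hiddens, num_hiddens))
    b_h = nd.zeros(num_hiddens, ctx=ctx)
    W_hq = _normal((num_hiddens, vocab_size))
    b_q = nd.zeros(vocab_size, ctx=ctx)
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.attach_grad()
    return params

def init_rnn_state(batch_size, num_hiddens, ctx):
    # A single hidden-state tensor, wrapped in a tuple.
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx),)

def rnn(inputs, state, params):
    # inputs and the returned outputs are lists of (batch_size, vocab_size) matrices.
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        outputs.append(nd.dot(H, W_hq) + b_q)
    return outputs, (H,)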
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state at the start of the epoch.
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, initialize the hidden state before each minibatch update.
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise use detach to separate the hidden state from the computation graph.
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is a list of num_steps matrices of shape (batch_size, vocab_size).
                (outputs, state) = rnn(inputs, state, params)
                # After concatenation the shape is (num_steps * batch_size, vocab_size).
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and flatten it into a
                # vector of length batch_size * num_steps so it matches the output rows.
                y = Y.T.reshape((-1,))
                # Average classification error from the cross-entropy loss.
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # Clip the gradients.
            d2l.sgd(params, lr, 1)  # The loss is already averaged, so no further scaling.
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
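# d2l.data_iter_random and d2l.data_iter_consecutive come from the d2l package.
# A sketch of the consecutive-sampling variant, consistent with how the iterator
# is consumed above (corpus_indices is assumed to be a flat list of token ids);
# the random variant instead yields non-overlapping windows in shuffled order:
from mxnet import nd

def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size * batch_len].reshape((batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y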
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state, num_hiddens,
                vocab_size, ctx, idx_to_char, char_to_idx):
    """Predict the next num_chars items with an RNN model."""
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_idx[int(prefix[0])]]
    for t in range(num_chars + len(prefix) - 1):
        # Use the previous output as the current input.
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        (Y, state) = rnn(X, state, params)
        # Feed the next prefix token while it lasts, then the best prediction.
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join(str(idx_to_char[i]) for i in output)
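# Both predict_rnn variants rely on a vocabulary built elsewhere. A small
# illustration of how such mappings could be constructed from a raw character
# corpus (the corpus string here is made up):
corpus_chars = 'want to have fun want to fly'
idx_to_char = list(set(corpus_chars))
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
vocab_size = len(char_to_idx)
corpus_indices = [char_to_idx[char] for char in corpus_chars]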
rnn = BiRNN(hidden_size, num_layers, num_classes)
rnn.initialize(ctx=ctx)

# Loss and optimizer
criterion = gluon.loss.SoftmaxCrossEntropyLoss()
optimizer = gluon.Trainer(rnn.collect_params(), 'adam',
                          {'learning_rate': learning_rate})

# Train the model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.astype('float32').reshape(
            (-1, sequence_length, input_size)) / 255
        images, labels = images.as_in_context(ctx), labels.as_in_context(ctx)
        # Forward + backward + optimize
        with autograd.record():
            outputs = rnn(images)
            loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step(batch_size)

        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1,
                     len(train_dataset) // batch_size, loss.sum().asscalar()))

# Test the model
total, correct = 0, 0
for images, labels in test_loader:
    images = images.astype('float32').reshape(
        (-1, sequence_length, input_size)) / 255
    images, labels = images.as_in_context(ctx), labels.as_in_context(ctx)
    outputs = rnn(images)
    predict = outputs.argmax(axis=1).astype('int32')
    # Tally accuracy over the test set.
    correct += (predict == labels.astype('int32')).sum().asscalar()
    total += labels.size
print('Test accuracy: %.4f' % (correct / total))
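# BiRNN is instantiated above but not defined in this section. A sketch of a
# bidirectional LSTM classifier that matches the rnn(images) call, assuming
# the constructor signature BiRNN(hidden_size, num_layers, num_classes):
from mxnet import gluon
from mxnet.gluon import nn, rnn as grnn

class BiRNN(gluon.Block):
    def __init__(self, hidden_size, num_layers, num_classes, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = grnn.LSTM(hidden_size, num_layers,
                                  layout='NTC', bidirectional=True)
            self.fc = nn.Dense(num_classes)

    def forward(self, x):
        # x: (batch_size, sequence_length, input_size)
        out = self.lstm(x)              # (batch_size, sequence_length, 2 * hidden_size)
        return self.fc(out[:, -1, :])   # classify from the last time step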
'learning_rate': 0.005, "wd": 0.001})  # tail of the Trainer/optimizer configuration

dataLoader = DataLoader(a, b)
for epoch in range(500):
    total_L = 0.0
    hidden = rnn.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=mx.cpu())
    for data, label in dataLoader.dataIter(batch_size):
        label = nd.array(label)
        # Reshape to (batch_size, 5, 11), then swap to (num_steps, batch_size, features).
        dd = nd.array(data.reshape((batch_size, 5, 11)).swapaxes(0, 1))
        hidden = detach(hidden)
        with mx.autograd.record():
            output, hidden = rnn(dd, hidden)
            output = output.reshape((5, batch_size, 2))
            # Average the per-step outputs over the 5 time steps.
            output = nd.sum(output, axis=0) / 5
            lv = loss(output, label)
        lv.backward()
        grads = [i.grad() for i in rnn.collect_params().values()]
        mx.gluon.utils.clip_global_norm(grads, clipping_norm * num_steps * batch_size)
        trainer.step(batch_size)
        total_L += mx.nd.sum(lv).asscalar()
    test_loss = evals(rnn, c, d, batch_size)
    print("Epoch %d loss %.4f test loss %.4f train acc %.4f test acc %.4f"
          % (epoch, total_L / len(a), test_loss, predict(rnn, a, b), predict(rnn, c, d)))
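# The loop above calls detach(hidden) to truncate backpropagation through time.
# A plausible helper (an assumption; the original definition is not shown):
def detach(state):
    # Detach the recurrent state from the previous minibatch's graph so
    # gradients do not flow across batch boundaries.
    if isinstance(state, (tuple, list)):
        return [s.detach() for s in state]
    return state.detach()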
dataLoader = DataLoader(a, b)
trl = []
tel = []
for epoch in range(500):
    total_L = 0.0
    hidden = rnn.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=mx.cpu())
    for data, label in dataLoader.dataIter(batch_size):
        label = nd.array(label)
        dd = nd.array(data.reshape((batch_size, 5, 11)).swapaxes(0, 1))
        hidden = detach(hidden)
        with mx.autograd.record():
            output, hidden = rnn(dd, hidden)
            output = output.reshape((5, batch_size, 1))
            # Average the per-step outputs over the 5 time steps.
            output = nd.sum(output, axis=0) / 5
            lv = loss(output, label)
        lv.backward()
        grads = [i.grad() for i in rnn.collect_params().values()]
        mx.gluon.utils.clip_global_norm(grads, clipping_norm * num_steps * batch_size)
        trainer.step(batch_size)
        total_L += mx.nd.sum(lv).asscalar()
    test_loss = evals(rnn, c, d, batch_size)
    trl.append(total_L / len(a))
    tel.append(test_loss)
    print("Epoch %d loss %.4f test loss %.4f train acc %.4f test acc %.4f"
          % (epoch, total_L / len(a), test_loss, predict(rnn, a, b), predict(rnn, c, d)))

with open("rnn.csv", 'w', newline='') as f:
    import csv
    # Assumption: dump the recorded train/test loss curves, one row per epoch.
    writer = csv.writer(f)
    writer.writerows(zip(trl, tel))
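# DataLoader(a, b) with a dataIter(batch_size) generator is assumed by both
# training loops above. A minimal sketch of such a class (names and behaviour
# are assumptions, not the original implementation):
import numpy as np

class DataLoader(object):
    def __init__(self, data, labels):
        self.data = np.asarray(data)
        self.labels = np.asarray(labels)

    def dataIter(self, batch_size):
        # Yield shuffled (data, label) minibatches, dropping the last partial batch.
        idx = np.random.permutation(len(self.data))
        for i in range(0, len(idx) - batch_size + 1, batch_size):
            j = idx[i: i + batch_size]
            yield self.data[j], self.labels[j]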