def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """
    Train the model.
    :return: per-epoch training and test losses
    """
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)
    # use the Adam optimization algorithm here (alternatives: adam, sgd)
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate,
        'wd': weight_decay
    })
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

def get_data_set():
    corpus = pickle.load(open('sampled_chat_data.pkl', 'rb'))
    max_seq_len = 30
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    MBTI = []
    lines = corpus
    for line in lines:
        in_seq, out_seq, MBTI_cur = line[0], line[1], line[2]
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue  # skip this sample if it would exceed max_seq_len once EOS is appended
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
        MBTI.append(MBTI_cur)
    MBTI = nd.array(MBTI)
    all_vocab = build_vocab(in_tokens + out_tokens)
    pickle.dump(all_vocab, open('all_vocab.pkl', 'wb'))
    in_data = build_data(in_seqs, all_vocab)
    out_data = build_data(out_seqs, all_vocab)
    dataset = gdata.ArrayDataset(in_data, out_data, MBTI)
    print('process success')
    return all_vocab, dataset

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """
    Train the model.
    :param net: the model
    :param train_features: training features
    :param train_labels: training labels
    :param test_features: test features
    :param test_labels: test labels
    :param num_epochs: number of training epochs
    :param learning_rate: learning rate
    :param weight_decay: weight-decay hyperparameter
    :param batch_size: mini-batch size
    :return: per-epoch training and test log-RMSE lists
    """
    # training loss and test loss per epoch
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)
    # use the Adam optimization algorithm
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

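# Usage sketch (not from the original source): a minimal way to call the train() function
# above on synthetic data. The snippet assumes module-level `loss` and `log_rmse` helpers;
# the definitions below are one plausible choice (L2 loss and the log-RMSE metric used in
# the d2l house-price example) and are assumptions, not the author's originals. The imports
# also cover the gdata/gluon/autograd names that train() itself expects at module level.
from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss, nn

loss = gloss.L2Loss()

def log_rmse(net, features, labels):
    # clip predictions to >= 1 so that the logarithm is well defined
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()

net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize()
X = nd.random.normal(shape=(100, 8))
y = nd.abs(nd.random.normal(shape=(100, 1))) + 1  # positive targets for log-RMSE
train_ls, test_ls = train(net, X, y, X, y, num_epochs=5, learning_rate=0.01,
                          weight_decay=0.1, batch_size=16)
print(train_ls[-1], test_ls[-1])
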
def get_train_base():
    x_0 = pickle.load(open('../../train/x_0.pkl', 'rb'))
    x = np.array(x_0[0]).astype(np.float32)
    y = np.array(x_0[1]).astype(np.float32)
    ctx = mx.cpu()
    batch_size = 512
    random.seed(47)
    train_iter = gdata.DataLoader(gdata.ArrayDataset(x, y), batch_size, shuffle=False)
    net = SPP_CNN()
    net.load_parameters('../model/model.params')
    print('Build Net Success!')
    # Predict for train
    for i, (data, label) in enumerate(train_iter):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        if i == 0:
            X = net(data).asnumpy()
            Y = label.asnumpy()
        else:
            X = np.concatenate((X, net(data).asnumpy()))
            Y = np.concatenate((Y, label.asnumpy()))
    pickle.dump([X, Y], open('../../train/train_base.pkl', 'wb'))

def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    batch_size = min(10, train_labels.shape[0])
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # parameters created by the network itself do not need attach_grad()
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        # after each epoch, use the current weights and bias to compute the loss
        train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())
    print('final epoch : train_loss', train_ls[-1], 'test_loss', test_ls[-1])
    # plot how the training and test loss change as the number of epochs grows
    semilogy(range(1, num_epochs + 1), train_ls, 'epoch', 'loss',
             range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('weight:', net[0].weight.data().asnumpy(),
          '\nbias:', net[0].bias.data().asnumpy())

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)

def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    batch_size = min(10, train_labels.shape[0])
    # data iterator
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)
    # trainer: optimizer and learning rate
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # compute the loss
                l = loss(net(X), y)
            # backpropagation
            l.backward()
            # update parameters, normalizing gradients by the batch size
            trainer.step(batch_size)
        train_ls.append(loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())
    print('final epoch: train loss', train_ls[-1], 'test loss', test_ls[-1])
    semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
             range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('weights:', net[0].weight.data().asnumpy(),
          '\nbias:', net[0].bias.data().asnumpy())

def method1():
    num_inputs = 2
    num_examples = 1000
    true_w = [2, -3.4]
    true_b = 4.2
    features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
    labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
    labels += nd.random.normal(scale=0.01, shape=labels.shape)
    batch_size = 10
    dataset = gdata.ArrayDataset(features, labels)
    data_iters = gdata.DataLoader(dataset, batch_size, shuffle=True)
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03})
    num_epochs = 3
    for epoch in range(1, num_epochs + 1):
        for x, y in data_iters:
            with autograd.record():
                l = loss(net(x), y)
            l.backward()
            trainer.step(batch_size)
        l = loss(net(features), labels)
        print('epoch: %d, loss: %f' % (epoch, l.mean().asscalar()))
    dense = net[0]
    print(dense.weight.data())
    print(dense.bias.data())

def train_gluon_ch7(trainer_name, trainer_hyperparams, features, labels,
                    batch_size=10, num_epochs=2):
    """Train a linear regression model with a given Gluon trainer."""
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()

    def eval_loss():
        return loss(net(features), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), trainer_name, trainer_hyperparams)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
        print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')

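# Usage sketch (an assumption, not from the source): driving train_gluon_ch7 above with the
# built-in 'sgd' optimizer on synthetic linear-regression data. The snippet relies on
# d2l-style globals (plt, set_figsize, np, time and the mxnet imports); minimal stand-ins
# and the assumed imports are provided here so the pair can run together.
import time
import numpy as np
from matplotlib import pyplot as plt
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn

def set_figsize(figsize=(3.5, 2.5)):
    # stand-in for the d2l helper of the same name
    plt.rcParams['figure.figsize'] = figsize

features = nd.random.normal(shape=(1500, 5))
labels = nd.dot(features, nd.ones((5, 1))) + 0.01 * nd.random.normal(shape=(1500, 1))
train_gluon_ch7('sgd', {'learning_rate': 0.05}, features, labels,
                batch_size=10, num_epochs=2)
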
def train_ch9(trainer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    """Train a linear regression model."""
    net, loss = linreg, squared_loss
    w = nd.random.normal(scale=0.01, shape=(features.shape[1], 1))
    b = nd.zeros(1)
    w.attach_grad()
    b.attach_grad()

    def eval_loss():
        return loss(net(features, w, b), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(gdata.ArrayDataset(features, labels),
                                 batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X, w, b), y).mean()
            l.backward()
            trainer_fn([w, b], states, hyperparams)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
        print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),
                                  batch_size, shuffle=True)
    # use the Adam optimization algorithm
    trainer = gluon.Trainer(
        net.collect_params(),
        "adam",
        {"learning_rate": learning_rate, "wd": weight_decay},
    )
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

def read_data(max_seq_len):
    """
    Read a small French-English dataset.
    :param max_seq_len: maximum sequence length (including EOS)
    :return: input vocab, output vocab, and the paired ArrayDataset
    """
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    local_file = WORKING_PATH + r"\NLP_data\fr-en-small.txt"
    with io.open(local_file) as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.strip().split("\t")
        in_seq_tokens, out_seq_tokens = in_seq.split(" "), out_seq.split(" ")
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)

def optimize_with_trainer(trainer, features, labels, net, decay_epoch=None,
                          batch_size=10, log_interval=10, num_epochs=3):
    """Optimize an objective function with a Gluon trainer."""
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    loss = gloss.L2Loss()
    ls = [loss(net(features), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
        # Decay the learning rate.
        if decay_epoch and epoch > decay_epoch:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            if batch_i * batch_size % log_interval == 0:
                ls.append(loss(net(features), labels).mean().asnumpy())
    print('w[0]=%.2f, w[1]=%.2f, b=%.2f'
          % (net[0].weight.data()[0][0].asscalar(),
             net[0].weight.data()[0][1].asscalar(),
             net[0].bias.data().asscalar()))
    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
    semilogy(es, ls, 'epoch', 'loss')

def load_data_nmt(batch_size, max_len, num_examples=1000):
    """Download an NMT dataset, return its vocabulary and data iterator."""
    # Download and preprocess
    def preprocess_raw(text):
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        out = ''
        for i, char in enumerate(text.lower()):
            if char in (',', '!', '.') and text[i - 1] != ' ':
                out += ' '
            out += char
        return out

    fname = gutils.download('http://www.manythings.org/anki/fra-eng.zip')
    with zipfile.ZipFile(fname, 'r') as f:
        raw_text = f.read('fra.txt').decode("utf-8")
    text = preprocess_raw(raw_text)

    # Tokenize
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    # Build vocab
    def build_vocab(tokens):
        tokens = [token for line in tokens for token in line]
        return Vocab(tokens, min_freq=3, use_special_tokens=True)

    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)

    # Convert to index arrays
    def pad(line, max_len, padding_token):
        if len(line) > max_len:
            return line[:max_len]
        return line + [padding_token] * (max_len - len(line))

    def build_array(lines, vocab, max_len, is_source):
        lines = [vocab[line] for line in lines]
        if not is_source:
            lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
        array = nd.array([pad(line, max_len, vocab.pad) for line in lines])
        valid_len = (array != vocab.pad).sum(axis=1)
        return array, valid_len

    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)

    # Construct data iterator
    train_set = gdata.ArrayDataset(src_array, src_valid_len,
                                   tgt_array, tgt_valid_len)
    train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter

def __init__(self, dataset: mxdata.Dataset, num_cls=None, test_ratio=0.2,
             batch_size=None):
    super().__init__(batch_size)
    self.test_ratio = test_ratio
    self.test_count = int(len(dataset) * test_ratio)
    testdata = dataset[0:self.test_count]
    traindata = dataset[self.test_count:]
    # build the train and test datasets
    self.trainset = mxdata.ArrayDataset(*traindata)
    self.testset = mxdata.ArrayDataset(*testdata)
    # if num_cls is None, infer it automatically
    if num_cls is None:
        self.num_cls = len(testdata[1])
    else:
        self.num_cls = num_cls

def fit(self, train_x, train_y, optimizer, hyper_params, batch_size=64,
        epochs=10, test_x=None, test_y=None):
    """Fit the model on x, y."""
    train_data_set = gdata.ArrayDataset(train_x, train_y)
    train_iter = gdata.DataLoader(train_data_set, batch_size=batch_size, shuffle=True)
    self.net = nn.Sequential()
    self.net.add(nn.Dense(1))
    self.net.initialize()
    if optimizer != "sgd":
        raise ValueError("only support sgd optimizer")
    if self.regularization == "l2":
        # apply weight decay to the weights only, not to the bias
        weight_decay_params = dict(hyper_params)
        weight_decay_params["wd"] = 3
        self.trainers = [
            gluon.Trainer(self.net.collect_params(".*weight"), "sgd",
                          weight_decay_params),
            gluon.Trainer(self.net.collect_params(".*bias"), "sgd", hyper_params)
        ]
    else:
        self.trainers = [
            gluon.Trainer(self.net.collect_params(), "sgd", hyper_params)
        ]
    train_loss, test_loss = [], []
    for epoch in range(epochs):
        for x, y in train_iter:
            self._train(x, y, optimizer, hyper_params)
        train_loss.append(self.evaluate(train_x, train_y))
        test_loss.append(self.evaluate(test_x, test_y))
    logger.info("final epoch train loss: {}, test loss: {}".format(
        train_loss[-1], test_loss[-1]))
    display_utils.semilogy(range(1, epochs + 1), train_loss, 'epochs', 'loss',
                           x2_vals=range(1, epochs + 1), y2_vals=test_loss,
                           legend=["train", "test"])
    logger.info("weight: {}\nbias: {}".format(
        self.net[0].weight.data().asnumpy(),
        self.net[0].bias.data().asnumpy()))

def __init__(self, true_w, true_b, num_inputs: int, num_examples: int,
             batch_size: int):
    super(MxDataLoader, self).__init__(true_w, true_b, num_inputs,
                                       num_examples, batch_size)
    self.dataset = gdata.ArrayDataset(self.features, self.labels)
    self.data_iter = gdata.DataLoader(self.dataset, batch_size, shuffle=True)

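# Minimal self-contained sketch (synthetic data, an assumption) of the
# ArrayDataset + DataLoader pattern that the constructor above sets up; the
# MxDataLoader base class itself is not part of this collection.
from mxnet import nd
from mxnet.gluon import data as gdata

features = nd.random.normal(shape=(20, 3))
labels = nd.random.normal(shape=(20, 1))
dataset = gdata.ArrayDataset(features, labels)
data_iter = gdata.DataLoader(dataset, batch_size=4, shuffle=True)
for X, y in data_iter:
    print(X.shape, y.shape)  # (4, 3) (4, 1) for each mini-batch
    break
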
def load_data_imdb(batch_size, max_len=500):
    """Download the IMDB dataset, return the vocabulary and iterators."""
    data_dir = '../data'
    url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    fname = gutils.download(url, data_dir)
    with tarfile.open(fname, 'r') as f:
        f.extractall(data_dir)

    def read_imdb(folder='train'):
        data, labels = [], []
        for label in ['pos', 'neg']:
            folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
            for file in os.listdir(folder_name):
                with open(os.path.join(folder_name, file), 'rb') as f:
                    review = f.read().decode('utf-8').replace('\n', '')
                    data.append(review)
                    labels.append(1 if label == 'pos' else 0)
        return data, labels

    train_data, test_data = read_imdb('train'), read_imdb('test')

    def tokenize(sentences):
        return [line.split(' ') for line in sentences]

    train_tokens = tokenize(train_data[0])
    test_tokens = tokenize(test_data[0])
    vocab = Vocab([tk for line in train_tokens for tk in line], min_freq=5)

    def pad(x):
        return x[:max_len] if len(x) > max_len else x + [vocab.unk] * (max_len - len(x))

    train_features = nd.array([pad(vocab[line]) for line in train_tokens])
    test_features = nd.array([pad(vocab[line]) for line in test_tokens])
    train_set = gdata.ArrayDataset(train_features, train_data[1])
    test_set = gdata.ArrayDataset(test_features, test_data[1])
    train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
    test_iter = gdata.DataLoader(test_set, batch_size)
    return vocab, train_iter, test_iter

def predict(net, feature_values):
    train_iter = gdata.DataLoader(gdata.ArrayDataset(feature_values),
                                  batch_size=net.batch_size, shuffle=False)
    # feature_values come from pd.read_csv with the name column dropped
    result = []
    for X in train_iter:
        X = X.as_in_context(net.ctx)
        target = net.forward(X)
        result.append(target.reshape((-1,)).asnumpy().tolist())
    result = [num for l in result for num in l]
    return result

def evaluate_accuracy_fold(net, features, labels, batch_size, ctx):
    acc_sum, n = 0, 0
    # create a data loader to prevent the GPU from running out of memory
    eval_iter = gdata.DataLoader(gdata.ArrayDataset(features, labels),
                                 batch_size, shuffle=True)
    for X, y in eval_iter:
        X, y = X.as_in_context(ctx), y.as_in_context(ctx).astype('float32')
        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
    return acc_sum / n

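# Usage sketch for evaluate_accuracy_fold above; the toy classifier and random data are
# assumptions made only for illustration. The imports cover what the helper itself needs
# (gdata) plus what this sketch uses.
import mxnet as mx
from mxnet import nd
from mxnet.gluon import data as gdata, nn

net = nn.Sequential()
net.add(nn.Dense(3))          # toy 3-class classifier
net.initialize()
features = nd.random.normal(shape=(32, 10))
labels = nd.random.randint(0, 3, shape=(32,))
print(evaluate_accuracy_fold(net, features, labels, batch_size=8, ctx=mx.cpu()))
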
def get_data_iter(features, labels, batch):
    """
    Build a data iterator over the given features and labels.
    :param features: feature array
    :param labels: label array
    :param batch: batch size
    :return: a DataLoader over the wrapped ArrayDataset
    """
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch)
    return data_iter

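# Usage sketch (synthetic NDArrays, an assumption): wrapping arrays with get_data_iter
# above and walking the resulting mini-batches. get_data_iter itself expects `gdata`
# (mxnet.gluon.data) to be imported in its module, as in the other snippets here.
from mxnet import nd
from mxnet.gluon import data as gdata

X = nd.arange(24).reshape((12, 2))
y = nd.arange(12)
for batch_x, batch_y in get_data_iter(X, y, batch=4):
    print(batch_x.shape, batch_y.shape)  # (4, 2) (4,)
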
def main(column, DIM_NUM):
    Params = RNNParameter(column, DIM_NUM)
    Params.load_dir()
    num_outputs = Params.num_outputs
    lr = Params.lr
    num_epochs = Params.num_epochs
    batch_size = Params.batch_size
    embed_size = DIM_NUM
    num_hiddens = Params.num_hiddens
    num_layers = Params.num_layers
    bidirectional = Params.bidirectional
    ctx = utils.try_all_gpus()
    csvfile = Params.train_file
    vocab = utils.read_vocab(Params.vocab_file)
    glove_embedding = text.embedding.CustomEmbedding(
        pretrained_file_path=Params.embedding_file, vocabulary=vocab)
    net = utils.BiRNN(vocab, embed_size, num_hiddens, num_layers,
                      bidirectional, num_outputs)
    net.initialize(init.Xavier(), ctx=ctx)
    # set the weight of the embedding layer to the pretrained word vectors
    net.embedding.weight.set_data(glove_embedding.idx_to_vec)
    # do not update the word vectors (the parameters of net.embedding) during training
    net.embedding.collect_params().setattr('grad_req', 'null')
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    trainSet, valSet = utils.select_sample_by_class(csvfile, ratio=0.85)
    train_features, test_features, train_labels, test_labels = utils.read_dg_data(
        trainSet, valSet, vocab, column, MAX_LEN=2500)
    train_set = gdata.ArrayDataset(train_features, train_labels)  # training set
    test_set = gdata.ArrayDataset(test_features, test_labels)  # test set
    train_loader = gdata.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = gdata.DataLoader(test_set, batch_size=batch_size, shuffle=False)
    logging.info("Start training the RNN text classifier for {}".format(column))
    best_acc = utils.train(train_loader, test_loader, net, loss, trainer, ctx,
                           num_epochs, column, Params.best_param_file)
    logging.info("Training done, best model acc: {:.4f}. Start testing.".format(best_acc))
    net.load_parameters(Params.best_param_file, ctx=ctx)
    f1 = utils.evaluate_valset(net, valSet, vocab, column)
    logging.info("RNN f1_score on the validation set: {:.4f}".format(f1))
    logging.info("Running prediction on the test data")
    textSet = pd.read_csv(os.path.join(Params.data_dir, 'test_set.csv'))
    y_probs = utils.predict_test_result(
        net, vocab, textSet, column, 'result/rnn_{}_{:.4f}.csv'.format(column, f1))
    logging.info("Saving probability file")
    utils.save_prob_file(y_probs, 'result/rnn_{}_{:.4f}_prob.csv'.format(column, f1))
    logging.info("Done, see the result directory.")

def get_iter_data(batch_size=64):
    """
    Build the training and test data iterators.
    :param batch_size: mini-batch size
    :return: vocabulary, training iterator, test iterator
    """
    # gb.download_imdb()
    # train_data, test_data = gb.read_imdb("train"), gb.read_imdb("test")
    train_data, test_data = read_imdb("train"), read_imdb("test")
    vocab = gb.get_vocab_imdb(train_data)
    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(*gb.preprocess_imdb(train_data, vocab)),
        batch_size=batch_size, shuffle=True)
    test_iter = gdata.DataLoader(
        gdata.ArrayDataset(*gb.preprocess_imdb(test_data, vocab)), batch_size)
    return vocab, train_iter, test_iter

def init(bsize):
    data, label = load("Kaggle.npz")
    # convert to spherical coordinates
    # norm = np.sqrt(np.sum(data**2, axis=1, keepdims=True))
    # ag = data / norm
    # data = np.concatenate([data, norm, ag], axis=1)
    # use sin and cos features
    # data = np.concatenate([np.sin(data), np.cos(data)], axis=1)
    # under-sample to balance the classes
    cr = under_sampling.NearMiss()
    data, label = cr.fit_sample(data, label)
    # over-sample to balance the classes
    # ocr = over_sampling.ADASYN()
    # data, label = ocr.fit_sample(data, label)
    # shuffle
    idx = list(range(len(data)))
    random.shuffle(idx)
    data, label = data[idx], label[idx]
    # one-hot encode the labels
    olabel = np.zeros(shape=(len(label), 2))
    for i, l in enumerate(label):
        olabel[i][int(l - 1)] = 1
    # type conversion
    data = data.astype("float32")
    olabel = olabel.astype("float32")
    # train/test split
    train_sum = int(len(data) / 1.3)
    tdata, tlabel = data[:train_sum], olabel[:train_sum]
    test_data, test_label = data[train_sum:], olabel[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(tdata), nd.array(tlabel))
    test_set = mxdata.ArrayDataset(nd.array(test_data), nd.array(test_label))
    # data loaders
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)
    return train_loader, test_loader

def next(self):
    """Advance to the next cross-validation fold."""
    self.nowcount += 1
    if self.nowcount >= self.splitcount:
        print("All cross-validation folds have been used")
        return
    startpoint = self.nowcount * self.splen  # start of the test slice
    # the test set references the original arrays directly
    testar = (self.wholedata[startpoint:startpoint + self.splen],
              self.wholelabel[startpoint:startpoint + self.splen])  # slice used as the test set
    self.testset = mxdata.ArrayDataset(*testar)
    # build the training set: copy the data (the originals must not be modified),
    # overwrite the test slice with the first slice, then drop the first slice
    trainar = [self.wholedata.copy(), self.wholelabel.copy()]
    # framework quirk: slice assignment only takes effect after converting to a numpy array
    trainar[0][startpoint:startpoint + self.splen] = trainar[0][:self.splen].asnumpy()
    trainar[1][startpoint:startpoint + self.splen] = trainar[1][:self.splen].asnumpy()
    trainar[0] = trainar[0][self.splen:]
    trainar[1] = trainar[1][self.splen:]
    self.trainset = mxdata.ArrayDataset(*trainar)

def make_dataset(src_data, trg_data, src_vocab, trg_vocab):
    s_bos = src_vocab.to_indices(BOS)
    s_eos = src_vocab.to_indices(EOS)
    s_pad = src_vocab.to_indices(PAD)
    t_bos = trg_vocab.to_indices(BOS)
    t_eos = trg_vocab.to_indices(EOS)
    t_pad = trg_vocab.to_indices(PAD)
    t = gdata.ArrayDataset(src_data, trg_data)
    dataset = t.transform(
        lambda x, y: _transform(x, y, s_bos, s_eos, s_pad, t_bos, t_eos, t_pad),
        lazy=True)
    return dataset

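# Self-contained sketch of the lazy Dataset.transform pattern used by make_dataset above.
# The real _transform helper, the BOS/EOS/PAD symbols and the vocabularies are assumed by
# that snippet and are not shown here; this example just appends a made-up EOS id to each
# target sequence to illustrate per-sample, lazy transformation.
from mxnet import nd
from mxnet.gluon import data as gdata

src = nd.arange(12).reshape((6, 2))
trg = nd.arange(12, 24).reshape((6, 2))
pairs = gdata.ArrayDataset(src, trg)
eos_id = 99  # fictitious EOS index, for illustration only
transformed = pairs.transform(
    lambda x, y: (x, nd.concat(y, nd.array([eos_id]), dim=0)), lazy=True)
x0, y0 = transformed[0]
print(x0, y0)  # y0 now ends with the appended EOS id
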
def train(trainer_fn, states, hyperparams, features, labels, batch_size,
          num_epochs):
    # version 1
    # w = nd.random.normal(scale=0.01, shape=(features.shape[1], 1))
    # b = nd.zeros(1)
    # w.attach_grad()
    # b.attach_grad()

    # version 2
    w_ = gluon.Parameter("w", shape=(features.shape[1], 1))
    w_.initialize()
    w_.set_data(nd.random.normal(scale=0.01, shape=(features.shape[1], 1)))
    b_ = gluon.Parameter("b", shape=(1,))
    b_.initialize()
    b_.set_data(nd.zeros(1))
    net = gb.linreg
    loss = gb.squared_loss
    trainer = gluon.Trainer([w_, b_], "sgd", hyperparams)

    def eval_loss():
        # version 1
        # return loss(net(features, w, b), labels).mean().asscalar()
        # version 2
        return loss(net(features, w_.data(), b_.data()), labels).mean().asscalar()

    ls = [eval_loss()]
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        start = time.time()
        for i, (X, y) in enumerate(data_iter):
            with autograd.record():
                # version 1
                # l = loss(net(X, w, b), y).mean()
                # version 2
                l = loss(net(X, w_.data(), b_.data()), y).mean()
            l.backward()
            # version 1
            # trainer_fn([w, b], states, hyperparams)
            # version 2
            trainer.step(1)
        ls.append(eval_loss())
        print('epoch %d, loss: %f, %f sec per epoch'
              % (epoch + 1, ls[-1], time.time() - start))
    gb.set_figsize()
    gb.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    gb.plt.xlabel('epoch')
    gb.plt.ylabel('loss')

def train(self, datas, labels, batch_size=3, epoch=10):
    dataset = gdata.ArrayDataset(datas, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    for _ in range(epoch):
        for X, y in data_iter:
            with autograd.record():
                res = self.nn(X)
                l = self.loss(res, y)
            l.backward()
            self.trainer.step(batch_size)
    # report the loss on one batch after training
    for X, y in data_iter:
        l = self.loss(self.nn(X), y)
        print('loss', l)
        break

def read_data(max_seq_len):
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('./data/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue  # skip this sample if it would exceed max_seq_len once EOS is added
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)

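# Downstream usage sketch (an assumption based on the d2l machine-translation example):
# the ArrayDataset returned by read_data above is typically wrapped in a DataLoader like
# this. It presumes the fr-en-small.txt file and the process_one_seq/build_data helpers
# that the snippet relies on are available.
from mxnet.gluon import data as gdata

max_seq_len = 7
in_vocab, out_vocab, dataset = read_data(max_seq_len)
data_iter = gdata.DataLoader(dataset, batch_size=2, shuffle=True)
for X, Y in data_iter:
    print(X.shape, Y.shape)  # (batch, max_seq_len) each
    break
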
def read_data(max_seq_len):
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('C:/Users/mayao/Desktop/d2l-zh/data/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue  # skip this sample if it would exceed max_seq_len after adding EOS
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)
