Example #1
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """
    数据训练
    :return:
    """
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,
                                                     train_labels),
                                  batch_size,
                                  shuffle=True)
    # Use the Adam optimization algorithm here (options: 'adam', 'sgd')
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate,
        'wd': weight_decay
    })
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
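This excerpt depends on a `loss` object and a `log_rmse` helper defined elsewhere in the project. A minimal sketch of what they, together with the imports these Gluon examples assume, might look like (the exact originals may differ):

from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss

loss = gloss.L2Loss()  # squared loss

def log_rmse(net, features, labels):
    # RMSE between log(prediction) and log(label); predictions are clipped to
    # at least 1 so the logarithm stays well defined.
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()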
def get_data_set():
	corpus = pickle.load(open('sampled_chat_data.pkl', 'rb'))
	max_seq_len = 30
	# 'in' and 'out' are short for 'input' and 'output'
	in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
	MBTI = []
	lines = corpus
	for line in lines:
		in_seq, out_seq, MBTI_cur = line[0], line[1], line[2]
		in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
		if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
			continue  # skip this sample if it would exceed max_seq_len after appending EOS
		process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
		process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
		MBTI.append(MBTI_cur)
	MBTI = nd.array(MBTI)
	all_vocab = build_vocab(in_tokens + out_tokens)
	pickle.dump(all_vocab, open('all_vocab.pkl', 'wb'))
	in_data = build_data(in_seqs, all_vocab)
	out_data = build_data(out_seqs, all_vocab)
	dataset = gdata.ArrayDataset(in_data, out_data, MBTI)
	
	print('process success')
	return all_vocab, dataset
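The helpers `process_one_seq`, `build_vocab`, and `build_data` are not shown above. A sketch that is consistent with how they are called here; the special tokens and the use of `mxnet.contrib.text` are assumptions:

import collections
from mxnet import nd
from mxnet.contrib import text

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'  # assumed special tokens

def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    # Record tokens for vocabulary building, then append EOS and pad to max_seq_len.
    all_tokens.extend(seq_tokens)
    all_seqs.append(seq_tokens + [EOS] + [PAD] * (max_seq_len - len(seq_tokens) - 1))

def build_vocab(tokens):
    return text.vocab.Vocabulary(collections.Counter(tokens),
                                 reserved_tokens=[PAD, BOS, EOS])

def build_data(all_seqs, vocab):
    # Map each padded token sequence to a row of word indices.
    return nd.array([vocab.to_indices(seq) for seq in all_seqs])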
Example #3
def train(net, train_features, train_labels, test_features, test_labels, num_epochs,
          learning_rate, weight_decay, batch_size):
    """
    模型训练
    :param net:             模型
    :param train_features:  训练集
    :param train_labels:    训练标签
    :param test_features:   测试集
    :param test_labels:     测试标签
    :param num_epochs:      迭代次数
    :param learning_rate:   学习率
    :param weight_decay:    权重衰减超参数
    :param batch_size:      每次训练数据集大小
    :return:
    """
    # Training losses and test losses per epoch
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels), batch_size, shuffle=True)
    # Use the Adam optimization algorithm
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
Example #4
def get_train_base():

    x_0 = pickle.load(open('../../train/x_0.pkl', 'rb'))

    x = np.array(x_0[0]).astype(np.float32)
    y = np.array(x_0[1]).astype(np.float32)

    ctx = mx.cpu()
    batch_size = 512
    random.seed(47)

    train_iter = gdata.DataLoader(gdata.ArrayDataset(x, y),
                                  batch_size,
                                  shuffle=False)

    net = SPP_CNN()
    net.load_parameters('../model/model.params')
    print('Build Net Success!')

    # Predict for train
    for i, (data, label) in enumerate(train_iter):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)

        if i == 0:
            X = net(data).asnumpy()
            Y = label.asnumpy()
        else:
            X = np.concatenate((X, net(data).asnumpy()))
            Y = np.concatenate((Y, label.asnumpy()))

    pickle.dump([X, Y], open('../../train/train_base.pkl', 'wb'))
Example #5
def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    batch_size = min(10, train_labels.shape[0])
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,
                                                     train_labels),
                                  batch_size,
                                  shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.01})
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            # Parameters created by the network do not need attach_grad()
            l.backward()
            trainer.step(batch_size)
            # After each epoch, compute the loss with the current weights and bias
        train_ls.append(
            loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())
    print('final epoch : train_loss', train_ls[-1], 'test_loss', test_ls[-1])

    # Plot how the training loss and test loss change as the number of epochs grows
    semilogy(range(1, num_epochs + 1), train_ls, 'epoch', 'loss',
             range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('weight:', net[0].weight.data().asnumpy(), '\nbias:',
          net[0].bias.data().asnumpy())
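`num_epochs`, `loss`, and `semilogy` are globals defined outside this excerpt. A plausible sketch of the plotting helper, matching the call signature used above:

import matplotlib.pyplot as plt

def semilogy(x_vals, y_vals, x_label, y_label,
             x2_vals=None, y2_vals=None, legend=None, figsize=(3.5, 2.5)):
    # Plot y against x on a log-scaled y axis; optionally overlay a second curve.
    plt.rcParams['figure.figsize'] = figsize
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()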
Example #6
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,
                                                     train_labels),
                                  batch_size,
                                  shuffle=True)
Example #7
def fit_and_plot(train_features, test_features, train_labels, test_labels):
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    batch_size = min(10, train_labels.shape[0])

    # Define the data iterator
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,
                                                     train_labels),
                                  batch_size,
                                  shuffle=True)

    # Define the trainer: optimizer and learning rate
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.01})
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # Compute the loss
                l = loss(net(X), y)
            # Backpropagation
            l.backward()
            # Update parameters, scaling the step by batch_size
            trainer.step(batch_size)
        train_ls.append(
            loss(net(train_features), train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features), test_labels).mean().asscalar())
    print('final epoch: train loss', train_ls[-1], 'test loss', test_ls[-1])
    semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
             range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('weights:', net[0].weight.data().asnumpy(), '\nbias:',
          net[0].bias.data().asnumpy())
def method1():
    num_inputs = 2
    num_examples = 1000
    true_w = [2, -3.4]
    true_b = 4.2
    features = nd.random.normal(scale=1, shape=(num_examples, num_inputs))
    labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
    labels += nd.random.normal(scale=0.01, shape=labels.shape)
    batch_size = 10
    dataset = gdata.ArrayDataset(features, labels)
    data_iters = gdata.DataLoader(dataset, batch_size, shuffle=True)
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.03})
    num_epochs = 3
    for epoch in range(1, num_epochs + 1):
        for x, y in data_iters:
            with autograd.record():
                l = loss(net(x), y)
            l.backward()
            trainer.step(batch_size)
        l = loss(net(features), labels)
        print('epoch: %d, loss: %f' % (epoch, l.mean().asnumpy()))
    dense = net[0]
    print(dense.weight.data())
    print(dense.bias.data())
Example #9
def train_gluon_ch7(trainer_name, trainer_hyperparams, features, labels,
                    batch_size=10, num_epochs=2):
    """Train a linear regression model with a given Gluon trainer."""
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()

    def eval_loss():
        return loss(net(features), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(),
                            trainer_name, trainer_hyperparams)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
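A hypothetical call to `train_gluon_ch7` on synthetic linear-regression data; the data shape and hyperparameters below are made up for illustration, and `set_figsize` and `plt` are assumed to come from the book's utility imports:

from mxnet import nd

true_w = nd.array([2, -3.4, 1.0]).reshape((3, 1))
features = nd.random.normal(shape=(1500, 3))
labels = nd.dot(features, true_w) + 4.2
labels += nd.random.normal(scale=0.01, shape=labels.shape)
# Plain SGD through the Gluon Trainer interface.
train_gluon_ch7('sgd', {'learning_rate': 0.05}, features, labels,
                batch_size=10, num_epochs=2)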
Example #10
def train_ch9(trainer_fn,
              states,
              hyperparams,
              features,
              labels,
              batch_size=10,
              num_epochs=2):
    """Train a linear regression model."""
    net, loss = linreg, squared_loss
    w, b = nd.random.normal(scale=0.01,
                            shape=(features.shape[1], 1)), nd.zeros(1)
    w.attach_grad()
    b.attach_grad()

    def eval_loss():
        return loss(net(features, w, b), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(gdata.ArrayDataset(features, labels),
                                 batch_size,
                                 shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X, w, b), y).mean()
            l.backward()
            trainer_fn([w, b], states, hyperparams)
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
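`linreg` and `squared_loss` come from the book's utility code, and `trainer_fn` must accept `(params, states, hyperparams)`. A minimal SGD update with that signature might look like this; the `'lr'` key name is an assumption:

def sgd(params, states, hyperparams):
    # Plain minibatch SGD: `states` is unused, `hyperparams` carries the learning rate.
    for p in params:
        p[:] = p - hyperparams['lr'] * p.grad

# e.g. train_ch9(sgd, None, {'lr': 0.05}, features, labels)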
Example #11
def train(
    net,
    train_features,
    train_labels,
    test_features,
    test_labels,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features,
                                                     train_labels),
                                  batch_size,
                                  shuffle=True)
    # Use the Adam optimization algorithm
    trainer = gluon.Trainer(
        net.collect_params(),
        "adam",
        {
            "learning_rate": learning_rate,
            "wd": weight_decay
        },
    )
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
def read_data(max_seq_len):
    """
        使用一份小的 法语-英语 数据集
    :param max_seq_len:
    :return:
    """
    # 'in' and 'out' are short for 'input' and 'output'
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    local_file = WORKING_PATH + r"\NLP_data\fr-en-small.txt"
    with io.open(local_file) as f:
        lines = f.readlines()

    for line in lines:
        in_seq, out_seq = line.strip().split("\t")
        in_seq_tokens, out_seq_tokens = in_seq.split(" "), out_seq.split(" ")

        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)

    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)

    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)
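A hypothetical use of `read_data`, assuming the helpers above are in scope and the data file exists at the given path:

in_vocab, out_vocab, dataset = read_data(max_seq_len=7)
# Wrap the ArrayDataset in a DataLoader for minibatch iteration.
data_iter = gdata.DataLoader(dataset, batch_size=2, shuffle=True)
for X, Y in data_iter:
    print(X.shape, Y.shape)  # each row is a padded sequence of word indices
    break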
Example #13
def optimize_with_trainer(trainer,
                          features,
                          labels,
                          net,
                          decay_epoch=None,
                          batch_size=10,
                          log_interval=10,
                          num_epochs=3):
    """Optimize an objective function with a Gluon trainer."""
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    loss = gloss.L2Loss()
    ls = [loss(net(features), labels).mean().asnumpy()]
    for epoch in range(1, num_epochs + 1):
        # Decay the learning rate.
        if decay_epoch and epoch > decay_epoch:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            if batch_i * batch_size % log_interval == 0:
                ls.append(loss(net(features), labels).mean().asnumpy())
    print(
        'w[0]=%.2f, w[1]=%.2f, b=%.2f' %
        (net[0].weight.data()[0][0].asscalar(),
         net[0].weight.data()[0][1].asscalar(), net[0].bias.data().asscalar()))
    es = np.linspace(0, num_epochs, len(ls), endpoint=True)
    semilogy(es, ls, 'epoch', 'loss')
Example #14
File: nmt.py  Project: zhould1990/d2l-en
def load_data_nmt(batch_size, max_len, num_examples=1000):
    """Download an NMT dataset, return its vocabulary and data iterator."""

    # Download and preprocess
    def preprocess_raw(text):
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        out = ''
        for i, char in enumerate(text.lower()):
            if char in (',', '!', '.') and text[i - 1] != ' ':
                out += ' '
            out += char
        return out

    fname = gutils.download('http://www.manythings.org/anki/fra-eng.zip')
    with zipfile.ZipFile(fname, 'r') as f:
        raw_text = f.read('fra.txt').decode("utf-8")
    text = preprocess_raw(raw_text)

    # Tokenize
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    # Build vocab
    def build_vocab(tokens):
        tokens = [token for line in tokens for token in line]
        return Vocab(tokens, min_freq=3, use_special_tokens=True)

    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)

    # Convert to index arrays
    def pad(line, max_len, padding_token):
        if len(line) > max_len:
            return line[:max_len]
        return line + [padding_token] * (max_len - len(line))

    def build_array(lines, vocab, max_len, is_source):
        lines = [vocab[line] for line in lines]
        if not is_source:
            lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
        array = nd.array([pad(line, max_len, vocab.pad) for line in lines])
        valid_len = (array != vocab.pad).sum(axis=1)
        return array, valid_len

    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)

    # Construct data iterator
    train_set = gdata.ArrayDataset(src_array, src_valid_len, tgt_array,
                                   tgt_valid_len)
    train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)

    return src_vocab, tgt_vocab, train_iter
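A hypothetical use of `load_data_nmt`; it assumes the d2l-style `Vocab` class referenced above is importable and that the download succeeds:

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=64, max_len=10)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    # X and Y are padded index matrices; the valid lengths exclude padding tokens.
    print(X.shape, Y.shape, X_valid_len.shape)
    break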
Example #15
    def __init__(self,
                 dataset: mxdata.Dataset,
                 num_cls=None,
                 test_ratio=0.2,
                 batch_size=None):
        super().__init__(batch_size)
        self.test_ratio = test_ratio
        self.test_count = int(len(dataset) * test_ratio)
        testdata = dataset[0:self.test_count]
        traindata = dataset[self.test_count:]
        # Build the datasets
        self.trainset = mxdata.ArrayDataset(*traindata)
        self.testset = mxdata.ArrayDataset(*testdata)
        # If num_cls is None, infer it automatically
        if num_cls is None:
            self.num_cls = len(testdata[1])
        else:
            self.num_cls = num_cls
    def fit(self,
            train_x,
            train_y,
            optimizer,
            hyper_params,
            batch_size=64,
            epochs=10,
            test_x=None,
            test_y=None):
        """ fit x, y"""
        train_data_set = gdata.ArrayDataset(train_x, train_y)
        train_iter = gdata.DataLoader(train_data_set,
                                      batch_size=batch_size,
                                      shuffle=True)

        self.net = nn.Sequential()
        self.net.add(nn.Dense(1))
        self.net.initialize()

        if optimizer != "sgd":
            raise ValueError("only support sgd optimizer")

        if self.regularization == "l2":
            weight_decay_params = dict(hyper_params)
            weight_decay_params["wd"] = 3
            self.trainers = [
                gluon.Trainer(self.net.collect_params(".*weight"), "sgd",
                              weight_decay_params),
                gluon.Trainer(self.net.collect_params(".*bias"), "sgd",
                              hyper_params)
            ]
        else:
            self.trainers = [
                gluon.Trainer(self.net.collect_params(), "sgd", hyper_params)
            ]

        train_loss, test_loss = [], []
        for epoch in range(epochs):
            for x, y in train_iter:
                self._train(x, y, optimizer, hyper_params)

            train_loss.append(self.evaluate(train_x, train_y))
            test_loss.append(self.evaluate(test_x, test_y))
        logger.info("final epoch train loss: {}, test loss: {}".format(
            train_loss[-1], test_loss[-1]))
        display_utils.semilogy(range(1, epochs + 1),
                               train_loss,
                               'epochs',
                               'loss',
                               x2_vals=range(1, epochs + 1),
                               y2_vals=test_loss,
                               legend=["train", "test"])

        logger.info("weight: {}\nbias: {}".format(
            self.net[0].weight.data().asnumpy(),
            self.net[0].bias.data().asnumpy()))
Example #17
    def __init__(self, true_w, true_b, num_inputs: int, num_examples: int,
                 batch_size: int):
        super(MxDataLoader, self).\
            __init__(true_w, true_b, num_inputs, num_examples, batch_size)

        self.dataset = gdata.ArrayDataset(self.features, self.labels)

        self.data_iter = gdata.DataLoader(self.dataset,
                                          batch_size,
                                          shuffle=True)
Example #18
def load_data_imdb(batch_size, max_len=500):
    """Download a IMDB dataset, return the vocabulary and iterators"""

    data_dir = '../data'
    url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    fname = gutils.download(url, data_dir)
    with tarfile.open(fname, 'r') as f:
        f.extractall(data_dir)

    def read_imdb(folder='train'):
        data, labels = [], []
        for label in ['pos', 'neg']:
            folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
            for file in os.listdir(folder_name):
                with open(os.path.join(folder_name, file), 'rb') as f:
                    review = f.read().decode('utf-8').replace('\n', '')
                    data.append(review)
                    labels.append(1 if label == 'pos' else 0)
        return data, labels

    train_data, test_data = read_imdb('train'), read_imdb('test')

    def tokenize(sentences):
        return [line.split(' ') for line in sentences]

    train_tokens = tokenize(train_data[0])
    test_tokens = tokenize(test_data[0])

    vocab = Vocab([tk for line in train_tokens for tk in line], min_freq=5)

    def pad(x):
        return x[:max_len] if len(
            x) > max_len else x + [vocab.unk] * (max_len - len(x))

    train_features = nd.array([pad(vocab[line]) for line in train_tokens])
    test_features = nd.array([pad(vocab[line]) for line in test_tokens])

    train_set = gdata.ArrayDataset(train_features, train_data[1])
    test_set = gdata.ArrayDataset(test_features, test_data[1])
    train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
    test_iter = gdata.DataLoader(test_set, batch_size)

    return vocab, train_iter, test_iter
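A hypothetical use of `load_data_imdb`; the first call downloads and unpacks the dataset:

vocab, train_iter, test_iter = load_data_imdb(batch_size=64)
for X, y in train_iter:
    print(X.shape, y.shape)  # (64, 500) padded token indices and (64,) labels
    break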
Example #19
def predict(net, feature_values):
    train_iter = gdata.DataLoader(gdata.ArrayDataset(feature_values), batch_size=net.batch_size,
                                  shuffle=False)  # the 'name' column is dropped when reading with pd.read_csv
    result = []
    for X in train_iter:
        X = X.as_in_context(net.ctx)
        target = net.forward(X)
        result.append(target.reshape((-1,)).asnumpy().tolist())
    result = [num for l in result for num in l]
    return result
def evaluate_accuracy_fold(net, features, labels, batch_size, ctx):
    acc_sum, n = 0, 0
    # create data loader to prevent GPU from running out of memory
    eval_iter = gdata.DataLoader(gdata.ArrayDataset(features, labels),
                                 batch_size,
                                 shuffle=True)
    for X, y in eval_iter:
        X, y = X.as_in_context(ctx), y.as_in_context(ctx).astype('float32')
        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
    return acc_sum / n
def get_data_iter(features, labels, batch):
    """
    获取数据
    :param features:
    :param labels:
    :param batch:
    :return:
    """
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch)
    return data_iter
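A hypothetical use of `get_data_iter` with random data, showing the ArrayDataset/DataLoader pattern these examples share:

from mxnet import nd

X = nd.random.normal(shape=(100, 4))
y = nd.random.normal(shape=(100,))
for batch_X, batch_y in get_data_iter(X, y, batch=16):
    print(batch_X.shape, batch_y.shape)  # (16, 4) and (16,)
    break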
Example #22
def main(column,DIM_NUM):
    Params = RNNParameter(column,DIM_NUM)
    Params.load_dir()
    num_outputs = Params.num_outputs
    lr = Params.lr
    num_epochs = Params.num_epochs
    batch_size = Params.batch_size
    embed_size = DIM_NUM
    num_hiddens = Params.num_hiddens
    num_layers = Params.num_layers
    bidirectional = Params.bidirectional
    ctx = utils.try_all_gpus()
    csvfile = Params.train_file
    vocab = utils.read_vocab(Params.vocab_file)
    glove_embedding = text.embedding.CustomEmbedding(pretrained_file_path=Params.embedding_file, vocabulary=vocab)
    net = utils.BiRNN(vocab, embed_size, num_hiddens, num_layers, bidirectional,num_outputs)
    net.initialize(init.Xavier(), ctx=ctx)
    # Set the weight of the embedding layer to the pre-trained word vectors.
    net.embedding.weight.set_data(glove_embedding.idx_to_vec)
    # Do not update the word vectors (the parameters in net.embedding) during training.
    net.embedding.collect_params().setattr('grad_req', 'null')
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    trainSet,valSet = utils.select_sample_by_class(csvfile,ratio=0.85)
    train_features,test_features,train_labels,test_labels=utils.read_dg_data(trainSet,valSet,vocab,column,MAX_LEN=2500)
    train_set = gdata.ArrayDataset(train_features, train_labels)  # training set
    test_set = gdata.ArrayDataset(test_features, test_labels)  # test set
    train_loader = gdata.DataLoader(train_set, batch_size=batch_size,shuffle=True)
    test_loader = gdata.DataLoader(test_set, batch_size=batch_size, shuffle=False)
    logging.info("开始训练rnn {}文本分类模型".format(column))
    best_acc = utils.train(train_loader, test_loader, net, loss, trainer, ctx, num_epochs,column,Params.best_param_file)
    logging.info("模型训练完成,最佳模型的acc:{:.4f} 开始测试.".format(best_acc))
    net.load_parameters(Params.best_param_file,ctx=ctx)
    f1= utils.evaluate_valset(net,valSet,vocab,column)
    logging.info("rnn网络在验证集的f1_score:{:.4f}".format(f1))
    logging.info("对数据进行测试")
    textSet = pd.read_csv(os.path.join(Params.data_dir,'test_set.csv'))
    y_probs = utils.predict_test_result(net,vocab,textSet,column,'result/rnn_{}_{:.4f}.csv'.format(column,f1))
    logging.info("保存概率数据")
    utils.save_prob_file(y_probs,'result/rnn_{}_{:.4f}_prob.csv'.format(column,f1))
    logging.info("保存完毕,请查看目录result.")
Example #23
def get_iter_data(batch_size=64):
    """
        获取迭代的训练数据和测试数据
    :param batch_size:
    :return:
    """
    # gb.download_imdb()
    # train_data, test_data = gb.read_imdb("train"), gb.read_imdb("test")

    train_data, test_data = read_imdb("train"), read_imdb("test")

    vocab = gb.get_vocab_imdb(train_data)

    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(*gb.preprocess_imdb(train_data, vocab)),
        batch_size=batch_size,
        shuffle=True)
    test_iter = gdata.DataLoader(
        gdata.ArrayDataset(*gb.preprocess_imdb(test_data, vocab)), batch_size)

    return vocab, train_iter, test_iter
Example #24
def init(bsize):
    data, label = load("Kaggle.npz")
    # Convert to spherical coordinates
    # norm=np.sqrt(np.sum(data**2,axis=1,keepdims=True))
    # ag=data/norm
    # data=np.concatenate([data,norm,ag],axis=1)
    # Use sin and cos features
    # data=np.concatenate([np.sin(data),np.cos(data)],axis=1)

    # Under-sample to balance the classes
    cr = under_sampling.NearMiss()
    data, label = cr.fit_sample(data, label)
    # Over-sample to balance the classes
    # ocr=over_sampling.ADASYN()
    # data,label=ocr.fit_sample(data,label)
    # Shuffle
    idx = list(range(len(data)))
    random.shuffle(idx)
    data, label = data[idx], label[idx]
    # One-hot encode the labels
    olabel = np.zeros(shape=(len(label), 2))
    for i, l in enumerate(label):
        olabel[i][int(l - 1)] = 1

    # Type conversion
    data = data.astype("float32")
    olabel = olabel.astype("float32")
    #
    train_sum = int(len(data) / 1.3)
    tdata, tlabel = data[:train_sum], olabel[:train_sum]
    test_data, test_label = data[train_sum:], olabel[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(tdata), nd.array(tlabel))
    test_set = mxdata.ArrayDataset(nd.array(test_data), nd.array(test_label))

    # Data loaders
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)

    return train_loader, test_loader
Example #25
    def next(self):
        """Switch to the next cross-validation fold."""
        self.nowcount += 1
        if self.nowcount >= self.splitcount:
            print("All cross-validation folds have been used")
            return
        startpoint = self.nowcount * self.splen  # start index of the test fold
        # Direct references here (no copy needed for the test slice)
        testar = (self.wholedata[startpoint:startpoint + self.splen],
                  self.wholelabel[startpoint:startpoint + self.splen]
                  )  # take one slice as the test set
        self.testset = mxdata.ArrayDataset(*testar)
        # Build the training set: copy the first slice over the test slice and then
        # drop the first slice. Copies are made because the original data is modified.
        trainar = [self.wholedata.copy(), self.wholelabel.copy()]
        # Framework quirk: slice assignment only takes effect after converting the
        # right-hand side to a NumPy array first.
        trainar[0][startpoint:startpoint +
                   self.splen] = trainar[0][:self.splen].asnumpy()
        trainar[1][startpoint:startpoint +
                   self.splen] = trainar[1][:self.splen].asnumpy()
        trainar[0] = trainar[0][self.splen:]
        trainar[1] = trainar[1][self.splen:]
        self.trainset = mxdata.ArrayDataset(*trainar)
Example #26
def make_dataset(src_data, trg_data, src_vocab, trg_vocab):
    s_bos = src_vocab.to_indices(BOS)
    s_eos = src_vocab.to_indices(EOS)
    s_pad = src_vocab.to_indices(PAD)
    # Target-side special tokens come from the target vocabulary.
    t_bos = trg_vocab.to_indices(BOS)
    t_eos = trg_vocab.to_indices(EOS)
    t_pad = trg_vocab.to_indices(PAD)

    t = gdata.ArrayDataset(src_data, trg_data)
    dataset = t.transform(lambda x, y: _transform(x, y, s_bos, s_eos, s_pad,
                                                  t_bos, t_eos, t_pad),
                          lazy=True)
    return dataset
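`_transform` is defined elsewhere; one plausible per-sample transform consistent with the arguments passed here, purely as an assumption:

def _transform(src, tgt, s_bos, s_eos, s_pad, t_bos, t_eos, t_pad):
    # Wrap the target sequence with BOS/EOS markers and leave the source as-is;
    # the padding indices would be used by a later padding/bucketing step.
    return src, [t_bos] + list(tgt) + [t_eos]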
Example #27
def train(trainer_fn, states, hyperparams, features, labels, batch_size,
          num_epochs):
    # version 1
    # w = nd.random.normal(scale=0.01, shape=(features.shape[1], 1))
    # b = nd.zeros(1)
    # w.attach_grad()
    # b.attach_grad()
    # version 2
    w_ = gluon.Parameter("w", shape=(features.shape[1], 1))
    w_.initialize()
    w_.set_data(nd.random.normal(scale=0.01, shape=(features.shape[1], 1)))
    b_ = gluon.Parameter("b", shape=(1, ))
    b_.initialize()
    b_.set_data(nd.zeros(1))

    net = gb.linreg
    loss = gb.squared_loss
    trainer = gluon.Trainer([w_, b_], "sgd", hyperparams)

    def eval_loss():
        # version 1
        # return loss(net(features, w, b), labels).mean().asscalar()
        # version 2
        return loss(net(features, w_.data(), b_.data()),
                    labels).mean().asscalar()

    ls = [eval_loss()]
    dataset = gdata.ArrayDataset(features, labels)
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)

    for epoch in range(num_epochs):
        start = time.time()
        for i, (X, y) in enumerate(data_iter):
            with autograd.record():
                # version 1
                # l = loss(net(X, w, b), y).mean()
                # version 2
                l = loss(net(X, w_.data(), b_.data()), y).mean()
            l.backward()
            # version 1
            # trainer_fn([w, b], states, hyperparams)
            # version 2
            trainer.step(1)
        ls.append(eval_loss())
        print('epoch %d, loss: %f, %f sec per epoch' %
              (epoch + 1, ls[-1], time.time() - start))

    gb.set_figsize()
    gb.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    gb.plt.xlabel('epoch')
    gb.plt.ylabel('loss')
    def train(self, datas, labels, batch_size=3, epoch=10):
        dataset = gdata.ArrayDataset(datas, labels)
        data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
        for _ in range(epoch):
            for X, y in data_iter:
                with autograd.record():
                    res = self.nn(X)
                    l = self.loss(res, y)
                l.backward()
                self.trainer.step(batch_size)

            for X, y in data_iter:
                l = self.loss(self.nn(X), y)
                print('loss', l)
                break
def read_data(max_seq_len):
    # 'in' and 'out' are short for 'input' and 'output'
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('./data/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue  # skip this sample if it would exceed max_seq_len after appending EOS
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)
Example #30
def read_data(max_seq_len):
    # 'in' and 'out' are short for 'input' and 'output'
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('C:/Users/mayao/Desktop/d2l-zh/data/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            continue  # skip this sample if it would exceed max_seq_len after appending EOS
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)