Example #1
class RnnModel:
    def __init__(self):
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab(vocab_file)
        self.model = TextRNN()
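        # load the trained weights (a state_dict) from 'model_params.pkl'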
        self.model.load_state_dict(torch.load('model_params.pkl'))
 
    def predict(self, message):
        content = message
        data = [self.word_to_id[x] for x in content if x in self.word_to_id]
        data = kr.preprocessing.sequence.pad_sequences([data], 600)
        data = torch.LongTensor(data)
        y_pred_cls = self.model(data)
        class_index = torch.argmax(y_pred_cls[0]).item()
        return self.categories[class_index]
def train(args):
    train_iter, dev_iter = data_processor.load_data(args)  # split the data into training and validation sets
    print('Finished loading data')
    model = TextRNN(args)
    if args.cuda: model.cuda()
    """
    Q5:
        Please give optimizer here
    """
    optimizer = torch.optim.Adam(model.parameters())
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epoch + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label

            # t_() would transpose (max_len, batch_size) to (batch_size, max_len)
            with torch.no_grad():
                #feature.t_()
                target.sub_(1)  # shift labels from 1-based to 0-based
                #print(feature.shape)

            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logits = model(feature)
            #print(logits.shape)
            loss = F.cross_entropy(logits, target)
            loss.backward()
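            # clip the global gradient norm to 1 to keep the RNN gradients from exploding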
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                # torch.max(logits, 1) returns each row's maximum value together with its column index
                corrects = (torch.max(logits, 1)[1] == target).sum().item()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(
                        steps, loss.item(), train_acc, corrects,
                        batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(
                            best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(
                            args.early_stopping, best_acc))
                        raise KeyboardInterrupt
Example #3
def train():
    model = TextRNN().to(device)
    # define the loss function
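    # MultiLabelSoftMarginLoss expects multi-hot targets, so the labels y are assumed to be one-hot encoded (see the argmax-based accuracy below)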
    Loss = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # track the best model; initialize the best validation accuracy to 0
    best_val_acc = 0
    for epoch in range(10):
        # print('epoch=',epoch)
        # train in mini-batches
        accuracy_array0 = np.array([])
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x = x_batch.to(device)
            y = y_batch.to(device)
            out = model(x)
            loss = Loss(out, y)
            #print(out)
            #print('loss=',loss)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            accuracy0 = np.mean(
                (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
            accuracy_array0 = np.append(accuracy_array0, accuracy0)
        accuracy_train = np.mean(accuracy_array0)
        print('accuracy_train:', accuracy_train)
        # validate the model every 5 epochs
        if (epoch + 1) % 5 == 0:
            accuracy_array1 = np.array([])
            for step, (x_batch, y_batch) in enumerate(val_loader):
                x = x_batch.to(device)
                y = y_batch.to(device)
                out = model(x)
                # compute the accuracy of this validation batch
                accuracy1 = np.mean(
                    (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
                accuracy_array1 = np.append(accuracy_array1, accuracy1)
            accuracy_val = np.mean(accuracy_array1)
            print('accuracy_val:', accuracy_val)
            if accuracy_val > best_val_acc:
                torch.save(model, 'model.pkl')
                best_val_acc = accuracy_val
                print('model.pkl saved')
Example #4
def train(args):
    train_iter, dev_iter = data_processor.load_data(args)  # split the data into training and validation sets
    print('Finished loading data')
    model = TextRNN(args)
    Cuda = torch.cuda.is_available()
    if Cuda and args.cuda: 
        model.cuda()
    """
    Q5:
        Please give optimizer here
		
		Add lr_scheduler to adjust learning rate.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.8)
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epoch + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            
            # t_() transposes (max_len, batch_size) to (batch_size, max_len)
            with torch.no_grad():
                feature.t_(), target.sub_(1)  # shift labels from 1-based to 0-based
           
            if args.cuda and Cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logits = model(feature)
            loss = F.cross_entropy(logits, target)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                # torch.max(logits, 1) returns each row's maximum value together with its column index
                corrects = (torch.max(logits, 1)[1] == target).sum().item()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                             loss.item(),
                                                                             train_acc,
                                                                             corrects,
                                                                             batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
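                    # decay the learning rate by a factor of 0.8 whenever dev accuracy fails to improve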
                    scheduler.step()
                    print('lr decayed to {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(args.early_stopping, best_acc))
                        raise KeyboardInterrupt
Example #5
File: train.py  Project: zbyzby11/TextRNN
def main():
    reviews_ints, labels, features, word_int_dict = data_processing(300)
    train_data, test_data, train_label, test_label = split_train_test(features, labels, 0.1)
    textrnn = TextRNN(300 * len(train_data), embed_size, hidden_size, 1)
    criterion = nn.CrossEntropyLoss()
    optimizer = t.optim.Adam(textrnn.parameters(), lr=0.01)
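    # number of mini-batches per epoch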
    process_bar = len(train_data) // batch_size + 1
    # print('process_bar:', process_bar)
    for epoch in range(num_epochs):
        # h0 = [num_layers(1) * num_directions(1), batch_size, hidden_size]
        # h0 = h0.to(device)# 1*200*256
        # print(type(h0))
        for i in range(process_bar):
            x = train_data[batch_size * i:batch_size * (i + 1)]
            y = train_label[batch_size * i:batch_size * (i + 1)]
            # x = [batch_size * seq_length]
            x = t.LongTensor(x)
            y = t.LongTensor(y)
            # In the next step the input is x = [batch_size, seq_length, embed_size],
            # h0 = [batch_size, num_layers(1) * num_directions(1), hidden_size]
            # 输出output= [batch_size, seq_length, output_dim(num_directions * hidden_size)],
            # ht = [batch_size, num_layers * num_directions, hidden_size]
            output = textrnn(x)
            # print(output.size())
            # print(y.size())
            loss = criterion(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(str(datetime.datetime.now()) + '||epoch ' + str(epoch + 1) + '||step ' + str(
                i + 1) + ' | loss is: ' + str(loss.item()))

            if i % 5 == 0:
                # h0 = t.zeros(num_layers, len(test_data), hidden_size)
                test = t.LongTensor(test_data)
                # test = test.transpose(0, 1)
                # test_label = t.LongTensor(test_label)
                output = textrnn(test)
                pre_y = t.max(output,dim=1)[1].data.numpy().squeeze()
                print(len(pre_y))
                acc = sum(pre_y == test_label) / len(test_label)
                print('acc:', acc)
Example #6
    def __init__(self):
        self.config = TRNNConfig()
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab(vocab_dir)
        self.config.vocab_size = len(self.words)
        self.model = TextRNN(self.config)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #7
def train_TextRNN():
    model = TextRNN(TextRNNConfig)
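    # loss and metric wrappers (fastNLP-style) mapping the model's "pred" output to the dataset's "target" field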
    loss = CrossEntropyLoss(pred="pred", target="target")
    metrics = AccuracyMetric(pred="pred", target="target")
    trainer = Trainer(model=model,
                      train_data=dataset_train,
                      dev_data=dataset_dev,
                      loss=loss,
                      metrics=metrics,
                      batch_size=16,
                      n_epochs=20)
    trainer.train()
    tester = Tester(dataset_test, model, metrics)
    tester.test()
Example #8
File: train.py  Project: gitgitgithut/cnews
def train(lr, train_loader, test_dataset):
    model = TextRNN().cuda()
    loss_fn = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_acc = 0

    for epoch in range(train_epochs):
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x, y = x_batch.cuda(), y_batch.cuda()

            # forward pass
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            acc = np.mean(
                (torch.argmax(y_pred, 1) == torch.argmax(y, 1)).cpu().numpy())
        print('Training epoch {:}, loss = {:}, acc = {:}'.format(
            epoch + 1, loss.item(), acc))

        if (epoch + 1) % 5 == 0:
            for step, (x_batch, y_batch) in enumerate(test_loader):
                x, y = x_batch.cuda(), y_batch.cuda()

                # forward pass
                y_pred = model(x)
                acc = np.mean(
                    (torch.argmax(y_pred, 1) == torch.argmax(y,
                                                             1)).cpu().numpy())
                # print('Test acc = {:}'.format(acc))
                if acc > best_acc:
                    best_acc = acc
                    torch.save(model.state_dict(), 'model_params.pkl')
Example #9
def train(x_train, y_train, vocab_processor, x_dev, y_dev):
    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            if FLAGS.model == "cnn":
                print("Begin to train model with cnn")
                nn = TextCNN(sequence_length=x_train.shape[1],
                             num_classes=y_train.shape[1],
                             vocab_size=len(vocab_processor.vocabulary_),
                             embedding_size=FLAGS.embedding_dim,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda)
            else:
                print("Begin to train model with rnn")
                nn = TextRNN(sequence_length=x_train.shape[1],
                             num_classes=y_train.shape[1],
                             vocab_size=len(vocab_processor.vocabulary_),
                             lstm_size=FLAGS.lstm_size,
                             embedding_size=FLAGS.embedding_dim,
                             num_layers=FLAGS.num_layers,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             attn_size=FLAGS.attn_size)
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(nn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", nn.loss)
            acc_summary = tf.summary.scalar("accuracy", nn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    nn.input_x: x_batch,
                    nn.input_y: y_batch,
                    nn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, nn.loss,
                    nn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    nn.input_x: x_batch,
                    nn.input_y: y_batch,
                    nn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, nn.loss, nn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.train_batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #10
def train():
    word_dict = load_vocab(FLAGS.vocab_data)
    glove = load_glove("../glove.6B.{}d.txt".format(FLAGS.embedding_size),
                       FLAGS.embedding_size, word_dict)
    train = Dataset(filepath=FLAGS.train_data,
                    num_class=FLAGS.num_class,
                    sequence_length=FLAGS.sequence_length)
    valid = Dataset(filepath=FLAGS.valid_data,
                    num_class=FLAGS.num_class,
                    sequence_length=FLAGS.sequence_length)

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rnn = TextRNN(vocab_size=len(word_dict),
                          embedding_size=FLAGS.embedding_size,
                          sequence_length=FLAGS.sequence_length,
                          num_class=FLAGS.num_class,
                          cell_type=FLAGS.cell_type,
                          hidden_size=FLAGS.hidden_size,
                          pretrained_embeddings=glove,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.compat.v1.Variable(0,
                                                name="global_step",
                                                trainable=False)
            train_op = tf.compat.v1.train.AdamOptimizer(
                FLAGS.learning_rate).minimize(rnn.loss,
                                              global_step=global_step)
            acc, acc_op = tf.compat.v1.metrics.accuracy(
                labels=rnn.labels,
                predictions=rnn.predictions,
                name="metrics/acc")
            metrics_vars = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.LOCAL_VARIABLES, scope="metrics")
            metrics_init_op = tf.compat.v1.variables_initializer(
                var_list=metrics_vars)
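            # metrics_init_op resets the streaming-accuracy counters; it is run before the training and validation passes of each epoch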

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.compat.v1.summary.scalar("loss", rnn.loss)
            acc_summary = tf.compat.v1.summary.scalar("accuracy", rnn.accuracy)

            # Train summaries
            train_summary_op = tf.compat.v1.summary.merge(
                [loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.compat.v1.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Valid summaries
            valid_step = 0
            valid_summary_op = tf.compat.v1.summary.merge(
                [loss_summary, acc_summary])
            valid_summary_dir = os.path.join(out_dir, "summaries", "valid")
            valid_summary_writer = tf.compat.v1.summary.FileWriter(
                valid_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(),
                                             max_to_keep=FLAGS.num_checkpoints)

            # initialize all variables
            best_valid_acc = 0.0
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.local_variables_initializer())

            # training and validating loop
            for epoch in range(FLAGS.num_epoch):
                print('-' * 100)
                print('\n{}> epoch: {}\n'.format(
                    datetime.datetime.now().isoformat(), epoch))
                sess.run(metrics_init_op)
                # Training process
                for batch in train.bacth_iter(FLAGS.batch_size,
                                              desc="Training",
                                              shuffle=True):
                    labels, docs = zip(*batch)
                    padded_docs, _, masks = vectorize(docs,
                                                      FLAGS.sequence_length)
                    feed_dict = {
                        rnn.inputs: padded_docs,
                        rnn.labels: labels,
                        rnn.masks: masks,
                        rnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy, _ = sess.run([
                        train_op, global_step, train_summary_op, rnn.loss,
                        rnn.accuracy, acc_op
                    ], feed_dict)
                    train_summary_writer.add_summary(summaries, step)

                print("\ntraining accuracy = {:.2f}\n".format(
                    sess.run(acc) * 100))

                sess.run(metrics_init_op)
                # Validating process
                for batch in valid.bacth_iter(FLAGS.batch_size,
                                              desc="Validating",
                                              shuffle=False):
                    valid_step += 1
                    labels, docs = zip(*batch)
                    padded_docs, _, masks = vectorize(docs,
                                                      FLAGS.sequence_length)
                    feed_dict = {
                        rnn.inputs: padded_docs,
                        rnn.labels: labels,
                        rnn.masks: masks,
                        rnn.dropout_keep_prob: 1.0
                    }
                    summaries, loss, accuracy, _ = sess.run(
                        [valid_summary_op, rnn.loss, rnn.accuracy, acc_op],
                        feed_dict)
                    valid_summary_writer.add_summary(summaries,
                                                     global_step=valid_step)

                valid_acc = sess.run(acc) * 100
                print("\nvalidating accuracy = {:.2f}\n".format(valid_acc))

                # model checkpoint
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc
                    print("current best validating accuracy = {:.2f}\n".format(
                        best_valid_acc))
                    path = saver.save(sess, checkpoint_prefix)
                    print("saved model checkpoint to {}\n".format(path))

            print("{} optimization finished!\n".format(
                datetime.datetime.now()))
            print("best validating accuracy = {:.2f}\n".format(best_valid_acc))
Example #11
    vocab_dir = os.path.join(base_dir, 'vocab.txt')
    save_dir = os.path.join(base_dir, train_ratio + '/checkpoints/textrnn')
    save_path = os.path.join(save_dir, 'best_validation')  # path where the best validation checkpoint is saved
    window_size = int(window_size)
    train_ratio = float(train_ratio)

    print('Configuring RNN model...')
    print('Building vocab if not exists.')
    start_time_vocab = time.time()
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_data_dir, vocab_dir)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)
    time_dif_vocab = get_time_dif(start_time_vocab)
    print("Time usage:", time_dif_vocab)

    # read the raw data and convert it into three datasets
    print("Processing and loading training and validation data...")
    start_time = time.time()
    x_train, x_val, x_test, y_train, y_val, y_test = process_all_file(
        train_data_dir, eval_data_dir, train_ratio, word_to_id, cat_to_id,
        config.seq_length, window_size)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    print('==========Training==========')
    start_time_train = time.time()
    train()
Example #12
def train():
    model = TextRNN().to(device)
    # define the loss function
    Loss = nn.MultiLabelSoftMarginLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # track the best model; initialize the best validation accuracy to 0
    best_val_acc = 0
    costs = []
    early_stop = 0
    min_loss = float('inf')
    for epoch in range(5):
        # print('epoch=',epoch)
        # train in mini-batches
        losses = []
        accuracy_array0 = np.array([])
        for step, (x_batch, y_batch) in enumerate(train_loader):
            x = x_batch.to(device)
            y = y_batch.to(device)
            out = model(x)
            loss = Loss(out, y)
            losses.append(loss.item())
            #print(out)
            #print('loss=',loss)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            accuracy0 = np.mean(
                (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
            accuracy_array0 = np.append(accuracy_array0, accuracy0)
        meanloss = np.mean(losses)
        costs.append(meanloss)
        # validate the model every 5 epochs
        if (epoch + 1) % 5 == 0:
            accuracy_train = np.mean(accuracy_array0)
            print('accuracy_train:', accuracy_train)
            accuracy_array1 = np.array([])
            for step, (x_batch, y_batch) in enumerate(val_loader):
                x = x_batch.to(device)
                y = y_batch.to(device)
                out = model(x)
                # compute the accuracy of this validation batch
                accuracy1 = np.mean(
                    (torch.argmax(out, 1) == torch.argmax(y, 1)).cpu().numpy())
                accuracy_array1 = np.append(accuracy_array1, accuracy1)
            accuracy_val = np.mean(accuracy_array1)
            print('accuracy_val:', accuracy_val)
            if accuracy_val > best_val_acc:
                torch.save(model, 'model.pkl')
                best_val_acc = accuracy_val
                print('model.pkl saved')
        # early stopping
        if meanloss < min_loss:
            min_loss = meanloss
            early_stop = 0
        else:
            early_stop += 1
        if early_stop > 5:
            print(f"Loss has not decreased for {early_stop} consecutive epochs; stopping training")
            break
Example #13
File: train.py  Project: RikkyLai/CNews
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_loader.dataset), confusion
    return acc, loss_total / len(data_loader.dataset)


EPOCH = 30
batch_size = 32
best_epoch, best_acc = 0, 0
# filename for saving the trained model
file_name = 'cnews_best.pt'
train_data = textData(train=True)
val_data = textData(val=True)
test_data = textData()
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
model = TextRNN()

# loss function: cross-entropy
criterion = nn.CrossEntropyLoss()
# optimizer: Adam
optimizer = optim.Adam(model.parameters(), lr=0.001)

# device : GPU or CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# training loop
for epoch in range(EPOCH):
    start_time = time.time()
    for i, data in enumerate(train_loader):
        model.train()
Example #14
def main(args):
    print "loadding reviews and labels from dataset"
    data = pd.read_csv('data/labeledTrainData.tsv.zip',
                       compression='zip',
                       delimiter='\t',
                       header=0,
                       quoting=3)
    reviews = data["review"]
    labels = list(data['sentiment'])
    sentences = []
    for review in reviews:
        if len(review) > 0:
            sentences.append(
                utils.review_to_wordlist(review.decode('utf8').strip(),
                                         remove_stopwords=True))
    print "loaded %d reviews from dataset" % len(sentences)

    word_dict = utils.build_vocab(sentences, max_words=10000)
    vec_reviews = utils.vectorize(sentences, word_dict, verbose=True)
    train_x = vec_reviews[0:20000]
    train_y = labels[0:20000]
    train_y = utils.one_hot(train_y, args.nb_classes)
    test_x = vec_reviews[20000:]
    test_y = labels[20000:]
    test_y = utils.one_hot(test_y, args.nb_classes)

    save_dir = args.save_dir
    log_dir = args.log_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with tf.Graph().as_default():
        config_proto = utils.get_config_proto()
        sess = tf.Session(config=config_proto)
        if args.model_type == "cnn":
            model = TextCNN(args, "TextCNN")
            test_batch = utils.get_batches(test_x, test_y, args.max_size)
        elif args.model_type in ["rnn", "bi_rnn"]:
            model = TextRNN(args, "TextRNN")
            test_batch = utils.get_batches(test_x,
                                           test_y,
                                           args.max_size,
                                           type="rnn")

        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        for epoch in range(1, args.nb_epochs + 1):
            print "epoch %d start" % epoch
            print "- " * 50

            loss = 0.
            total_reviews = 0
            accuracy = 0.
            if args.model_type == "cnn":
                train_batch = utils.get_batches(train_x, train_y,
                                                args.batch_size)
            elif args.model_type in ["rnn", "bi_rnn"]:
                train_batch = utils.get_batches(train_x,
                                                train_y,
                                                args.batch_size,
                                                type="rnn")
            epoch_start_time = time.time()
            step_start_time = epoch_start_time
            for idx, batch in enumerate(train_batch):
                reviews, reviews_length, labels = batch
                _, loss_t, accuracy_t, global_step, batch_size, summaries = model.train(
                    sess, reviews, reviews_length, labels, args.keep_prob)

                loss += loss_t * batch_size
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
                summary_writer.add_summary(summaries, global_step)

                if global_step % 50 == 0:
                    print "epoch %d, step %d, loss %f, accuracy %.4f, time %.2fs" % \
                      (epoch, global_step, loss_t, accuracy_t, time.time() - step_start_time)
                    step_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            print "%.2f seconds in this epoch" % (epoch_time)
            print "train loss %f, train accuracy %.4f" % (
                loss / total_reviews, accuracy / total_reviews)

            total_reviews = 0
            accuracy = 0.
            for batch in test_batch:
                reviews, reviews_length, labels = batch
                accuracy_t, batch_size = model.test(sess, reviews,
                                                    reviews_length, labels,
                                                    1.0)
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
            print "accuracy %.4f in %d test reviews" % (
                accuracy / total_reviews, total_reviews)
Example #15
def test():
    # configuration file
    cf = Config('./config.yaml')
    # use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)

    # pretrained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextRNN(cf, torch.tensor(embedding_matrix))

    # model.load_state_dict(torch.load("./output/model.bin",map_location='cpu'))
    model.load_state_dict(torch.load("./output/model.bin"))
    # move the model to the target device
    model.to(device)

    # run the model in parallel across multiple GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # evaluation
    start_time = time.time()

    data_len = len(test_dataloader)

    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    # for step,batch in enumerate(tqdm(test_dataloader,"batch",total=len(test_dataloader))):
    for step, batch in enumerate(test_dataloader):

        label_id = batch['label_id'].squeeze(1).to(device)
        seq_len = batch["seq_len"].to(device)
        segment_ids = batch['segment_ids'].to(device)

        # sort the sequences by length in descending order
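        # (descending length order is typically needed when the model packs padded sequences with pack_padded_sequence)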
        seq_len, perm_idx = seq_len.sort(0, descending=True)
        label_id = label_id[perm_idx]
        segment_ids = segment_ids[perm_idx].transpose(0, 1)

        with torch.no_grad():
            pred = model.get_labels(segment_ids, seq_len)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # evaluation metrics
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test,
                                      y_pred,
                                      target_names=get_labels('./data/label')))

    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
Example #16
def train():
    # configuration file
    cf = Config('./config.yaml')
    # use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # training data
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data,
                                  batch_size=cf.batch_size,
                                  shuffle=True)
    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data,
                                 batch_size=cf.batch_size,
                                 shuffle=True)

    # pretrained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextRNN(cf, torch.tensor(embedding_matrix))
    # optimizer: Adam
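    # optimize only the parameters that require gradients (frozen parameters, e.g. pretrained embeddings, are skipped)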
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

    # move the model to the target device
    model.to(device)

    # run the model in parallel across multiple GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # training
    start_time = time.time()

    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation-set accuracy so far
    last_improved = 0  # batch index of the last improvement
    require_improvement = 1000  # stop early if no improvement for more than 1000 batches

    flag = False
    model.train()
    for epoch_id in trange(cf.epoch, desc="Epoch"):
        # for step,batch in enumerate(tqdm(train_dataloader,"batch",total=len(train_dataloader))):
        for step, batch in enumerate(train_dataloader):

            label_id = batch['label_id'].squeeze(1).to(device)
            seq_len = batch["seq_len"].to(device)
            segment_ids = batch['segment_ids'].to(device)

            # sort the sequences by length in descending order
            seq_len, perm_idx = seq_len.sort(0, descending=True)
            label_id = label_id[perm_idx]
            segment_ids = segment_ids[perm_idx].transpose(0, 1)

            loss = model(segment_ids, seq_len, label_id)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = model.get_loss_acc(
                        segment_ids, seq_len, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)

                if acc_val > best_acc_val:
                    # save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))

                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
Example #17
    # evaluation metrics
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':

    print('Configuring RNN model...')
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, args.VOCAB_SIZE)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    args.VOCAB_SIZE = len(words)
    model = TextRNN(args)

    if args.DO_TRAIN:
        train()
    if args.DO_TEST:
        test()
Example #18
def main(args):
    print "loadding data and labels from dataset"
    train = pd.read_csv(args.train_dir)
    ch_train = pd.read_csv(args.chtrain_dir)
    x_train = train["comment_text"]
    x_chtrain = ch_train["comment_text"]
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    x = []
    x_ch = []
    for line in x_train:
        if len(line) > 0:
            x.append(utils.review_to_wordlist(line.strip()))
    print "loaded %d comments from dataset" % len(x)
    for line in x_chtrain:
        if len(line) > 0:
            x_ch.append(utils.review_to_wordlist_char(line.strip()))
    print "loaded %d comments from dataset" % len(x)
    y = train[target_cols].values

    index2word, word2index = utils.load_vocab(args.vocab_dir)
    index2char, char2index = utils.load_char(args.char_dir)
    x_vector = utils.vectorize(x, word2index, verbose=False)
    x_vector = np.array(x_vector)
    char_vector = utils.vectorize_char(x_ch, char2index, verbose=False)
    char_vector = np.array(char_vector)
    print char_vector[0]

    save_dir = os.path.join(args.save_dir, args.model_type)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]:
        max_step = args.max_step_cnn
        max_size = args.max_size_cnn
        nb_epochs = args.nb_epochs_cnn
    elif args.model_type in [
            "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn"
    ]:
        max_step = args.max_step_rnn
        max_size = args.max_size_rnn
        nb_epochs = args.nb_epochs_rnn

    ex_features = add_features("../data/train.csv")
    nfolds = args.nfolds
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018)
    test_prob = []
    stack_logits = np.zeros((len(x_vector), len(target_cols)))
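    # stack_logits will hold the out-of-fold predictions from each fold (used for stacking)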
    for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)):
        x_train, x_eval = x_vector[train_index], x_vector[test_index]
        char_train, char_eval = char_vector[train_index], char_vector[
            test_index]
        y_train, y_eval = y[train_index], y[test_index]
        with tf.Graph().as_default():
            config_proto = utils.get_config_proto()
            sess = tf.Session(config=config_proto)
            if args.model_type == "cnn":
                model = TextCNN(args, "TextCNN")
            elif args.model_type == "cnnfe":
                model = TextCNNFE(args, "TextCNNFE")
            elif args.model_type == "rnn":
                model = TextRNN(args, "TextRNN")
            elif args.model_type == "rnnfe":
                model = TextRNNFE(args, "TextRNNFE")
            elif args.model_type == "rcnn":
                model = TextRCNN(args, "TextRCNN")
            elif args.model_type == "attention":
                model = RNNWithAttention(args, "Attention")
            elif args.model_type == "chrnn":
                model = TextRNNChar(args, "TextRNNChar")
            elif args.model_type == "chcnn":
                model = TextCNNChar(args, "TextCNNChar")
            elif args.model_type == "chcnn2":
                model = TextCNNChar(args, "TextCNNChar2")
            elif args.model_type == "rnnfe2":
                model = TextRNNFE2(args, "TextCNNCharFE2")
            elif args.model_type == "chrnnfe":
                model = TextRNNCharFE(args, "TextCNNCharFE")
            else:
                raise ValueError("Unknown model_type %s" % args.model_type)
            sess.run(tf.global_variables_initializer())

            if args.use_ft:
                pretrain_dir = args.ft_dir
                print "use FastText word vector"
                embedding = utils.load_fasttext(pretrain_dir, index2word)
            if not args.use_ft:
                pretrain_dir = args.glove_dir
                print "use Glove word vector"
                embedding = utils.load_glove(pretrain_dir, index2word)
            sess.run(model.embedding_init,
                     {model.embedding_placeholder: embedding})

            for line in model.tvars:
                print line

            print "training %s model for toxic comments classification" % (
                args.model_type)
            print "%d fold start training" % f
            for epoch in range(1, nb_epochs + 1):
                print "epoch %d start with lr %f" % (
                    epoch,
                    model.learning_rate.eval(session=sess)), "\n", "- " * 50
                loss, total_comments = 0.0, 0
                if args.model_type in ["cnn", "rnn", "rcnn"]:
                    train_batch = utils.get_batches(x_train, y_train,
                                                    args.batch_size,
                                                    args.max_len)
                    valid_batch = utils.get_batches(x_eval, y_eval, max_size,
                                                    args.max_len, False)

                elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                    train_batch = utils.get_batches_with_char(
                        x_train, char_train, y_train, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_char(
                        x_eval, char_eval, y_eval, max_size, args.max_len,
                        False)

                elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                    train_batch = utils.get_batches_with_fe(
                        x_train, y_train, ex_features, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_fe(
                        x_eval, y_eval, ex_features, max_size, args.max_len,
                        False)

                elif args.model_type in ["chrnnfe"]:
                    train_batch = utils.get_batches_with_charfe(
                        x_train, char_train, y_train, ex_features,
                        args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_charfe(
                        x_eval, char_eval, y_eval, ex_features, max_size,
                        args.max_len, False)

                epoch_start_time = time.time()
                step_start_time = epoch_start_time
                for idx, batch in enumerate(train_batch):
                    if args.model_type in ["cnn", "rnn", "rcnn"]:
                        comments, comments_length, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels)

                    elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                        comments, comments_length, chs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels)

                    elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                        comments, comments_length, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels, exs)

                    elif args.model_type in ["chrnnfe"]:
                        comments, comments_length, chs, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels, exs)

                    loss += loss_t * batch_size
                    total_comments += batch_size

                    if global_step % 200 == 0:
                        print "epoch %d step %d loss %f time %.2fs" % (
                            epoch, global_step, loss_t,
                            time.time() - step_start_time)

                    if global_step % 200 == 0:
                        _ = run_valid(valid_batch, model, sess,
                                      args.model_type)
                        # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step)
                        step_start_time = time.time()

                epoch_time = time.time() - epoch_start_time
                sess.run(model.learning_rate_decay_op)
                print "%.2f seconds in this epoch with train loss %f" % (
                    epoch_time, loss / total_comments)

            test_prob.append(run_test(args, model, sess))
            stack_logits[test_index] = run_valid(valid_batch, model, sess,
                                                 args.model_type)

    preds = np.zeros((test_prob[0].shape[0], len(target_cols)))
    for prob in test_prob:
        preds += prob
        print prob[0]
    preds /= len(test_prob)
    print len(test_prob)
    write_predict(stack_logits, args.model_type)
    write_results(preds, args.model_type)
Example #19
    def __init__(self):
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab(vocab_file)
        self.model = TextRNN()
        self.model.load_state_dict(torch.load('model_params.pkl'))
Example #20
    # cross-validation
    f = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
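    # stratified folds preserve the class distribution of y in each split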
    for i, (tr, va) in enumerate(f.split(x_pad, y)):
        x_train_age = x_pad[tr]
        x_va_age = x_pad[va]
        y_train_age = y[tr]
        y_va_age = y[va]

        # convert integer labels to one-hot vectors
        y_train_age = to_categorical(y_train_age)
        y_va_age = to_categorical(y_va_age)

        print('Starting LSTM modeling...')
        max_features = len(word2index) + 1  # size of the vocabulary
        model = TextRNN(maxlen, max_features, embedding_dims, 7,
                        'softmax').get_model()
        # specify the optimizer, loss, and evaluation metric
        model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

        print('Training...')
        my_callbacks = [
            ModelCheckpoint(model_path + 'lstm_model_age.h5', verbose=1),
            EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
        ]
        # fit the model to the data
        history = model.fit(x_train_age,
                            y_train_age,
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=my_callbacks,
                            validation_data=(x_va_age, y_va_age))