Example #1
def train(x, y):
    model = TextCNN()
    model = model.cuda()
    # materialize the filter so gradient clipping still sees the parameters on every step
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(reduction='sum')

    for epoch in range(100):
        total = 0
        for i in range(0, len(x) // 64):
            batch_x = x[i * 64:(i + 1) * 64]
            batch_y = y[i * 64:(i + 1) * 64]
            batch_x = Variable(torch.FloatTensor(batch_x)).cuda()
            batch_y = Variable(torch.LongTensor(batch_y)).cuda()
            optimizer.zero_grad()
            model.train()
            pred = model(batch_x, 64)
            loss = criterion(pred, batch_y)
            #print(loss)
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=3)
            total += np.sum(
                pred.data.max(1)[1].cpu().numpy() ==
                batch_y.data.cpu().numpy())
            optimizer.step()
        print("epoch ", epoch + 1, " acc: ", float(total) / len(x))
    return model
Example #2
def main(mode):
    if mode == 'train':
        model = TextCNN(embedding_weights)
        model.bulid_graph()
        model.train((x_train, y_train), (x_test, y_test))
    elif mode == 'test':
        model = TextCNN(embedding_weights)
        model.bulid_graph()
        model.test((x_test, y_test), '1528038283')
Example #3
def train(**kwargs):

    opt.parse(kwargs)
    device = torch.device(
        "cuda:{}".format(opt.gpu_id) if torch.cuda.is_available() else "cpu")
    opt.device = device

    x_text, y = load_data_and_labels("./data/rt-polarity.pos",
                                     "./data/rt-polarity.neg")
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=opt.test_size)

    train_data = Data(x_train, y_train)
    test_data = Data(x_test, y_test)

    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

    print("{} train data: {}, test data: {}".format(now(), len(train_data),
                                                    len(test_data)))

    model = TextCNN(opt)
    print("{} init model finished".format(now()))

    if opt.use_gpu:
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.lr,
                           weight_decay=opt.weight_decay)

    for epoch in range(opt.epochs):
        total_loss = 0.0
        model.train()
        for step, batch_data in enumerate(train_loader):
            x, labels = batch_data
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        acc = test(model, test_loader)
        print("{} {} epoch: loss: {}, acc: {}".format(now(), epoch, total_loss,
                                                      acc))
Example #4
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
    else:
        model.eval()

    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
Example #5
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
        # call train() before running the model in training mode
    else:
        model.eval()
        # call eval() before testing: BatchNorm and Dropout are frozen and use the values learned during training instead of batch statistics
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
Example #6
def train(args):
    train_iter, dev_iter = data_processor.load_data(args)  # split the data into training and validation sets
    print('data loading finished')
    model = TextCNN(args)
    if args.cuda: model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epoch + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            # t_() transposes (max_len, batch_size) into (batch_size, max_len)
            #            feature.data.t_(), target.data.sub_(1)  # subtract 1 from target
            feature = feature.data.t()  # x.t() leaves x unchanged, so reassign the result
            #           target.data.sub_(1)
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logits = model(feature)
            loss = F.cross_entropy(logits, target)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                # torch.max(logits, 1) returns each row's maximum value together with its index (the column index of that maximum)
                corrects = (torch.max(logits, 1)[1] == target).sum()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(
                        steps, loss.item(), train_acc, corrects,
                        batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(
                            best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(
                            args.early_stopping, best_acc))
                        raise KeyboardInterrupt
Example #7
def build_textcnn_model(vocab, config, train=True):
    model = TextCNN(vocab.vocab_size, config)
    if train:
        model.train()
        # call train() before running the model in training mode
    else:
        model.eval()
        # call eval() before testing: BatchNorm and Dropout are frozen and use the values learned during training instead of batch statistics

    # train() and eval() exist because some layers behave differently during training and evaluation,
    # for example Batch Normalization and Dropout.
    # BN normalizes the activations of intermediate layers and applies a learned affine transform so the extracted feature distribution is not destroyed;
    # because its parameters are fixed after training, BN behaves differently at training and test time.
    # Dropout combats overfitting: ignoring half of the feature detectors in each training batch noticeably reduces overfitting.
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    return model
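The comments above describe how train() and eval() change the behavior of Dropout and BatchNorm. Below is a minimal sketch (not taken from the repository of this example, assuming only stock PyTorch) that makes the difference visible:

import torch
import torch.nn as nn

dropout = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

dropout.train()    # training mode: roughly half the entries are zeroed, the rest scaled by 1 / (1 - p) = 2
print(dropout(x))

dropout.eval()     # evaluation mode: Dropout acts as the identity
print(dropout(x))  # all ones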
Example #8
def objective(trial):

    model = TextCNN(trial, len(id2vocab), CLS)
    model.to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss= []
        for batch in train_iter:           
            text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label.to(device)
            model.zero_grad()
            out = model(text_idx_batch)
            loss = criterion(out, label_idx_batch)
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()   
        #print(f'Epoch[{epoch}] - Loss:{sum(epoch_loss)/len(epoch_loss)}')

        model.eval()
        predict_all = np.array([], dtype=int)
        labels_all = np.array([], dtype=int)
        with torch.no_grad():        
            for batch in val_iter:
                text_idx_batch, label_idx_batch = batch.text.t_().to(device), batch.label
                pred = model(text_idx_batch)
                pred = torch.max(pred.data, 1)[1].cpu().numpy()
                predict_all = np.append(predict_all, pred)
                
                truth = label_idx_batch.cpu().numpy()
                labels_all = np.append(labels_all, truth)            
            
        acc = metrics.accuracy_score(labels_all, predict_all)
        
        trial.report(acc, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return acc
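The objective above is meant to be driven by an Optuna study; the trial.report()/should_prune() calls cooperate with the study's pruner (a MedianPruner by default). A minimal, assumed driver, not part of the original example, might look like this:

import optuna

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)
print("best accuracy:", study.best_value)
print("best hyperparameters:", study.best_params)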
Example #9
# load the model
net = TextCNN(args, weight).to(device)

# define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                    net.parameters()),
                             lr=args.lr)

# train the model
print('training on ', device)
best_acc = 0.0
step = 0
for epoch in range(1, args.num_epochs + 1):
    net.train()
    for X, y in train_iter:
        X, y = X.to(device), y.to(device)
        y_hat = net(X)  # predicted scores
        loss = criterion(y_hat, y)  # compute the loss
        optimizer.zero_grad()  # zero the gradients
        loss.backward()  # backpropagation
        optimizer.step()  # update the parameters
        step += 1

        # evaluation
        if step % args.test_per_step == 0:
            net.eval()
            all_pre = []
            all_label = []
Example #10
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set
    # TODO: fix this splitting sometimes failing when corpus size changes
    train_set, test_set = torch.utils.data.random_split(
        dataset, [
            int(len(dataset) * DATA_SPLIT),
            int(len(dataset) * (1.0 - DATA_SPLIT))
        ])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters per convolutional layer
    N_FILTERS = 64

    # kernel sizes of the convolutional layers
    FILTER_SIZES = [2, 3]

    # dropout between the conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                for file in glob.glob('models/model_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
Example #11
                                  batch_size=config.batch_size,
                                  num_workers=2)

    config.word_num = len(training_set.tok2num)
    model = TextCNN(config)

    if torch.cuda.is_available():
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)

    training_losses = []
    # Train the model
    for epoch in range(config.epoch):
        model.train()
        for data, label in training_iter:

            if config.cuda and torch.cuda.is_available():
                data = data.cuda()
                label = label.cuda()

            out = model(data)
            loss = criterion(out, label)
            training_losses.append(loss.item())

            if len(training_losses) % 100 == 0:
                print("train epoch", epoch, end='  ')
                print("The loss is: %.5f" %
                      (np.average(training_losses[-100:])))
Example #12
# -*- coding: utf-8 -*-

import tensorflow.keras as keras
import numpy as np
from sklearn import metrics
import os

from preprocess import preprocesser
from config import Config
from model import TextCNN
from model import LSTM

np.random.seed(42)

if __name__ == '__main__':
    CNN_model = TextCNN()
    CNN_model.train(5)
    CNN_model.test()
    # LSTM_MODEL = LSTM()
    # LSTM_MODEL.train(5)
    # LSTM_MODEL.test()
Example #13
def train():
    # config file
    cf = Config('./config.yaml')
    # use the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # training data
    train_data = NewsDataset("./data/cnews_final_train.txt",cf.max_seq_len)
    train_dataloader = DataLoader(train_data,batch_size=cf.batch_size,shuffle=True)
    # test data
    test_data = NewsDataset("./data/cnews_final_test.txt",cf.max_seq_len)
    test_dataloader = DataLoader(test_data,batch_size=cf.batch_size,shuffle=True)

    # pretrained word-embedding matrix
    embedding_matrix = get_pre_embedding_matrix("./data/final_vectors")
    # model
    model = TextCNN(cf,torch.tensor(embedding_matrix))
    # Adam optimizer
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

    # move the model to the chosen device
    model.to(device)

    # parallelize the model across GPUs
    if torch.cuda.device_count()>1:
        model = torch.nn.DataParallel(model)
    
    # training
    start_time = time.time()

    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 1000  # stop training early after 1000 batches without improvement

    flag = False
    model.train()
    for epoch_id in trange(cf.epoch,desc="Epoch"):
        for step,batch in enumerate(tqdm(train_dataloader,"batch",total=len(train_dataloader))):
            
            label_id = batch['label_id'].squeeze(1).to(device) 
            segment_ids = batch['segment_ids'].to(device) 

            loss = model(segment_ids,label_id)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1 

            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train,acc_train = model.get_loss_acc(segment_ids,label_id)
                loss_val,acc_val = evaluate(model,test_dataloader,device)
                
                if acc_val  > best_acc_val:
                    # keep the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(),"./output/model.bin")
                    improved_str = "*"
                else:
                    improved_str = ""
                
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
                
                model.train()

            if total_batch - last_improved > require_improvement:
                print("长时间未优化")
                flag = True
                break
        if flag:
            break
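The loop above relies on an evaluate(model, test_dataloader, device) helper that is not shown. The following is a hypothetical sketch of what such a helper might look like, assuming the same batch layout and that model.get_loss_acc returns a (loss, accuracy) pair as in the training branch:

def evaluate(model, dataloader, device):
    # hypothetical helper: average loss and accuracy over the whole dataloader
    model.eval()
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            segment_ids = batch['segment_ids'].to(device)
            loss, acc = model.get_loss_acc(segment_ids, label_id)
            total_loss += float(loss)
            total_acc += float(acc)
            n_batches += 1
    return total_loss / n_batches, total_acc / n_batches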
Example #14
def main(args):
    print "loadding reviews and labels from dataset"
    data = pd.read_csv('data/labeledTrainData.tsv.zip',
                       compression='zip',
                       delimiter='\t',
                       header=0,
                       quoting=3)
    reviews = data["review"]
    labels = list(data['sentiment'])
    sentences = []
    for review in reviews:
        if len(review) > 0:
            sentences.append(
                utils.review_to_wordlist(review.decode('utf8').strip(),
                                         remove_stopwords=True))
    print "loaded %d reviews from dataset" % len(sentences)

    word_dict = utils.build_vocab(sentences, max_words=10000)
    vec_reviews = utils.vectorize(sentences, word_dict, verbose=True)
    train_x = vec_reviews[0:20000]
    train_y = labels[0:20000]
    train_y = utils.one_hot(train_y, args.nb_classes)
    test_x = vec_reviews[20000:]
    test_y = labels[20000:]
    test_y = utils.one_hot(test_y, args.nb_classes)

    save_dir = args.save_dir
    log_dir = args.log_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with tf.Graph().as_default():
        config_proto = utils.get_config_proto()
        sess = tf.Session(config=config_proto)
        if args.model_type == "cnn":
            model = TextCNN(args, "TextCNN")
            test_batch = utils.get_batches(test_x, test_y, args.max_size)
        elif args.model_type in ["rnn", "bi_rnn"]:
            model = TextRNN(args, "TextRNN")
            test_batch = utils.get_batches(test_x,
                                           test_y,
                                           args.max_size,
                                           type="rnn")

        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        for epoch in range(1, args.nb_epochs + 1):
            print "epoch %d start" % epoch
            print "- " * 50

            loss = 0.
            total_reviews = 0
            accuracy = 0.
            if args.model_type == "cnn":
                train_batch = utils.get_batches(train_x, train_y,
                                                args.batch_size)
            elif args.model_type in ["rnn", "bi_rnn"]:
                train_batch = utils.get_batches(train_x,
                                                train_y,
                                                args.batch_size,
                                                type="rnn")
            epoch_start_time = time.time()
            step_start_time = epoch_start_time
            for idx, batch in enumerate(train_batch):
                reviews, reviews_length, labels = batch
                _, loss_t, accuracy_t, global_step, batch_size, summaries = model.train(
                    sess, reviews, reviews_length, labels, args.keep_prob)

                loss += loss_t * batch_size
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
                summary_writer.add_summary(summaries, global_step)

                if global_step % 50 == 0:
                    print "epoch %d, step %d, loss %f, accuracy %.4f, time %.2fs" % \
                      (epoch, global_step, loss_t, accuracy_t, time.time() - step_start_time)
                    step_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            print "%.2f seconds in this epoch" % (epoch_time)
            print "train loss %f, train accuracy %.4f" % (
                loss / total_reviews, accuracy / total_reviews)

            total_reviews = 0
            accuracy = 0.
            for batch in test_batch:
                reviews, reviews_length, labels = batch
                accuracy_t, batch_size = model.test(sess, reviews,
                                                    reviews_length, labels,
                                                    1.0)
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
            print "accuracy %.4f in %d test reviews" % (
                accuracy / total_reviews, total_reviews)
Example #15
class Trainer:
    def __init__(self, config):
        self.config = config
        self.train_data_loader = None
        self.eval_data_loader = None

        # load the datasets
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = self.train_data_loader.gen_data(
        )
        self.vocab_size = self.train_data_loader.vocab_size
        self.word_vectors = self.train_data_loader.word_vectors
        print(f"train data size: {len(self.train_labels)}")
        print(f"vocab size: {self.vocab_size}")
        self.label_list = [value for key, value in label_to_idx.items()]

        self.eval_inputs, self.eval_labels = self.eval_data_loader.gen_data()

        # initialize the model
        self.model = TextCNN(config=self.config,
                             vocab_size=self.vocab_size,
                             word_vectors=self.word_vectors)

    def load_data(self):
        """加载数据集"""
        self.train_data_loader = TrainData(self.config)
        self.config.test_data = self.config.eval_data  # use the validation set for testing during training
        self.eval_data_loader = TestData(self.config)

    def train(self):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9,
                                    allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            sess.run(tf.global_variables_initializer())  # initialize variables
            current_step = 0

            # create the train/eval summary paths and writer objects
            train_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/train")
            if not os.path.exists(train_summary_path):
                os.makedirs(train_summary_path)
            train_summary_writer = tf.summary.FileWriter(
                train_summary_path, sess.graph)
            eval_summary_path = os.path.join(
                self.config.BASE_DIR, self.config.summary_path + "/eval")
            if not os.path.exists(eval_summary_path):
                os.makedirs(eval_summary_path)
            eval_summary_writer = tf.summary.FileWriter(
                eval_summary_path, sess.graph)

            # Train & Eval Process
            for epoch in range(self.config.epochs):
                print(f"----- Epoch {epoch + 1}/{self.config.epochs} -----")
                for batch in self.train_data_loader.next_batch(
                        self.train_inputs, self.train_labels,
                        self.config.batch_size):
                    summary, loss, predictions = self.model.train(
                        sess, batch, self.config.keep_prob)
                    train_summary_writer.add_summary(summary)
                    if self.config.num_classes == 1:
                        acc = get_binary_metrics(pred_y=predictions.tolist(),
                                                 true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))
                    elif self.config.num_classes > 1:
                        acc = get_multi_metrics(pred_y=predictions.tolist(),
                                                true_y=batch['y'])
                        print("Train step: {}, acc: {:.3f}".format(
                            current_step, acc))

                    current_step += 1

                    if self.eval_data_loader and current_step % self.config.ckeckpoint_every == 0:
                        eval_losses = []
                        eval_accs = []
                        for eval_batch in self.eval_data_loader.next_batch(
                                self.eval_inputs, self.eval_labels,
                                self.config.batch_size):
                            eval_summary, eval_loss, eval_predictions = self.model.eval(
                                sess, eval_batch)
                            eval_summary_writer.add_summary(eval_summary)
                            eval_losses.append(eval_loss)
                            if self.config.num_classes == 1:
                                acc = get_binary_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                            elif self.config.num_classes > 1:
                                acc = get_multi_metrics(
                                    pred_y=eval_predictions.tolist(),
                                    true_y=eval_batch['y'])
                                eval_accs.append(acc)
                        print(
                            f"Eval \tloss: {list_mean(eval_losses)}, acc: {list_mean(eval_accs)}"
                        )

                        if self.config.ckpt_model_path:
                            save_path = os.path.join(
                                self.config.BASE_DIR,
                                self.config.ckpt_model_path)
                            if not os.path.exists(save_path):
                                os.makedirs(save_path)
                            model_save_path = os.path.join(
                                save_path, self.config.model_name)
                            self.model.saver.save(sess,
                                                  model_save_path,
                                                  global_step=current_step)
Example #16
def main(args):
    print "loadding data and labels from dataset"
    train = pd.read_csv(args.train_dir)
    ch_train = pd.read_csv(args.chtrain_dir)
    x_train = train["comment_text"]
    x_chtrain = ch_train["comment_text"]
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    x = []
    x_ch = []
    for line in x_train:
        if len(line) > 0:
            x.append(utils.review_to_wordlist(line.strip()))
    print "loaded %d comments from dataset" % len(x)
    for line in x_chtrain:
        if len(line) > 0:
            x_ch.append(utils.review_to_wordlist_char(line.strip()))
    print "loaded %d comments from dataset" % len(x)
    y = train[target_cols].values

    index2word, word2index = utils.load_vocab(args.vocab_dir)
    index2char, char2index = utils.load_char(args.char_dir)
    x_vector = utils.vectorize(x, word2index, verbose=False)
    x_vector = np.array(x_vector)
    char_vector = utils.vectorize_char(x_ch, char2index, verbose=False)
    char_vector = np.array(char_vector)
    print char_vector[0]

    save_dir = os.path.join(args.save_dir, args.model_type)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]:
        max_step = args.max_step_cnn
        max_size = args.max_size_cnn
        nb_epochs = args.nb_epochs_cnn
    elif args.model_type in [
            "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn"
    ]:
        max_step = args.max_step_rnn
        max_size = args.max_size_rnn
        nb_epochs = args.nb_epochs_rnn

    ex_features = add_features("../data/train.csv")
    nfolds = args.nfolds
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018)
    test_prob = []
    stack_logits = np.zeros((len(x_vector), len(target_cols)))
    for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)):
        x_train, x_eval = x_vector[train_index], x_vector[test_index]
        char_train, char_eval = char_vector[train_index], char_vector[
            test_index]
        y_train, y_eval = y[train_index], y[test_index]
        with tf.Graph().as_default():
            config_proto = utils.get_config_proto()
            sess = tf.Session(config=config_proto)
            if args.model_type == "cnn":
                model = TextCNN(args, "TextCNN")
            elif args.model_type == "cnnfe":
                model = TextCNNFE(args, "TextCNNFE")
            elif args.model_type == "rnn":
                model = TextRNN(args, "TextRNN")
            elif args.model_type == "rnnfe":
                model = TextRNNFE(args, "TextRNNFE")
            elif args.model_type == "rcnn":
                model = TextRCNN(args, "TextRCNN")
            elif args.model_type == "attention":
                model = RNNWithAttention(args, "Attention")
            elif args.model_type == "chrnn":
                model = TextRNNChar(args, "TextRNNChar")
            elif args.model_type == "chcnn":
                model = TextCNNChar(args, "TextCNNChar")
            elif args.model_type == "chcnn2":
                model = TextCNNChar(args, "TextCNNChar2")
            elif args.model_type == "rnnfe2":
                model = TextRNNFE2(args, "TextCNNCharFE2")
            elif args.model_type == "chrnnfe":
                model = TextRNNCharFE(args, "TextCNNCharFE")
            else:
                raise ValueError("Unknown model_type %s" % args.model_type)
            sess.run(tf.global_variables_initializer())

            if args.use_ft:
                pretrain_dir = args.ft_dir
                print "use FastText word vector"
                embedding = utils.load_fasttext(pretrain_dir, index2word)
            if not args.use_ft:
                pretrain_dir = args.glove_dir
                print "use Glove word vector"
                embedding = utils.load_glove(pretrain_dir, index2word)
            sess.run(model.embedding_init,
                     {model.embedding_placeholder: embedding})

            for line in model.tvars:
                print line

            print "training %s model for toxic comments classification" % (
                args.model_type)
            print "%d fold start training" % f
            for epoch in range(1, nb_epochs + 1):
                print "epoch %d start with lr %f" % (
                    epoch,
                    model.learning_rate.eval(session=sess)), "\n", "- " * 50
                loss, total_comments = 0.0, 0
                if args.model_type in ["cnn", "rnn", "rcnn"]:
                    train_batch = utils.get_batches(x_train, y_train,
                                                    args.batch_size,
                                                    args.max_len)
                    valid_batch = utils.get_batches(x_eval, y_eval, max_size,
                                                    args.max_len, False)

                elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                    train_batch = utils.get_batches_with_char(
                        x_train, char_train, y_train, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_char(
                        x_eval, char_eval, y_eval, max_size, args.max_len,
                        False)

                elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                    train_batch = utils.get_batches_with_fe(
                        x_train, y_train, ex_features, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_fe(
                        x_eval, y_eval, ex_features, max_size, args.max_len,
                        False)

                elif args.model_type in ["chrnnfe"]:
                    train_batch = utils.get_batches_with_charfe(
                        x_train, char_train, y_train, ex_features,
                        args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_charfe(
                        x_eval, char_eval, y_eval, ex_features, max_size,
                        args.max_len, False)

                epoch_start_time = time.time()
                step_start_time = epoch_start_time
                for idx, batch in enumerate(train_batch):
                    if args.model_type in ["cnn", "rnn", "rcnn"]:
                        comments, comments_length, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels)

                    elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                        comments, comments_length, chs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels)

                    elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                        comments, comments_length, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels, exs)

                    elif args.model_type in ["chrnnfe"]:
                        comments, comments_length, chs, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels, exs)

                    loss += loss_t * batch_size
                    total_comments += batch_size

                    if global_step % 200 == 0:
                        print "epoch %d step %d loss %f time %.2fs" % (
                            epoch, global_step, loss_t,
                            time.time() - step_start_time)

                    if global_step % 200 == 0:
                        _ = run_valid(valid_batch, model, sess,
                                      args.model_type)
                        # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step)
                        step_start_time = time.time()

                epoch_time = time.time() - epoch_start_time
                sess.run(model.learning_rate_decay_op)
                print "%.2f seconds in this epoch with train loss %f" % (
                    epoch_time, loss / total_comments)

            test_prob.append(run_test(args, model, sess))
            stack_logits[test_index] = run_valid(valid_batch, model, sess,
                                                 args.model_type)

    preds = np.zeros((test_prob[0].shape[0], len(target_cols)))
    for prob in test_prob:
        preds += prob
        print prob[0]
    preds /= len(test_prob)
    print len(test_prob)
    write_predict(stack_logits, args.model_type)
    write_results(preds, args.model_type)
Example #17
# -*- coding: utf-8 -*-
"""
Created on 2020-07-19 01:32
@Author  : Justin Jiang
@Email   : [email protected]
"""

import tensorflow.keras as keras
import numpy as np
from sklearn import metrics
import os

from preprocess import preprocesser
from config import Config
from model import TextCNN

if __name__ == '__main__':
    CNN_model = TextCNN()
    CNN_model.train(3)
    CNN_model.test()
Example #18
def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError:
        print("Input Parameter Error")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # build torchtext field
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)

    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    if (pretrained_model_file is not None) and (pretrained_model_dir
                                                is not None):
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[
        -1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)

    model = TextCNN(embed_num,
                    embed_dim,
                    class_num,
                    filter_num,
                    filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))  # class [<unk>,<pad>,'pos','neg']
    if last_model_path is not None:
        # load model
        logging.info(f'load model from  {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        step = 0
        for batch in iter(train_iter):
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step
        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)

        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy}  speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(
            hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")
        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)

            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
Example #19
criterion = torch.nn.CrossEntropyLoss()

# training
t0 = time()
nb_epoch = opts.nb_epoch
max_patience = opts.max_patience
current_patience = 0
root_model = opts.root_model
if not os.path.exists(root_model):
    os.makedirs(root_model)
path_model = os.path.join(root_model, 'textcnn.model')
best_dev_loss = 1000.
for epoch in range(nb_epoch):
    sys.stdout.write('epoch {0} / {1}: \r'.format(epoch, nb_epoch))
    total_loss, dev_loss = 0., 0.
    text_cnn.train()
    current_count = 0
    for i_batch, sample_batched in enumerate(data_loader_train):
        optimizer.zero_grad()
        data = Variable(sample_batched['data'])
        label = Variable(sample_batched['label'])
        if use_cuda:
            data = data.cuda()
            label = label.cuda()
        target = text_cnn(data)
        loss = criterion(target, label)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
Example #20
    def train(self):
        best_valid_loss = 1e9
        all_valid_loss, all_valid_acc = 0, 0

        # CV loop
        for i in range(self.args.cv_num):
            model = TextCNN(self.vocab_size, self.pad_idx,
                            self.args).to(device)

            # model variations (cf. "rand" is default value)
            if self.args.mode == "static":
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
            elif self.args.mode == "non-static":
                model.static_embedding.weight.data.normal_(0, 1)
                model.static_embedding.weight.data.copy_(self.embeddings)
            elif self.args.mode == "multichannel":
                model.static_embedding.weight.data.copy_(self.embeddings)
                model.static_embedding.weight.requires_grad = False
                model.nonstatic_embedding.weight.data.copy_(self.embeddings)

            optimizer = optim.Adadelta(model.parameters())
            model.train()

            # generate train dataset
            print(f'>>> fold {i + 1} is held out as the test set')
            dataset = self.dataset_list.copy()
            del dataset[i]  # remove testset
            dataset = functools.reduce(
                lambda x, y: x + y,
                dataset)  # Concatenate datasets consecutively.

            data_loader = DataLoader(dataset=dataset,
                                     batch_size=self.args.batch_size,
                                     shuffle=True,
                                     collate_fn=self.collate_fn)

            for epoch in range(self.args.epochs):  # Epoch loop
                pbar = tqdm(data_loader)

                for text, label in pbar:
                    text = text.to(device)
                    label = label.to(device)

                    optimizer.zero_grad()

                    predictions = model(text).squeeze(1)
                    loss = self.criterion(predictions, label)
                    acc = self._binary_accuracy(predictions, label)

                    loss.backward()
                    optimizer.step()

                    # max_norm_scaling
                    eps = 1e-7
                    param = model.fc.weight
                    norm = torch.norm(param)  # l2_norm
                    if norm > self.args.l2_constraint:
                        param.data *= self.args.l2_constraint / (eps + norm)

                    pbar.set_description(
                        f"loss : {loss.item():.4f}, acc : {acc.item():.4f}")

            valid_loss, valid_acc = self.evaluate(model, i)
            all_valid_loss += valid_loss.item()
            all_valid_acc += valid_acc.item()
            print(
                f'valid loss : {valid_loss.item():.3f}, valid acc : {valid_acc.item():.3f}'
            )

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(
                    model.state_dict(),
                    osp.join(self.args.ck_path, f'{self.args.name}_best.pt'))

            if not self.args.cv:
                return

        print()
        print(f'Final loss : {all_valid_loss / self.args.cv_num:.3f}')
        print(f'Final acc : {all_valid_acc / self.args.cv_num:.3f}')