Example #1
from transformers import (
    BertConfig, BertTokenizer, BertForSequenceClassification,
    RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification,
    DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification,
)

# CNN_Text is the project's own CNN-based text classifier, defined elsewhere in the repo.


def build_model(args):
    if args.clf_model.lower() == "cnn":
        # the CNN variant reuses the DistilBERT tokenizer for text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)

    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)

        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # optionally freeze the transformer encoder weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:  # matches "roberta.*" parameter names, which contain "bert"
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)

        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)

        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # optionally freeze the transformer encoder weights (kept disabled here)
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False

    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)

    # expand_class_head is a project-specific hook for multi-head classification
    model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
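A minimal usage sketch for build_model; every value below is a placeholder, and only the attribute names are taken from the function itself:

from argparse import Namespace
import torch

# hypothetical settings; none of these values come from the example above
args = Namespace(
    clf_model="bert",
    model_name_or_path="bert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="mr",
    freeze=False,
    multi_head=1,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
tokenizer, model = build_model(args)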
Example #2
import re

import dill
import torch
from torchtext import data  # legacy torchtext API (torchtext.legacy.data on newer releases)

# MR (the dataset wrapper), CNN_Text and train_model, plus the hyperparameters used below
# (data_path, batch_size, channel_dim, embed_dim, output_dim, kernel_sizes, dropout_rate,
# lr, weight_decay, epochs, device), are defined elsewhere in the project.


def main_train():
    def clean_str(string):
        # normalize punctuation and contractions before whitespace tokenization
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    TEXT.preprocessing = data.Pipeline(clean_str)  # run clean_str on every example during preprocessing
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

    trainset, valset = MR.splits(data_path, fields=[("text", TEXT), ("label", LABEL)])
    TEXT.build_vocab(trainset)

    with open("text.field", 'wb') as f:
        dill.dump(TEXT, f)

    trainiter = data.BucketIterator(trainset, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                    shuffle=True, device=device)

    valiter = data.BucketIterator(valset, batch_size=batch_size, sort_key=lambda x: len(x.text),
                                  shuffle=True, device=device)

    model = CNN_Text(channel_dim, len(TEXT.vocab), embed_dim, output_dim, kernel_sizes, is_static=False,
                     dropout_rate=dropout_rate)
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    train_model(epochs, model, trainiter, valiter, optimizer, criterion)
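The pickled field can later be reloaded so that inference reuses exactly the same vocabulary and preprocessing; a small sketch (the encode helper is hypothetical, not part of the example above):

import dill

# reload the fitted Field saved by main_train()
with open("text.field", "rb") as f:
    TEXT = dill.load(f)

def encode(sentence, device="cpu"):
    # hypothetical helper: clean and numericalize one sentence with the saved vocabulary
    tokens = TEXT.preprocess(sentence)
    return TEXT.process([tokens]).to(device)  # shape (1, seq_len) because batch_first=True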
Example #3
import torch
from flask import Flask
from torchtext import data  # legacy torchtext API

# Args, MR, CNN_Text and predict are project-local helpers; device is defined elsewhere
# (e.g. torch.device("cpu")).
app = Flask(__name__)

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_data, dev_data = MR.splits(text_field, label_field)
text_field.build_vocab(train_data, dev_data)
label_field.build_vocab(train_data, dev_data)

args = Args()
args.dropout = 0.5
args.max_norm = 3.0

args.embed_dim = 128
args.kernel_num = 100
args.kernel_sizes = '3,4,5'
args.static = False
args.snapshot = 'snapshot/best.pt'
args.embed_num = len(text_field.vocab)
args.class_num = len(label_field.vocab) - 1  # drop the <unk> entry torchtext adds to the label vocab
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

model = CNN_Text(args)
model.load_state_dict(torch.load(args.snapshot, map_location='cpu'))
model = model.to(device)


@app.route('/cls/<text>')
def classify_text(text):
    app.logger.warning(text)
    result, conf = predict(text, model, text_field, label_field, device)
    app.logger.warning(conf)
    return result
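To serve this route locally, something along these lines would start the development server (host and port are assumptions):

if __name__ == "__main__":
    # GET /cls/<text> then returns the predicted label for <text>
    app.run(host="0.0.0.0", port=5000)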
Example #4
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# nsml (NAVER's experiment platform), CNN_Text, sort_batch and ic_metric are project or
# platform imports; device is a module-level torch.device defined elsewhere.


class Trainer:
    def __init__(self, config, n_gpu, vocab, train_loader=None, val_loader=None):
        self.config = config
        self.vocab = vocab
        self.n_gpu = n_gpu
        self.train_loader = train_loader
        self.val_loader = val_loader

        # Build model
        vocab_size = self.vocab.vocab_size()

        self.model = CNN_Text(self.config, vocab_size, self.config.n_label)
        self.model.to(device)

        if self.n_gpu > 1:
            self.model = nn.DataParallel(self.model)

        # Build optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr, weight_decay=0.0005)

        # Build criterion
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        best_f1 = 0.0
        best_acc = 0.0
        global_step = 0
        batch_f1 = []
        batch_acc = []

        for epoch in range(self.config.num_epoch):
            batch_loss = []

            for step, batch in enumerate(self.train_loader):
                self.model.train()
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)  # project helper that reorders the batch tensors (e.g. by length)
                input_ids, input_lengths, labels = batch

                outputs = self.model(input_ids)
                
                loss = self.criterion(outputs['logits'].view(-1, self.config.n_label), labels.view(-1))

                f1, acc = ic_metric(labels.cpu(), outputs['predicted_intents'].cpu())

                if self.n_gpu > 1:
                    loss = loss.mean()

                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

                global_step += 1
                batch_loss.append(loss.float().item())
                batch_f1.append(f1)
                batch_acc.append(acc)

                if (global_step == 1) or (global_step % self.config.log_interval == 0):
                    mean_loss = np.mean(batch_loss)
                    mean_f1 = np.mean(batch_f1)
                    mean_acc = np.mean(batch_acc)
                    batch_loss = []
                    nsml.report(summary=True, scope=locals(), epoch=epoch, train_loss=mean_loss, step=global_step)

                if (global_step > 0) and (global_step % self.config.val_interval == 0):
                    val_loss, val_f1, val_acc = self.evaluation()
                    nsml.report(summary=True, scope=locals(), epoch=epoch, val_loss=val_loss,
                                val_f1=val_f1, val_acc=val_acc, step=global_step)

                    if val_f1 > best_f1:
                        best_f1 = val_f1
                        best_acc = val_acc
                        nsml.save(global_step)


    def evaluation(self):
        self.model.eval()
        total_loss = []
        preds = []
        targets = []
        with torch.no_grad():
            for step, batch in enumerate(self.val_loader):
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)
                input_ids, input_lengths, labels = batch

                outputs = self.model(input_ids)

                loss = self.criterion(outputs['logits'].view(-1, self.config.n_label), labels.view(-1))

                pred = outputs['predicted_intents'].squeeze(-1).cpu().numpy().tolist()
                target = labels.cpu().numpy().tolist()

                preds.extend(pred)
                targets.extend(target)
                total_loss.append(loss.float().item())

        mean_loss = np.mean(total_loss)
        mean_f1, mean_acc = ic_metric(targets, preds)
        return mean_loss, mean_f1, mean_acc
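A sketch of how this Trainer might be wired up; the config fields mirror the attributes read above, while vocab and the two loaders would come from the project's own data pipeline (everything here is an assumption):

from types import SimpleNamespace

# hypothetical configuration; only the attribute names are taken from Trainer above
config = SimpleNamespace(
    n_label=7,
    lr=1e-3,
    num_epoch=10,
    log_interval=50,
    val_interval=200,
)

# vocab (exposing vocab_size()) and the loaders yielding (input_ids, lengths, labels)
# batches are produced elsewhere in the project and are not shown here
trainer = Trainer(config, n_gpu=1, vocab=vocab,
                  train_loader=train_loader, val_loader=val_loader)
trainer.train()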