## huggingface
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, DistilBertForSequenceClassification
import torch

distil_bert = 'distilbert-base-cased'
tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert, do_lower_case=False,
                                                    add_special_tokens=True,
                                                    max_length=256, pad_to_max_length=True)
token_clf = DistilBertForTokenClassification.from_pretrained(distil_bert)
sequence_clf = DistilBertForSequenceClassification.from_pretrained(distil_bert)

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = token_clf(input_ids)

token_logits = outputs[0]  # per-token logits from the (still untrained) classification head

test = db.sample(n=10)  # `db` is assumed to be a pandas DataFrame with a `text` column

token_clf(**tokenizer.encode_plus(sentence, return_tensors='pt'))
tokenizer.batch_encode_plus(test.text.to_list())

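# A minimal sketch of running the sequence classifier defined above; its head
# is freshly initialised from the base checkpoint, so the logits are untrained
# and only illustrate the call pattern.
encoded = tokenizer.encode_plus(sentence, return_tensors='pt')
with torch.no_grad():
    logits = sequence_clf(**encoded)[0]   # shape: (1, num_labels)
predicted_class = int(logits.argmax(dim=-1))
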
## spacy
def get_sequences_with_2_orgs(text, dist=150):
    '''Uses spaCy NER to identify organisations. If two organisations are detected
    within dist tokens of each other, extracts the intervening sequence.
    '''
    # Apply the model
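    # A minimal sketch of how this body might continue; the `nlp` pipeline
    # (e.g. spacy.load('en_core_web_sm')) is an assumption, since the example
    # stops at this point.
    doc = nlp(text)
    orgs = [ent for ent in doc.ents if ent.label_ == 'ORG']
    sequences = []
    for first, second in zip(orgs, orgs[1:]):
        if second.start - first.end <= dist:
            sequences.append(doc[first.start:second.end].text)
    return sequences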
## Example 2
    def __init__(self, n_classes):

        super(SentimentClassifier, self).__init__()

        self.bert = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3)
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

import random


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize,
                                  batched=True,
                                  batch_size=len(train_dataset))
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])  # column names assumed from the tokenizer output
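
# A hedged sketch of how this dataset could then be handed to the Trainer
# imported above; the TrainingArguments values below are assumptions.
training_args = TrainingArguments(
    output_dir='./results',              # assumed output directory
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_steps=50,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()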
## Example 4
    alpha = 0.1  # smoothing parameters for true label
    # /PARAMETERS

    # create log file
    data_folder = '../../data/from-figure-eight/balanced-test-data/tobert/'
    res_path = '../../res/'
    res_path += logfile_name
    with open(res_path, 'w') as f:
        c = 'epoch, iter, loss_train, loss_val, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val'
        f.write(c + '\n')

    # configure DistilBERT model
    config = DistilBertConfig.from_pretrained('distilbert-base-cased')
    config.num_labels = num_labels
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    model = DistilBertForSequenceClassification(config)
    # load model to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()

    # load datasets
    train_dataset = pd.read_csv(data_folder + train_file)
    val_dataset = pd.read_csv(data_folder + val_file)
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("VAL Dataset: {}".format(val_dataset.shape))
    training_set = DataLoaderSmoothing(train_dataset, alpha)
    validating_set = DataLoaderHard(val_dataset)

    # initialize batch sampler
    target = train_dataset.crowd_label.values
    print('target train 0/1: {}/{}'.format(len(np.where(target == 0)[0]),
                                            len(np.where(target == 1)[0])))
## Example 5
# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

model = DistilBertForSequenceClassification.from_pretrained('/app/incivility_project/models/distilbert_5000_03-06-20')
#config = BertConfig.from_json_file('../models/bert_classifier_2epoch_256size/config.json')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

model.to(device)  # use the GPU if one was found above, otherwise stay on the CPU

#load comments and labels from the input tsv
comments, labels = load_data.get_data(sys.argv[1])

#encode inputs using BERT tokenizer
input_ids = []

for comment in comments:
    encoded_comment = tokenizer.encode(comment, add_special_tokens = True, max_length=256,pad_to_max_length=True)
    input_ids.append(encoded_comment)
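
# A hedged sketch of how these encoded comments might then be scored in
# batches with the fine-tuned model; the batch size is an assumption.
input_ids = torch.tensor(input_ids)
attention_masks = (input_ids != tokenizer.pad_token_id).long()

model.eval()
predictions = []
batch_size = 32
with torch.no_grad():
    for i in range(0, len(input_ids), batch_size):
        batch_ids = input_ids[i:i + batch_size].to(device)
        batch_masks = attention_masks[i:i + batch_size].to(device)
        logits = model(batch_ids, attention_mask=batch_masks)[0]
        predictions.extend(logits.argmax(dim=-1).cpu().tolist())
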
from pathlib import Path

from transformers import (
    DistilBertForSequenceClassification,
    AutoTokenizer,
)
from nlp_model.labels import technical_competency_labels

# TODO: Auto set these up if they don't already exist.
cache_dir = (str(Path(__file__).parent) + "/cache").replace("\\", "/")
saved_model_dir = (str(Path(__file__).parent) + "/results").replace("\\", "/")

max_tokens_count = 512

# DistilBert was chosen based on its speed and lightweight footprint.
# Future work could explore different models.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    cache_dir=cache_dir,
    model_max_length=max_tokens_count,  # cap tokenised sequences at 512 tokens
)

# Change "distilbert-base-uncased" to save_model_dir after running the training script once.
# Unfortunately cannot upload the pre-trained model to git due to the file size.
# Will find a better workaround later.
# TODO: experiment with linear regression output (eg estimated years of experience) over ALL questions as input.
# Using SequenceClassification over ONE question as input for now because it's well supported out of box.
technical_competency_classifier = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(technical_competency_labels),
    cache_dir=cache_dir,
)
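
# A minimal usage sketch; the answer-text argument is hypothetical, and with
# the un-finetuned base weights above the prediction is not yet meaningful.
# technical_competency_labels is assumed to be an indexable sequence of names.
def classify_answer(answer_text: str) -> str:
    inputs = tokenizer(answer_text, truncation=True,
                       max_length=max_tokens_count, return_tensors="pt")
    logits = technical_competency_classifier(**inputs)[0]
    return technical_competency_labels[int(logits.argmax(dim=-1))]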
## Example 7
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased-finetuned-sst-2-english')
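
    def predict(self, text):
        # Hedged sketch of a prediction helper such a wrapper might expose;
        # label names follow the SST-2 head convention (0 = negative, 1 = positive).
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True)
        logits = self.model(**inputs)[0]
        return 'positive' if int(logits.argmax(dim=-1)) == 1 else 'negative'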
## Example 8
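
# export_onnx_model is called below but not defined in this fragment; a hedged
# sketch of such a helper built on torch.onnx.export.
import torch


def export_onnx_model(args, model, output_path):
    model.eval()
    dummy = torch.ones(1, args.max_len, dtype=torch.long)   # dummy input_ids / attention_mask
    with torch.no_grad():
        torch.onnx.export(
            model,
            (dummy, dummy),                                  # (input_ids, attention_mask)
            output_path,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits'],
            dynamic_axes={'input_ids': {0: 'batch'}, 'attention_mask': {0: 'batch'}},
            opset_version=11,
        )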

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Export bert onnx model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--input_dir',
        type=str,
        help='input_dir of bert model, must contain config.json')
    parser.add_argument('--task_name',
                        type=str,
                        choices=["MRPC", "MNLI"],
                        help='tasks names of bert model')
    parser.add_argument('--max_len',
                        type=int,
                        default=128,
                        help='Maximum length of the sentence pairs')
    parser.add_argument('--do_lower_case',
                        type=bool,
                        default=True,
                        help='whether lower the tokenizer')
    parser.add_argument('--output_model',
                        type=str,
                        default='bert.onnx',
                        help='path to exported model file')
    args = parser.parse_args()

    model = DistilBertForSequenceClassification.from_pretrained(args.input_dir)
    export_onnx_model(args, model, args.output_model)
## Example 9
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast

parser = argparse.ArgumentParser(description='Sentiment Analysis')
parser.add_argument('text', help='tweet text')
args = parser.parse_args()

# Sentiment classes
LABELS = {'LABEL_0': 'NEGATIVE', 'LABEL_1': 'POSITIVE'}

model = pipeline(
    'sentiment-analysis',
    model=DistilBertForSequenceClassification.from_pretrained("model"),
    tokenizer=DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased'))
result = model(args.text)
sentiment = LABELS[result[0].get('label')]
score = result[0].get('score')

if __name__ == "__main__":
    print(
        '\n' +
        f'The sentiment for the text `{args.text}` is {sentiment} with a probability of {round(score, 5)}.'
    )
## Example 10
def main(proj_root_dir, epochs: int = 3):
    print("Begin fine-tune for IMDB sentiment")
    torch.manual_seed(1)
    np.random.seed(1)

    # Load raw IMDB train data into memory
    print("\nLoading IMDB train data subset into memory...")
    train_reviews, train_labels = read_imdb(f"{proj_root_dir}/train")

    # consider creating validation set here
    #
    #

    # Tokenize the raw data reviews text
    print("\nTokenizing training text...")
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    train_tokens = tokenizer(train_reviews, truncation=True,
                             padding=True)  # token IDs and mask

    # Load tokenized text and labels into PyTorch Dataset
    print("\nLoading tokenized text into Pytorch Dataset")
    train_dataset = IMDbDataset(train_tokens, train_labels)

    # Load (possibly cached) pretrained HF model
    print("\nLoading pre-trained DistilBERT model ")
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased")
    model.to(device)
    model.train()  # set at training mode

    # Fine-tune / train model using standard PyTorch
    print("Loading Dataset with batch_size: 10 ")
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

    print(f"\nFine-tuning the model. It's now {datetime.now()}")
    optim = AdamW(model.parameters(), lr=5.0e-5)  # AdamW applies decoupled weight decay by default
    for epoch in range(epochs):
        epoch_loss = 0.0
        for (batch_idx, batch) in enumerate(train_loader):
            optim.zero_grad()

            input_ids = batch["input_ids"]  # tensor
            attn_mask = batch["attention_mask"]  # tensor
            labels = batch["labels"]  # tensor

            outputs = model(input_ids, attention_mask=attn_mask, labels=labels)
            loss = outputs[0]
            epoch_loss += loss.item()  # accumulate batch loss
            loss.backward()
            optim.step()
            if batch_idx % 20 == 0:
                print("batch_idx: %5d, curr batch loss: %0.4f. It is now: %s" %
                      (batch_idx, loss.item(), datetime.now()))
        print("End of epoch no. %4d, epoch loss = %0.4f. Now is %s" %
              (epoch, epoch_loss, datetime.now()))
    print("Training is complete")

    # 6. save trained model weights and biases
    print("\nSaving tuned model state")
    model.eval()
    torch.save(model.state_dict(),
               f"{proj_root_dir}/models/imdb_state.pt")  # just state

    print("\nEnd pf demo")
## Example 11
                                 num_replicas=hvd.size(),
                                 rank=hvd.rank())
val_dataloader = DataLoader(val_data,
                            sampler=val_sampler,
                            batch_size=batch_size)

gpu_available = torch.cuda.is_available()

if gpu_available:
    torch.cuda.set_device(hvd.local_rank())

num_labels = len(set(labels))

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False)

lr_scaler = hvd.size()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay_rate': 0.01
}, {
    'params': [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ],
    'weight_decay_rate': 0.0
}]
## Example 12
    "distilBertBigram": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_BIGRAM_TOKENIZER, is_split_into_words=True),
    "distilBertPOS": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_POS_TOKENIZER, is_split_into_words=True),
    "distilBertEmbed": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_EMBED_TOKENIZER,  is_split_into_words=True)
}


def getDistilbertEmbeds(tokenizerName):
    model = all_models["distilBert"](64)
    tokenizer = all_tokenizers[tokenizerName]()
    model.resize_token_embeddings(len(tokenizer))
    return model


all_models = {
    "T5": lambda _: T5ForConditionalGeneration.from_pretrained(
        "t5-small", num_labels=10
    ),
    "distilBert": lambda _: DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=10,
    ),
    "distilBertBig": lambda _: DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=10, n_layers=8),
    "distilBertSmall": lambda _: DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=10, n_layers=2, output_attentions=True, n_heads=4),
    "distilBertEmbed": lambda _: getDistilbertEmbeds("distilBertEmbed"),
    "distilBertBigram": lambda _: getDistilbertEmbeds("distilBertBigram"),
    "distilBertPOS": lambda _: getDistilbertEmbeds("distilBertPOS"),
    "lstm": lambda bs: newsLSTM(bs),
    "lstmAttention": lambda bs: lstmAttention(bs),
    "lstmAttentionBigram": lambda bs: lstmAttention(bs, useBigram=True),
    "lstmBigram": lambda bs: newsLSTM(bs, useBigram=True)
}
## Example 13
def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir):
    batch_size = 16

    #x_train = _read_data('../data/train_bal.csv', nrows_train)
    #x_dev = _read_data('../data/dev_bal.csv', nrows_dev)

    #train_data = list( zip( x_train['comment_text'].values, x_train['target'].values  ))

    #train_dataloader = DataLoader(  train_data,
    #                            collate_fn=my_collate,
    #                            batch_size=batch_size , shuffle=True,  )
    # #

    #dev_data = list( zip( x_dev['comment_text'].values, x_dev['target'].values  ))

    #dev_dataloader = DataLoader(  dev_data,
    #                            collate_fn=my_collate,
    #                            batch_size=batch_size, shuffle=False,  )

    train_dataloader = get_data_loader_bal(pos_train_file,
                                           neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train,
                                           mode='train')
    dev_dataloader = get_data_loader_bal(pos_dev_file,
                                         neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev')

    device = get_device()

    bert_hidden_states = 4
    config = DistilBertConfig()
    config.output_hidden_states = True

    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",   # the 6-layer DistilBERT model with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf

    stats_vec = []
    for epoch in range(epochs):
        stats = train_epoch(model, train_dataloader, dev_dataloader, optimizer,
                            scheduler)
        print(stats)

        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({
                'epoch': epoch,
                'model': model,
                'stats': stats,
            }, f)

        stats_vec.append(stats)

    stats_vec = pd.DataFrame(stats_vec)

    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({
        'epoch': epoch,
        'model': model,
        'stats': stats,
    }, f)

    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')
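
# get_device is not shown in this fragment; a minimal sketch of the helper it
# is assumed to wrap.
def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')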
## Example 14
# model = TFAutoModel.from_pretrained(SAVE_DIR, from_pt=True)
# Load a TensorFlow model in PyTorch:
# model = AutoModel.from_pretrained(SAVE_DIR, from_tf=True)

# %%
# The model can also be asked to return all hidden states and attention weights
outputs = model(**inputs, output_hidden_states=True, output_attentions=True)
hidden_states, attentions = outputs[-2:]

# %%
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Since the model architecture is known, instantiate the matching model class directly; this has the same effect as using `AutoModel` above
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# %%
# Customizing the model
# Each model architecture has a corresponding configuration class (e.g. Customizing the model -> DistilBertConfig)
# The model can be altered by changing configuration parameters (hidden dimension, dropout rate, etc.)

from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification

# %%
# If core settings are changed (e.g. the hidden dimension), the pretrained weights cannot be reused and the model must be trained from scratch
config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification(config)

# %%
## Example 15
    def __init__(self, config):
        # self.name, self.num_classes, epochs, batchs

        self.Configs = config
        self.num_classes = len(config.label_list)

        self.train_logits = []
        self.validation_logits = []
        self.test_logits = []

        self.train_texts = []
        self.train_labels = []
        self.validation_texts = []
        self.validation_labels = []
        self.test_texts = []
        self.test_labels = []

        train = pd.read_csv(os.path.join(self.Configs.data_dir, 'train.csv'))

        try:
            dev = pd.read_csv(os.path.join(self.Configs.data_dir, 'dev.csv'))

        except:
            print('Validation disabled.')
        test = pd.read_csv(os.path.join(self.Configs.data_dir, 'test.csv'))

        self.train_texts = train['text'].tolist()

        self.train_labels = train['label'].tolist()

        try:
            self.validation_texts = dev['text'].tolist()
            self.validation_labels = dev['label'].tolist()

        except:
            pass
        self.test_texts = test['text'].tolist()

        for i in range(len(self.test_texts)):
            self.test_labels.append(0)

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")

        if self.Configs.model_name == 'bert':
            self.model = BertForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = BertTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)

        if self.Configs.model_name == 'albert':
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = AlbertTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)

        if self.Configs.model_name == 'distilbert':
            self.model = DistilBertForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = DistilBertTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)

        if self.Configs.model_name == 'roberta':
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.Configs.pretrained_model_dir, num_labels=self.num_classes)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                self.Configs.pretrained_model_dir)

        if torch.cuda.is_available():
            self.model.cuda()
## Example 16
# -*- coding: utf-8 -*-
# @Time    : 10/12/19 5:44 PM
# @Author  : hujunchao
# @Email   : [email protected]
# @File    : text_classify_using_distil_bert.py
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig, DistilBertForSequenceClassification
import torch
from download_save import save_path_model, save_path_tokenizer

tokenizer = DistilBertTokenizer.from_pretrained(save_path_tokenizer)
model = DistilBertForSequenceClassification.from_pretrained(save_path_model)

# encoded = tokenizer.encode('hello world, my name is Tom')
# encoded = torch.tensor(encoded).unsqueeze(dim=0)
encoded = torch.randint(5000, size=[32, 512])  # a random batch of 32 sequences of 512 token ids
labels = torch.randint(2, size=[32])           # one class label (0 or 1) per example
result = model(encoded, labels=labels)
print(result[0])                               # the classification loss

## Example 17
def main():
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert':
        args.bert_model = args.model_loc

    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "goc": GOCProcessor
    }

    num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "mednli": 3, "goc": 2}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    #if not os.path.exists(args.output_dir):
    #    os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = DistilBertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(train[i].text_a, train[i].text_b, train[i].label)
           for i in range(3)])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)])

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = DistilBertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=float(num_train_optimization_steps) *
            args.warmup_proportion,
            num_training_steps=num_train_optimization_steps)
        #optimizer = BertAdam(optimizer_grouped_parameters,
        #                     lr=args.learning_rate,
        #                     warmup=args.warmup_proportion,
        #                     t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids=input_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                loss, logits = outputs[:2]

                #print(loss[0].shape)
                #print(loss[1].shape)
                #print(loss[2].shape)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Saving checkpoint
            save_checkpoint(model, args.output_dir,
                            "epoch_%d_checkpoint.pth" % epoch_num)

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        #config = DistilBertConfig(output_config_file)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.output_dir)  #, num_labels=num_labels)
        #model.load_state_dict(torch.load(output_model_file))
    else:
        model = DistilBertForSequenceClassification.from_pretrained(
            args.bert_model)  #, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                tmp_eval_loss, logits = outputs[:2]
            # logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features],
                                     dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                #tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
                outputs = model(input_ids=input_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                tmp_test_loss, logits = outputs[:2]
                #logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_accuracy = accuracy(logits, label_ids)

            test_loss += tmp_test_loss.mean().item()
            test_accuracy += tmp_test_accuracy

            nb_test_examples += input_ids.size(0)
            nb_test_steps += 1

        test_loss = test_loss / nb_test_steps
        test_accuracy = test_accuracy / nb_test_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
## Example 18
    def __init__(self):
        super(DistilBertModelTest, self).__init__()
        config = DistilBertConfig.from_pretrained('models/config.json')
        self.distilbert = DistilBertForSequenceClassification(
            config)  # /bert_pretrain/
        self.device = torch.device("cuda")
## Example 19
                    default=512,
                    help='maximum length handled by the model')

args = parser.parse_args()

usecfg = False
if usecfg:
    from transformers import (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    )
    config = DistilBertConfig.from_pretrained(args.model_name,
                                              finetuning_task='sentiment3',
                                              num_labels=3)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model_name, config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
else:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

model.to("cpu")
model.eval()

classes = ["0", "1", "2"]

texts = ["I hate you", "I love you", "Isomorphic protein matrices"]
## Example 20
def main():
    """
    main function for conducting Subtask C. Parameters are parsed with argparse.
    Language model should be suitable for German e.g.:
        'bert-base-multilingual-uncased', 
        'bert-base-multilingual-cased',              
        'bert-base-german-cased', 
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(
        description=
        'Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model',
                        type=str,
                        default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs',
                        type=int,
                        default=4,
                        help='Number of epochs for training.')
    parser.add_argument('--lr',
                        type=float,
                        default=5e-5,
                        help='The learning rate.')
    parser.add_argument('--max_len',
                        type=int,
                        default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Your train set batch size.')
    parser.add_argument('--df_path',
                        type=str,
                        default='./data/',
                        help='The data directory.')
    parser.add_argument('--train_data',
                        type=str,
                        default='train_df_cat.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data',
                        type=str,
                        default='dev_df_cat.tsv',
                        help='The filename of the input development data.')
    parser.add_argument(
        '--test_data1',
        type=str,
        default='test_syn_df_cat.tsv',
        help='The filename of the first input test data (synchronic).')
    parser.add_argument(
        '--test_data2',
        type=str,
        default='test_dia_df_cat.tsv',
        help='The filename of the second input test data (diachronic).')
    parser.add_argument(
        '--output_path',
        type=str,
        default='./output/subtaskC/',
        help='The output directory of the model and predictions.')
    parser.add_argument("--train",
                        default=True,
                        action="store_true",
                        help="Flag for training.")
    parser.add_argument("--save_prediction",
                        default=False,
                        action="store_true",
                        help="Flag for saving predictions.")
    parser.add_argument("--save_cr",
                        default=False,
                        action="store_true",
                        help="Flag for saving confusion matrix.")
    parser.add_argument("--exclude_general",
                        default=False,
                        action="store_true",
                        help="Flag for excluding category Allgemein.")
    parser.add_argument("--exclude_neutral",
                        default=False,
                        action="store_true",
                        help="Flag for excluding neutral polarity.")
    parser.add_argument("--exclude_general_neutral",
                        default=False,
                        action="store_true",
                        help="Flag for excluding category Allgemein:neutral.")
    args = parser.parse_args()
    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')

    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True

    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model,
                                                  do_lower_case=lower_case,
                                                  max_length=args.max_len)

    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.lang_model, do_lower_case=lower_case, max_length=args.max_len)

    # get training features
    cats = train_df.columns[5:]
    end = "full"
    # exclude categories if required
    if (args.exclude_general):
        cats = [i for i in list(cats) if "Allgemein" not in i]
        end = "excl_gen"
    if (args.exclude_neutral):
        cats = [i for i in list(cats) if "neutral" not in i]
        end = "excl_neu"
    if (args.exclude_general_neutral):
        cats = [i for i in list(cats) if "Allgemein:neutral" not in i]
        end = "excl_genneu"

    num_labels = len(list(cats))

    # create one hot labels
    train_df['one_hot_labels'] = list(train_df[list(cats)].values)
    dev_df['one_hot_labels'] = list(dev_df[list(cats)].values)
    test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values)
    test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values)

    # retrieve sentences and labels
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = list(df.one_hot_labels.values)

    sentences_syn = test_syn_df.text.values
    labels_syn = list(test_syn_df.one_hot_labels.values)

    sentences_dia = test_dia_df.text.values
    labels_dia = list(test_dia_df.one_hot_labels.values)

    print("number of categories:", len(list(cats)))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [
        tokenizer.encode(sent,
                         add_special_tokens=True,
                         truncation=True,
                         max_length=args.max_len) for sent in sentences
    ]
    input_ids = pad_sequences(input_ids,
                              maxlen=args.max_len,
                              dtype="long",
                              value=0.0,
                              truncating="post",
                              padding="post")
    # Create attention masks
    attention_masks = [[int(token_id > 0) for token_id in sent]
                       for sent in input_ids]

    # synchronic test data
    input_ids_syn = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True)
        for sent in sentences_syn
    ]
    input_ids_syn = pad_sequences(input_ids_syn,
                                  maxlen=args.max_len,
                                  dtype="long",
                                  value=0.0,
                                  truncating="post",
                                  padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent]
                           for sent in input_ids_syn]

    # diachronic test data
    input_ids_dia = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True)
        for sent in sentences_dia
    ]
    input_ids_dia = pad_sequences(input_ids_dia,
                                  maxlen=args.max_len,
                                  dtype="long",
                                  value=0.0,
                                  truncating="post",
                                  padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent]
                           for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensor
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_syn_labels = torch.tensor(labels_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)
    test_dia_labels = torch.tensor(labels_dia)

    # Create the DataLoader
    train_dataloader = create_dataloader(train_inputs,
                                         train_masks,
                                         train_labels,
                                         args.batch_size,
                                         train=True)

    dev_dataloader = create_dataloader(dev_inputs,
                                       dev_masks,
                                       dev_labels,
                                       args.batch_size,
                                       train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs,
                                            test_syn_masks,
                                            test_syn_labels,
                                            args.batch_size,
                                            train=False)

    test_dia_dataloader = create_dataloader(test_dia_inputs,
                                            test_dia_masks,
                                            test_dia_labels,
                                            args.batch_size,
                                            train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model,
                                                num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)

        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model,
                                                      num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        model.cuda()

        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:",
              args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train_multilabel(
                train_dataloader=train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler,
                num_labels=num_labels)
            # EVALUATION: TRAIN SET
            pred_bools_train, true_bools_train, f1_train = eval_multilabel(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.3f" % (f1_train))

            # EVALUATION: DEV SET
            pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.3f" % (f1_dev))

        print("  Training and validation took in total: {:}".format(
            format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))

        # classification report
        clf_report_syn = classification_report(true_bools_syn,
                                               pred_bools_syn,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_syn)

        # EVALUATION: TEST DIA SET
        pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))

        # classification report
        clf_report_dia = classification_report(true_bools_dia,
                                               pred_bools_dia,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_dia)

        if args.save_cr:
            pickle.dump(
                clf_report_syn,
                open(
                    args.output_path + 'clf_report_' + args.lang_model +
                    '_test_syn_' + str(num_labels) + end + '.txt', 'wb'))
            pickle.dump(
                clf_report_dia,
                open(
                    args.output_path + 'clf_report_' + args.lang_model +
                    '_test_dia_' + str(num_labels) + end + '.txt', 'wb'))

        if args.save_prediction:
            test_syn_df["category_pred"] = pred_bools_syn
            test_dia_df["category_pred"] = pred_bools_dia
            test_syn_df.category_pred.to_csv(args.output_path +
                                             args.lang_model + '_test_syn_' +
                                             str(num_labels) + end + ".tsv",
                                             sep="\t",
                                             index=False,
                                             header=True,
                                             encoding="utf-8-sig")
            test_dia_df.category_pred.to_csv(args.output_path +
                                             args.lang_model + '_test_dia_' +
                                             str(num_labels) + end + ".tsv",
                                             sep="\t",
                                             index=False,
                                             header=True,
                                             encoding="utf-8-sig")
Exemplo n.º 21
0
def train_optim_2(model, epochs, log_frequency, device, learning_rate):
  
  resnet18=models.resnet18(pretrained=True)
  resnet18.fc=Identity()
  resnet18.to(device)
  
  distilbert=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
  token = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
  distilbert.classifier=Identity()
  distilbert.to(device)
      
  model.to(device) # we make sure the model is on the proper device

  # Multiclass classification setting, we use cross-entropy
  # note that this implementation requires the logits as input 
  # logits: values prior softmax transformation 
  loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  
  for t in range(epochs):

      model.train() # we specify that we are training the model

      # At each epoch, the training set will be processed as a set of batches
      for batch_id, batch in enumerate(train_set):

        images, question, labels = batch

        question = token(question, return_tensors="pt", truncation=True, padding=True)

        # we put the data on the same device
        images, question, labels = images.to(device), question.to(device), labels.to(device)

        representation_image = resnet18(images)  # image feature vector of size 512
        output_distil = distilbert(**question)
        representation_texte = output_distil.logits  # text feature vector of size 768 (classifier head replaced by Identity)

        X = torch.cat((representation_image, representation_texte), dim=1)
        
        y_pred = model(X) # forward pass output=logits

        loss = loss_fn(y_pred, labels)

        if batch_id % log_frequency == 0:
            print("epoch: {:03d}, batch: {:03d}, loss: {:.3f} ".format(t+1, batch_id+1, loss.item()))

        optimizer.zero_grad() # clear the gradients before the backward pass
        loss.backward()       # compute the gradients

        optimizer.step() # update the model parameters using the gradient

      # Model evaluation after each epoch, computing the accuracy on the held-out set
      model.eval()
      total = 0
      correct = 0
      for batch_id, batch in enumerate(test_set):
        images, question, labels = batch
        question = token(question, return_tensors="pt", truncation=True, padding=True)
        images, question, labels = images.to(device), question.to(device), labels.to(device)

        representation_image = resnet18(images)  # image feature vector of size 512
        output_distil = distilbert(**question)
        representation_texte = output_distil.logits  # text feature vector of size 768 (classifier head replaced by Identity)

        X = torch.cat((representation_image, representation_texte), dim=1)
        y_pred = model(X) # forward computes the logits
        sf_y_pred = torch.nn.Softmax(dim=1)(y_pred) # softmax to obtain the probability distribution
        _, predicted = torch.max(sf_y_pred , 1)     # decision rule, we select the max
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
      print("[validation] accuracy: {:.3f}%\n".format(100 * correct / total))     
  return
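# Note: train_optim_2 above expects a fusion head `model` that consumes the concatenation of the
# 512-dimensional ResNet-18 image representation and the 768-dimensional DistilBERT text
# representation. The class below is only a minimal sketch of such a head, assuming a small
# two-layer MLP; the hidden size and the number of answer classes are assumptions, not part of
# the original snippet.
import torch.nn as nn

class FusionClassifier(nn.Module):
    def __init__(self, n_classes: int, hidden_dim: int = 512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(512 + 768, hidden_dim),  # 512 image features + 768 text features
            nn.ReLU(),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, x):
        # returns raw logits, as expected by torch.nn.CrossEntropyLoss
        return self.net(x)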
Exemplo n.º 22
0
    def __init__(self, args: dict, doLower: bool, train_batchSize: int, testval_batchSize:int, learningRate: float, doLearningRateScheduler: bool, target_columns: list, smartBatching: bool = True, mixedPrecision: bool = True, labelSentences: dict = None, max_label_len= None, model= None, optimizer= None, loss_fct= None, device= "cpu"):
        self.args = args
        self.labelSentences = labelSentences
        self.tokenizer = None
        self.device = device
        self.train_batchSize = train_batchSize
        self.testval_batchSize = testval_batchSize
        self.learningRate = learningRate
        self.optimizer = optimizer
        self.doLearningRateScheduler = doLearningRateScheduler
        self.learningRateScheduler = None
        self.smartBatching = smartBatching
        self.mixedPrecision = mixedPrecision
        self.max_label_len = max_label_len
        self.target_columns = target_columns
        self.input_multiclass_as_one = False


        if self.args["model"] in ["distilbert", "bert", "xlnet", "lstm", "roberta", "distilroberta"]:
            # define loss function
            if loss_fct:
                self.loss_fct = loss_fct
            else:
                self.loss_fct = BCEWithLogitsLoss()

            # define how many labels need to be classified
            if self.args["binaryClassification"]:
                self.num_labels = 1
            else:
                self.num_labels = len(self.labelSentences.keys())

        # build model from the model_str
        if self.args["model"] == "distilbert":
            if doLower:
                self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
            else:
                self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

        elif self.args["model"] == "bert":
            if doLower:
                self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            else:
                self.model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

        elif self.args["model"] == "xlnet":
            if doLower:
                # no uncased XLNet checkpoint exists, so the cased version is used even when doLower is set
                self.model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
            else:
                self.model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        elif self.args["model"] == "roberta":
            if doLower:
                self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            else:
                self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        elif self.args["model"] == "distilroberta":
            if doLower:
                self.model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
            else:
                self.model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=self.num_labels, output_attentions=False, output_hidden_states=False, torchscript=True)
                self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

        #elif self.args["model"] == "CNN":
        #    self.model = MyLSTM(num_labels=self.num_labels)

        elif self.args["model"] == "gradboost":
            self.model = GradientBoostingClassifier(learning_rate= self.learningRate, n_estimators= self.args["n_estimators"], max_depth= self.args["max_depth"], verbose=1)
            self.input_multiclass_as_one = True

        elif self.args["model"] == "randomforest":
            self.model = RandomForestClassifier(n_estimators= self.args["n_estimators"], max_depth= self.args["max_depth"], verbose=1, n_jobs= -1)
            self.input_multiclass_as_one = True

        elif self.args["model"] == "naivebayes":
            self.model = OneVsRestClassifier(MultinomialNB(alpha= self.learningRate))

        elif self.args["model"] == "naivebayes_norm":
            self.model = Pipeline([
                ("nb_norm", MinMaxScaler()),
                ("nb_clf", OneVsRestClassifier(MultinomialNB(alpha= self.learningRate)))
                ])

        elif self.args["model"] == "sgd":
            self.model = OneVsRestClassifier(SGDClassifier(alpha= self.learningRate, loss='hinge', penalty='l2'))

        else:
            logging.error("Define a model in the args dict.")
            sys.exit("Define a model in the args dict.")
Exemplo n.º 23
0
    train_sizes = [int(len(dset) * args.train_pct) for j, dset in enumerate(all_dsets)]
    val_sizes = [len(all_dsets[j]) - train_sizes[j] for j in range(len(train_sizes))]

    for i in range(len(all_dsets)):
        domain = args.domains[i]

        test_dset = all_dsets[i]

        dataloader = DataLoader(
            test_dset,
            batch_size=4,
            shuffle=True,
            collate_fn=collate_batch_transformer
        )

        bert = DistilBertForSequenceClassification.from_pretrained(bert_model, config=bert_config).to(device)

        # Create the multi-view model
        model = torch.nn.DataParallel(MultiViewTransformerNetworkAveragingIndividuals(
            bert_model,
            bert_config,
            len(all_dsets) - 1
        )).to(device)
        model.module.average = True

        # Load the best weights of the trained domain experts and the shared encoder
        for v in range(len(all_dsets)-1):
            model.module.domain_experts[v].load_state_dict(torch.load(f'{args.pretrained_model}/model_{domain}_{v}.pth'))
        model.module.shared_bert.load_state_dict(torch.load(f'{args.pretrained_model}/model_{domain}_{len(all_dsets)-1}.pth'))
Exemplo n.º 24
0
    def __init__(self, pre_trained: str, class_count: int):
        super().__init__()

        self.bert = DistilBertForSequenceClassification.from_pretrained(
            pre_trained, num_labels=class_count)
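# Note: the snippet above only shows the constructor of the wrapper. Below is a hypothetical
# sketch of how such a wrapper might expose a forward pass; the class name, method signature and
# the choice to return the raw model output are assumptions, not the original code.
import torch.nn as nn
from transformers import DistilBertForSequenceClassification

class DistilBertClassifier(nn.Module):
    def __init__(self, pre_trained: str, class_count: int):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            pre_trained, num_labels=class_count)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # the output contains the loss as well as the logits when labels are provided
        return self.bert(input_ids=input_ids,
                         attention_mask=attention_mask,
                         labels=labels)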
Exemplo n.º 25
0
def retrain(filepath, epochs_per_item=2, min_to_train=10):
    '''Retrain a new model from scratch 
    '''
    global label
    global current_model
    global currently_training

    if currently_training:
        print("skipping while model already training")
        return

    positives = load_headlines(filepath+"positive.csv")
    negatives = load_headlines(filepath+"negative.csv")

    if len(positives) < min_to_train or len(negatives) < min_to_train:
        print("too few annotations to train: less than "+str(min_to_train))
        return

    
    currently_training = True
    
    new_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    
    # sample each item at most `epochs_per_item` times, based on the size of the less frequent label
    iterations = int(min(len(positives), len(negatives)) * epochs_per_item)
    
        
    for i in range(0, iterations):
        positive_headline = random.choice(positives)    
        positive_inputs = tokenizer(positive_headline, return_tensors="pt")
        positive_labels = torch.tensor([1]).unsqueeze(0)  
 
        train_item(new_model, positive_inputs, positive_labels)
         
        negative_headline = random.choice(negatives)        
        negative_inputs = tokenizer(negative_headline, return_tensors="pt")
        negative_labels = torch.tensor([0]).unsqueeze(0)  
                   
        train_item(new_model, negative_inputs, negative_labels)

        eel.sleep(0.01) # allow other processes through
 
    new_fscore = evaluate_model(new_model)
    current_fscore = evaluate_model(current_model)
      
    if new_fscore > current_fscore:
        print("replacing model!")
        current_model = new_model
        timestamp = re.sub(r'\.[0-9]*', '_', str(datetime.datetime.now())).replace(" ", "_").replace("-", "").replace(":", "")
        accuracy = str(round(new_fscore, 4))
                     
        model_path = "data/"+label+"/"+timestamp+accuracy+".model"
        current_model.save_pretrained(model_path)
        if verbose:
            print("saved model to "+model_path)
        clean_old_models()
            
        get_predictions()
    else:
        print("staying with old model")
        
    currently_training = False
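# Note: retrain() above relies on a train_item helper that is not part of this snippet. The
# function below is only a hypothetical sketch of such a single-example update step; the
# optimizer, learning rate and return value are assumptions, not the original implementation.
import torch

def train_item(model, inputs, labels, lr=2e-5):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)  # assumed optimizer and learning rate
    outputs = model(**inputs, labels=labels)  # the model returns the loss when labels are passed
    loss = outputs[0]
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()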
Exemplo n.º 26
0
def main():
    """
    main function for conducting Subtask A or B. Parameters are parsed with argparse.
    Language model should be suitable for German e.g.:
        'bert-base-multilingual-uncased', 
        'bert-base-multilingual-cased',              
        'bert-base-german-cased', 
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--task', type=str, default='A', help="The task you want to conduct ('A' or 'B').")
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')    
    parser.add_argument('--train_data', type=str, default='train_df.tsv', help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df.tsv', help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv', help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv', help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskA/', help='The output directory of the model and predictions.')
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--save_prediction", default=True, action="store_true", help="Flag for saving predictions.")
    args = parser.parse_args()

    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter = '\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter = '\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter = '\t')
    test_syn_df = test_syn_df.dropna(subset = ["text"])    
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter = '\t')
    
    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True

    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    
    # get training features
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    sentences_syn = test_syn_df.text.values    
    sentences_dia = test_dia_df.text.values
    
    if args.task == 'A':
        class_list = [False, True]
        df['relevance_label'] = df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels = df.relevance_label.values
        test_syn_df['relevance_label'] = test_syn_df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels_syn = test_syn_df.relevance_label.values
        test_dia_df['relevance_label'] = test_dia_df.apply(lambda x:  class_list.index(x['relevance']), axis = 1)
        labels_dia = test_dia_df.relevance_label.values

    if args.task == 'B':
        class_list = ["negative", "neutral", "positive"]
        df['sentiment_label'] = df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels = df.sentiment_label.values
        test_syn_df['sentiment_label'] = test_syn_df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels_syn = test_syn_df.sentiment_label.values
        test_dia_df['sentiment_label'] = test_dia_df.apply(lambda x:  class_list.index(x['sentiment']), axis = 1)
        labels_dia = test_dia_df.sentiment_label.values
    
    num_labels = len(set(labels))
    
    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, 
                                  max_length=args.max_len) for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    # Create attention masks
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

    # synchronic test data
    input_ids_syn = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_syn]
    input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]
    
    # diachronic test data
    input_ids_dia = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_dia]
    input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long", 
                          value=0.0, truncating="post", padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensor
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_labels = torch.tensor(labels_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)

    # Create the DataLoader
    train_dataloader = create_dataloader(train_inputs, train_masks, 
                                     train_labels, args.batch_size, train=True)

    dev_dataloader = create_dataloader(dev_inputs, dev_masks, 
                                   dev_labels, args.batch_size, train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, 
                                        test_syn_labels, args.batch_size, 
                                        train=False)

    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, 
                                        test_dia_labels, args.batch_size, 
                                        train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                config=config  # pass the adjusted config so the dropout setting takes effect
            )

        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.dropout = 0.1  # DistilBertConfig uses `dropout` rather than `hidden_dropout_prob`
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                config=config
            )
        model.cuda()


        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
    
        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i"%epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train(
                train_dataloader, 
                model=model, 
                device=device, 
                optimizer=optimizer, 
                scheduler=scheduler
            )
            # EVALUATION: TRAIN SET
            true_bools_train, pred_bools_train, f1_train = eval(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.4f"%(f1_train)) # here: same as accuracy
            print(confusion_matrix(true_bools_train,pred_bools_train))
            
            # EVALUATION: DEV SET
            true_bools_dev, pred_bools_dev, f1_dev = eval(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.4f"%(f1_dev))
            print(confusion_matrix(true_bools_dev,pred_bools_dev))
        

        print("  Training and validation took in total: {:}".format(format_time(time.time()-track_time)))

        # EVALUATION: TEST SYN SET
        true_bools_syn, pred_bools_syn, f1_test_syn = eval(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f"%(f1_test_syn))
        print(confusion_matrix(true_bools_syn,pred_bools_syn))

        # EVALUATION: TEST DIA SET
        true_bools_dia, pred_bools_dia, f1_test_dia = eval(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f"%(f1_test_dia))
        print(confusion_matrix(true_bools_dia, pred_bools_dia))

        if args.save_prediction:
            if args.task == 'A':
                test_syn_df["relevance_pred"] = pred_bools_syn
                test_dia_df["relevance_pred"] = pred_bools_dia
            if args.task == 'B':                
                test_syn_df["sentiment_pred"] = pred_bools_syn
                test_dia_df["sentiment_pred"] = pred_bools_dia
            
            test_syn_df.to_csv(args.output_path+args.lang_model+"_eval_test_syn.tsv", sep="\t", index = False, 
                header = True, encoding = "utf-8-sig")
            test_dia_df.to_csv(args.output_path+args.lang_model+"_eval_test_dia.tsv", sep="\t", index = False, 
                header = True, encoding = "utf-8-sig")
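# Note: create_dataloader is called above but its definition is not part of this snippet. The
# function below is only a plausible sketch, assuming it wraps the tensors in a TensorDataset
# with a random sampler for training and a sequential sampler for evaluation.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def create_dataloader(inputs, masks, labels, batch_size, train=True):
    data = TensorDataset(inputs, masks, labels)
    sampler = RandomSampler(data) if train else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)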
Exemplo n.º 27
0
def main():
    parser = argparse.ArgumentParser(
        description='argument parsing for training')

    parser.add_argument('--data_dir',
                        default='data',
                        type=str,
                        help='path to data directory - default: \'data\'')

    parser.add_argument('--review_file',
                        default='yelp_reviews_train5000.csv',
                        type=str,
                        help='file name containing reviews')

    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='batch size - default: 32')

    parser.add_argument('--train_ratio',
                        default=0.85,
                        type=float,
                        help='train size - default: 0.85')

    parser.add_argument('--epochs',
                        default=4,
                        type=int,
                        help='number of training epochs - default: 4')

    parser.add_argument('--distil',
                        action='store_true',
                        help='use DistilBert instead of BERT')

    parser.add_argument('--model_save',
                        default='./model_save',
                        type=str,
                        help='directory to save model')

    parser.add_argument('--nolog', action='store_true', help='disable logging')

    # parse input arguments
    clargs = parser.parse_args()

    # log to file and stdout
    if clargs.nolog:
        print("Not logging")
    else:
        sys.stdout = Logger('train')

    print("")
    print("==========================================")
    print("-------------Confirm Arguments------------")
    print("==========================================")
    print("")

    print("Data directory: {0:s}".format(clargs.data_dir))
    print("Reviews file: {0:s}".format(clargs.review_file))
    print("Batch size of {0:d}".format(clargs.batch_size))
    print("Train ratio of {0:0.2f}".format(clargs.train_ratio))
    print("Train for {0:d} epochs".format(clargs.epochs))
    print("Using DistilBert" if clargs.distil else "Using Bert")
    print("Will save model in: {0:s}".format(clargs.model_save))

    # Check to see if GPU is available
    CUDA_FLAG = False
    if torch.cuda.is_available():
        CUDA_FLAG = True
        device = torch.device("cuda")
        print('*We will use the GPU:', torch.cuda.get_device_name(0))

    else:
        CUDA_FLAG = False
        print('*No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    print("")
    print("==========================================")
    print("---------------Process Data---------------")
    print("==========================================")
    print("")

    path = clargs.data_dir
    fn = clargs.review_file
    filename = path + "/" + fn

    # read in data from review dataset
    t0 = time.perf_counter()
    print("Reading in training data from {0:s}".format(clargs.review_file))
    reviews_df = pd.read_csv(filename)
    reviews_df = reviews_df[['text', 'stars']]
    elapsed = time.perf_counter() - t0
    print("Finished reading {0:d} entries | Took {1:0.2f} seconds".format(
        len(reviews_df.index), elapsed))

    # create tokenizer and model from transformers
    if clargs.distil:
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # tokenize the data into something that BERT can use, then split
    print("Tokenizing and encoding data to be fed into BERT model")
    t1 = time.perf_counter()
    dataset = extract_features(reviews_df, tokenizer)
    elapsed = time.perf_counter() - t1
    print("Finished tokenizing | Took {0:0.2f} seconds".format(elapsed))

    # split the data into training and validation set
    TRAIN_SIZE = int(len(reviews_df.index) * clargs.train_ratio)
    VAL_SIZE = len(reviews_df.index) - TRAIN_SIZE
    BATCH_SIZE = clargs.batch_size
    train_dataloader, validation_dataloader = train_val_split(
        dataset=dataset, batch_sz=BATCH_SIZE, lengths=[TRAIN_SIZE, VAL_SIZE])

    print("Training - Split {0:d} examples into {1:d} batches".format(
        TRAIN_SIZE, len(train_dataloader)))
    print("Validation - Split {0:d} examples into {1:d} batches".format(
        VAL_SIZE, len(validation_dataloader)))
    print("Finished splitting")

    # load a pre-trained model
    if clargs.distil:
        model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=5,
            output_attentions=False,
            output_hidden_states=False)
    else:
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=5,
            output_attentions=False,
            output_hidden_states=False)
    if CUDA_FLAG:
        model.cuda()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    epochs = clargs.epochs
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # Training statistics:
    train_losses = []
    val_losses = []
    val_accs = []

    print("")
    print("==========================================")
    print("-------------Starting training------------")
    print("==========================================")
    print("")

    # TRAINING LOOP:
    # - epoch: number of times through the entire dataset
    # - consists of a training portion: forward pass, then backward pass
    # - followed by a validation portion: evaluate model on a validation set
    start_train_time = time.perf_counter()
    for i in range(epochs):
        print("-----------------Epoch {0:d}-----------------".format(i + 1))
        print("Epoch {0:d} Training Phase".format(i + 1))
        # first train
        model.train()  # only to put the model into train mode
        train_loss = train(model, device, train_dataloader, optimizer,
                           scheduler)
        print("  Training Loss: {0:.2f}".format(train_loss))
        train_losses.append(train_loss)
        print("")

        # then validate
        print("Epoch {0:d} Validation Phase".format(i + 1))
        val_loss, val_acc, _, _ = evaluate(model, device,
                                           validation_dataloader, VAL_SIZE)
        print("  Validation Accuracy: {0:.2f}".format(val_acc))
        print("  Validation Loss: {0:.2f}".format(val_loss))
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        print("")

        elapsed_time = time.perf_counter() - start_train_time
        m, s = divmod(elapsed_time, 60)
        print("End epoch {0:d} - Time so far - {1:02d}:{2:05.2f}".format(
            (i + 1), int(m), s))
        print("")

    print("==========================================")
    print("------------Summary of Training-----------")
    print("==========================================")
    print("")
    total_elapsed_time = time.perf_counter() - start_train_time
    m, s = divmod(total_elapsed_time, 60)
    print("Total training time: {0:02d}:{1:05.2f}".format(int(m), s))
    print("")
    print(
        tabulate(np.stack((train_losses, val_losses, val_accs), axis=-1),
                 ["train_loss", "val_loss", "val_acc"]))
    print("")
    print("Data directory: {0:s}".format(clargs.data_dir))
    print("Reviews file: {0:s}".format(clargs.review_file))
    print("Batch size of {0:d}".format(clargs.batch_size))
    print("Train ratio of {0:0.2f}".format(clargs.train_ratio))
    print("Train for {0:d} epochs".format(clargs.epochs))
    print("")

    # save model
    output_dir = clargs.model_save

    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model_type = "distil" if clargs.distil else "bert"
    # save hyperparameters for testing:
    hyper_json = {
        "dataDirectory": clargs.data_dir,
        "dataFile": clargs.review_file,
        "batchSize": str(clargs.batch_size),
        "trainRatio": str(clargs.train_ratio),
        "numEpochs": str(clargs.epochs),
        "model": model_type
    }
    json_outfile = output_dir + '/' + 'hyperparams.json'
    with open(json_outfile, 'w') as outfile:
        json.dump(hyper_json, outfile)

    print("Saving model to %s" % output_dir)
    model_to_save = model.module if hasattr(
        model,
        'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Finished saving model")
Exemplo n.º 28
0
import tensorflow as tf
import transformers as trans
import torch
from transformers import DistilBertTokenizer, BertConfig
from transformers import AdamW, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import numpy as np

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
model.load_state_dict(torch.load('torch_weights', map_location='cpu'))


def classify(Message):
    label = {0: 'Ham', 1: 'Spam'}
    X = tokenizer(Message,
                  max_length=200,
                  padding=True,
                  truncation=True,
                  return_tensors="pt")
    device = torch.device("cpu")
    X.to(device)
    model.to(device)
    result = model(**X)
    result_list = list(result[0][0].cpu().detach().numpy())
    max_value = max(zip(result_list, [0, 1]))  # (largest logit, predicted class index)
    proba1 = max(nn.Softmax(dim=-1)(result[0][0]).cpu().detach().numpy())
    proba = nn.Softmax(dim=-1)(result[0][0]).cpu().detach().numpy()
    #proba = np.exp(max_value[0])/(1+np.exp(max_value[0]))
    # the original snippet is truncated here; a plausible completion (assumption, not the
    # original code) returns the predicted label together with its probability
    return label[max_value[1]], float(proba1)
Exemplo n.º 29
0
    def load_pretrained_model(self, input_path='./DistilBERT/pre_trained_model/'):
        self.model = DistilBertForSequenceClassification.from_pretrained(input_path, num_labels=9)
        self.tokenizer = DistilBertTokenizer.from_pretrained(input_path, do_lower_case=True)
Exemplo n.º 30
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default="/home/jqu/Documents/data/XNLI/",
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument("--model_type",
                        type=str,
                        required=True,
                        help="distilbert|bert")
    parser.add_argument("--model_dir",
                        type=str,
                        required=True,
                        help="where the trained model locates")
    args = parser.parse_args()
    # load test dataset
    processor = processors["xnli"](language="en", train_language="en")
    examples = processor.get_test_examples(args.data_dir)

    if args.model_type == "bert":
        # prepare tokenizer
        tokenizer = BertTokenizer.from_pretrained(args.model_dir,
                                                  do_lower_case=False)

        model = BertForSequenceClassification.from_pretrained(args.model_dir)

    elif args.model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(args.model_dir,
                                                        do_lower_case=False)

        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_dir)

    elif args.model_type == "albert":
        tokenizer = AlbertTokenizer.from_pretrained(args.model_dir,
                                                    do_lower_case=False)

        model = AlbertForSequenceClassification.from_pretrained(args.model_dir)

    model.to("cuda:0")
    model.eval()

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=processor.get_labels(),
        max_length=128,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        mask_padding_with_zero=True)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)

    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=512)

    overall_preds = [[], []]
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            batch = tuple(t.to("cuda:0") for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert"] else None
                )  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            _, logits = outputs[:2]
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            overall_preds[0] += preds.tolist()

            out_label_ids = inputs["labels"].detach().cpu().numpy()
            overall_preds[1] += out_label_ids.tolist()
    # compute scores
    result = accuracy_score(overall_preds[0], overall_preds[1])
    print(f"Overall accuracy: {result}")
    confusion_score = confusion_matrix(overall_preds[0], overall_preds[1])
    print("confusion matrix:\n")
    print(confusion_score)