Example #1
from transformers import BartForConditionalGeneration, BartTokenizer

# full-size checkpoint fine-tuned on XSum
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-xsum')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-xsum')

# the distilled checkpoint below replaces the full-size model loaded above;
# keep whichever of the two you actually want to use
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-xsum-12-3')
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-xsum-12-3')
ARTICLE_TO_SUMMARIZE = " \"The accident meant the motorway was closed, making travel to Mourneview Park impossible for the team and fans travelling from Belfast, \" said the Irish Football Association . A new date for the match has yet to be confirmed by Uefa . Northern Ireland have three points from their first two Group Six qualifiers."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=512, truncation=True, return_tensors='pt')
summary_ids = model.generate(inputs['input_ids'], num_beams=5, max_length=62, min_length=10, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in summary_ids])
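# For a single input, the first generated sequence can also be decoded directly
# to a plain string (a small usage note, not part of the original example):
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(summary)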
Example #2
 def test_dummy_inputs(self):
     config, *_ = self._get_config_and_data()
     model = BartForConditionalGeneration(config).eval().to(torch_device)
     model(**model.dummy_inputs)
Example #3
# Assumes the enclosing module provides: torch, numpy as np, pandas as pd,
# BartTokenizer, BartModel, BartForConditionalGeneration and AdamW (e.g. from
# transformers), pad_sequences (from Keras), plus the helpers mask_embedding(),
# dist() and calculate_threshold() and the constants MAX_LEN and std_strength.
def Seq2Seq(df):
    model_type = 'bart-large'

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    model = BartModel.from_pretrained('facebook/bart-large')
    mask_model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large')

    sep_token = '</s>'
    mask_token = '<mask>'

    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    auxiliary_tokens = ['the', 'aspect', 'term', 'is']

    df['mask_tokens'] = 0
    df['auxiliary_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        #for j in range(len(df['aspect_terms'].iloc[i])):
        auxiliary_sents = []
        for j in range(len(df['aspect_terms'].iloc[i])):
            aspect_terms = df['aspect_terms'].iloc[i][j]
            auxiliary_sent = auxiliary_tokens + [aspect_terms] + [
                sep_token
            ] + df['tokens'].iloc[i]
            auxiliary_sents.append(auxiliary_sent)

        mask_sent = auxiliary_tokens + [mask_token] + [sep_token
                                                       ] + df['tokens'].iloc[i]
        df['mask_tokens'].iloc[i] = mask_sent
        df['auxiliary_tokens'].iloc[i] = auxiliary_sents

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = tokenizer.encode(df['mask_tokens'].iloc[i])

        sep_index = tokenized.index(sep_id)
        mask_index = tokenized.index(mask_id)

        tokenized = pd.Series([tokenized])

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for perturbed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][perturbed_index] != 0 and padded[0][perturbed_index] != sep_id:
                cur_id = padded[0][perturbed_index]
                padded[0][perturbed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold and df[
                    'labels'].iloc[i][dis_index] != 'B' and df['labels'].iloc[
                        i][dis_index] != 'I':
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        augment_tokenizeds = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokenized = tokenizer.encode(df['auxiliary_tokens'].iloc[i][j])
            tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
                torch.int64).to(device)
            augment_tokenized = tokenizer.encode(
                df['auxiliary_tokens'].iloc[i][j])

            for k in range(len(df['perturbed_mask_index'].iloc[i])):
                mask_tokenized = tokenizer.encode(
                    df['auxiliary_tokens'].iloc[i][j])
                sep_index = mask_tokenized.index(sep_id)
                perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                    k] + sep_index + 1
                mask_tokenized[perturbed_mask_index] = mask_id

                mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
                    torch.int64).to(device)

                logits = mask_model(mask_tokenized).logits

                probs = logits[0, perturbed_mask_index].softmax(dim=0)
                values, predictions = probs.topk(1)
                augment_tokenized[perturbed_mask_index] = int(
                    predictions.cpu().numpy())

            augment_tokenizeds.append(augment_tokenized)

        df['augment_token_id'].iloc[i] = augment_tokenizeds

    df['augment_tokens'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokens_lists = []

        for j in range(len(df['aspect_terms'].iloc[i])):

            tokens_list = []

            for k in range(1, len(df['augment_token_id'].iloc[i][j]) - 1):
                tokens_list.append(
                    tokenizer.decode([df['augment_token_id'].iloc[i][j][k]]))

            sep_index = tokens_list.index(sep_token)
            tokens_list = tokens_list[sep_index + 1:]
            tokens_lists.append(tokens_list)

        df['augment_tokens'].iloc[i] = tokens_lists

    return df
Example #4
def test_model_download():
    """This warms up the cache so that we can time the next test without including download time, which varies between machines."""
    BartForConditionalGeneration.from_pretrained(MODEL_NAME)
    MarianMTModel.from_pretrained(MARIAN_MODEL)
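# MODEL_NAME and MARIAN_MODEL are constants defined elsewhere in the original test
# module; illustrative placeholder values (assumptions, not the original definitions):
MODEL_NAME = "sshleifer/bart-tiny-random"
MARIAN_MODEL = "sshleifer/tiny-marian-en-de"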
Example #5
 def test_base_model_fp16(self):
     config, input_ids, batch_size = self._get_config_and_data()
     attention_mask = input_ids.ne(1).to(torch_device)
     lm_model = BartForConditionalGeneration(config).eval().to(torch_device).half()
     lm_model(input_ids, attention_mask=attention_mask)
Example #6
def main(args):

    # load pretrained or finetuned transformer model
    print(f'loading pre-trained model: {args["model_id"]}')

    # we have to load fine-tuned models in a different way because of pytorch-lightning
    # fine-tuned
    if args['model_id'].endswith('.ckpt'):
        from transformer_decoding.finetune import SummarizationTrainer
        lightning_model = SummarizationTrainer.load_from_checkpoint(args['model_id'])
        args['model'] = lightning_model.model
        args['tokenizer'] = lightning_model.tokenizer
    else:
        # transformers pretrained
        args['model'] = BartForConditionalGeneration.from_pretrained(args['model_id'])
        args['tokenizer'] = BartTokenizer.from_pretrained(args['model_id'])

    # Set the model in evaluation mode to deactivate the DropOut modules
    # This is IMPORTANT to have reproducible results during evaluation!
    args['model'].eval()

    if torch.cuda.is_available():
        args['model'].to('cuda')

    # summarize MDS / summarization dataset with model

    # print and write out evaluation results
    if args['evaluation_dataset'].endswith('.jsonl'):
        dataset = [json.loads(l) for l in open(args['evaluation_dataset'])][:args['rows_to_eval']]
    else:
        raise AssertionError('Right now we only know how to handle .jsonl evaluation datasets')

    # WORKING: also write out summaries as they're generated
    eval_prefix = args['eval_prefix']
    preds_output = open(f'{eval_prefix}eval_predicted_summaries.out', 'w', buffering=1)
    gold_output = open(f'{eval_prefix}eval_gold_summaries.out', 'w', buffering=1)
    metadata_output = open(f'{eval_prefix}decoding_metadata.jsonl', 'w', buffering=1)

    summaries = []
    # get summary for each cluster
    # note here we have a macro-batch size of one cluster by definition

    for cluster in tqdm.tqdm(dataset):
        # shuffle articles before selecting topk to use in ensemble
        articles = [article_to_text(a) for a in cluster['articles']]
        np.random.shuffle(articles)
        articles = articles[:args['max_articles_in_cluster']]

        if args['min_input_char_length'] is not None:
            articles_ = [a for a in articles if len(a) >= args['min_input_char_length']]
            if len(articles_) == 0:
                articles_ = [articles[0]]
            articles = articles_

        predictions, sorted_hyps = summarize_articles(articles, args)

        # sorted_hyps -- (token_idxs, score, metadata)
        # they're in sorted order according to ensemble score, so first one is the best        
        # we will have one list of timestamp metadata for each input
        length_penalty = args['length_penalty']
        component_scores = []
        for input_idx, state_metadata in enumerate(sorted_hyps[0][2]):

            timestep_scores = np.array([o['score'] for o in state_metadata])
            
            global_score = np.sum(timestep_scores) / len(timestep_scores) ** length_penalty
            component_scores.append(global_score)
            
        component_scores = np.array(component_scores)
        for idx in np.argsort(component_scores)[::-1]:
            print(f'ARTICLE: {articles[idx][:200]}')
            print(f'Input {idx} score: {component_scores[idx]}')
            print()
        
        print(f'Ensemble score: {sorted_hyps[0][1]}')
        print(f'Gold: {cluster["summary"]}')
        print(f'Predicted: {predictions[0]}')
        print()
        #import ipdb; ipdb.set_trace()
        
        #tok_ids = [o['token'] for o in sorted_hyps[0][2][0]] 
        #print(args['tokenizer'].decode(tok_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

        # NOTE: hack to just take the first one right now, disregarding scores of different beam items
        predicted_summary = predictions[0]
        gold_summary = cluster['summary'].strip()
        summaries.append((predicted_summary, gold_summary))
        preds_output.write(f'{predicted_summary}\n')
        gold_output.write(f'{gold_summary}\n')

        # TODO: need to map some stuff to put in json?
        sorted_hyps_ = []
        for tok_idxs, score, tok_scores in sorted_hyps:
            tok_idxs = [int(idx) for idx in tok_idxs.cpu().numpy()]
            sorted_hyps_.append((tok_idxs, score, tok_scores))
        sorted_hyps = sorted_hyps_

        metadata_output.write(
            json.dumps(
                {
                   'cluster': cluster,
                   'predictions': predictions,
                   'inputs_used': articles,
                   'component_scores': list(component_scores),
                   'decoding_metadata': sorted_hyps
                })
            + '\n') 

    preds_output.close()
    gold_output.close()

    # Evaluation
    hyps, refs = zip(*summaries)
    results, rouge_types = evaluate_rouge(hyps, refs)

    print_mean(results, rouge_types)
Example #7
# BART tests (run from  unifiedqa-tjh/bart directory):

import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from bart import MyBart

base_model = "facebook/bart-large"
#unifiedqa_path = "unifiedQA-uncased/best-model.pt" # path to the downloaded checkpoint
unifiedqa_path = "/data/thar011/ckpts/unifiedqa-bart-large-allenai/unifiedQA-uncased/best-model.pt"  # path to the downloaded checkpoint

tokenizer = BartTokenizer.from_pretrained(base_model)
model = MyBart.from_pretrained(base_model,
                               state_dict=torch.load(unifiedqa_path))
model.eval()

# ERROR: TypeError: forward() got an unexpected keyword argument 'past_key_values'
x = model.generate_from_string(
    "Which is best conductor? \\n (A) iron (B) feather", tokenizer=tokenizer)
print(x)

x = model.generate_from_string(
    "What is the sum of 3 and 5? \\n (A) 8 (B) 3 (C) 5 (D) 10",
    tokenizer=tokenizer)
print(x)

#try basic bart model (no error):
model = BartForConditionalGeneration.from_pretrained(base_model)
model.eval()
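
# run_model is not defined in this snippet; a minimal sketch of such a helper
# (the name and defaults are assumptions, not the original implementation):
def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    output_ids = model.generate(input_ids, **generator_args)
    return [tokenizer.decode(o, skip_special_tokens=True) for o in output_ids]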
run_model("which is best conductor? \\n (a) iron (b) feather"
          )  #['whichwhich is best conductor?']
Example #8
        "attention_mask": input_encodings["attention_mask"].numpy().copy(),
        "decoder_input_ids": decoder_input_ids.numpy().copy(),
        "labels": labels.numpy().copy(),
    }

    return encodings


# 2. store the preprocessed data on disk in a .pckl file
# 3. create a dataloader that loads the data from disk to memory on-demand


if __name__ == "__main__":
    dataset = datasets.load_dataset(
        "json",
        data_files=str(disk.VERSIONED_DATA_DIR / "generated/examples.json"),
    )["train"].train_test_split(test_size=100, shuffle=True)

    dataset["validation"] = dataset["test"]
    del dataset["test"]

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    dataset = dataset.map(convert_to_features, batched=True)

    columns = ["input_ids", "labels", "decoder_input_ids", "attention_mask"]
    dataset.set_format(type="torch", columns=columns)

    dataset.save_to_disk(disk.UNVERSIONED_DATA_DIR / "features")
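
The convert_to_features function mapped over the dataset above is cut off at the start of this example; a minimal sketch of what such a preprocessing step could look like, where the column names, max lengths, and the use of prepare_decoder_input_ids_from_labels are assumptions rather than the original implementation:

def convert_to_features(example_batch):
    # tokenize source and target text (column names are assumed)
    input_encodings = tokenizer(example_batch["text"], padding="max_length",
                                truncation=True, max_length=1024, return_tensors="pt")
    target_encodings = tokenizer(example_batch["summary"], padding="max_length",
                                 truncation=True, max_length=128, return_tensors="pt")

    labels = target_encodings["input_ids"]
    decoder_input_ids = model.prepare_decoder_input_ids_from_labels(labels)

    encodings = {
        "input_ids": input_encodings["input_ids"].numpy().copy(),
        "attention_mask": input_encodings["attention_mask"].numpy().copy(),
        "decoder_input_ids": decoder_input_ids.numpy().copy(),
        "labels": labels.numpy().copy(),
    }
    return encodings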
Example #9
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # must assign tokenizers before init
        if cfg.language_model.pretrained_model_name:
            if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
                raise ValueError(
                    "Must have either pretrained_model_name or both pretrained_encoder_model_name and "
                    "pretrained_decoder_model_name.")
            # setup tokenizer
            self.encoder_tokenizer = self.setup_tokenizer(
                cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            # set decoder to encoder
            self.decoder_tokenizer = self.encoder_tokenizer
            self.decoder_add_special_tokens = self.encoder_add_special_tokens
        else:
            if not (cfg.language_model.pretrained_encoder_model_name
                    and cfg.language_model.pretrained_decoder_model_name):
                raise ValueError("Both encoder and decoder must be specified")

            # setup tokenizers
            self.encoder_tokenizer = self.setup_tokenizer(
                cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            self.decoder_tokenizer = self.setup_tokenizer(
                cfg.decoder_tokenizer)
            self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

        if not self.encoder_tokenizer:
            raise TypeError("encoder_tokenizer failed to initialize")
        if not self.decoder_tokenizer:
            raise TypeError("decoder_tokenizer failed to initialize")

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        # must assign modules after init
        if cfg.language_model.pretrained_model_name:
            # Setup end-to-end model
            if "bart" in cfg.language_model.pretrained_model_name:
                self.model = BartForConditionalGeneration.from_pretrained(
                    cfg.language_model.pretrained_model_name)
            else:
                self.model = AutoModel.from_pretrained(
                    cfg.language_model.pretrained_model_name)
        else:
            if not (cfg.language_model.pretrained_encoder_model_name
                    and cfg.language_model.pretrained_decoder_model_name):
                raise ValueError("Both encoder and decoder must be specified")

            # Setup encoder/decoder model
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder=cfg.language_model.pretrained_encoder_model_name,
                decoder=cfg.language_model.pretrained_decoder_model_name,
            )

        self.validation_perplexity = Perplexity(compute_on_step=False)

        self.setup_optimization(cfg.optim)
Example #10
    def __init__(
        self,
        pretrained_model=None,
        additional_special_tokens_encoder=None,
        additional_special_tokens_decoder=None,
        model_config=None,
        vocab_file=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):
        self.args = self._load_model_args()
        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, Seq2SeqArgs):
            self.args = args

        if "sweep_config" in kwargs:
            self.is_sweeping = True
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)
        else:
            self.is_sweeping = False

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable. "
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        if not use_cuda:
            self.args.fp16 = False

        # BartConfig, BartForConditionalGeneration, BartTokenizer
        # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
        model_config = BartConfig.from_json_file(model_config)
        if pretrained_model is None:
            self.model = BartForConditionalGeneration(config=model_config)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)

        else:
            self.model = BartForConditionalGeneration.from_pretrained(
                pretrained_model)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)
        self.decoder_tokenizer = self.encoder_tokenizer

        # special AST token
        # additional_special_tokens_encoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}
        # additional_special_tokens_decoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}

        self.config = self.model.config

        if additional_special_tokens_encoder is not None:
            self.encoder_tokenizer.add_special_tokens(
                additional_special_tokens_encoder)

        if additional_special_tokens_decoder is not None:
            self.decoder_tokenizer.add_special_tokens(
                additional_special_tokens_decoder)

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None

        self.args.model_type = 'bart'
        self.args.model_name = 'ExplainREGEX'
Example #11
import sys

import numpy as np
import timm
import torch
from torchvision import transforms

from transformers import BartForConditionalGeneration as BCD, BartTokenizerFast as BTF

import dataset

batch_size = int(sys.argv[1])

vit = timm.create_model('vit_base_patch32_384', pretrained=True, num_classes=0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

bart = BCD.from_pretrained('facebook/bart-base')
tokenizer = BTF.from_pretrained('facebook/bart-base')

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Lambda(dataset.make_img_rgb),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = dataset.NarrativesDataset(root='./data/images/',
                                     file='./data/dataset.jsonl',
                                     transform=transform)

trainloader = torch.utils.data.DataLoader(trainset,
Example #12
class BartModel:
    def __init__(
        self,
        pretrained_model=None,
        additional_special_tokens_encoder=None,
        additional_special_tokens_decoder=None,
        model_config=None,
        vocab_file=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):
        self.args = self._load_model_args()
        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, Seq2SeqArgs):
            self.args = args

        if "sweep_config" in kwargs:
            self.is_sweeping = True
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)
        else:
            self.is_sweeping = False

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable. "
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        if not use_cuda:
            self.args.fp16 = False

        # BartConfig, BartForConditionalGeneration, BartTokenizer
        # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
        model_config = BartConfig.from_json_file(model_config)
        if pretrained_model is None:
            self.model = BartForConditionalGeneration(config=model_config)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)

        else:
            self.model = BartForConditionalGeneration.from_pretrained(
                pretrained_model)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)
        self.decoder_tokenizer = self.encoder_tokenizer

        # special AST token
        # additional_special_tokens_encoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}
        # additional_special_tokens_decoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}

        self.config = self.model.config

        if additional_special_tokens_encoder is not None:
            self.encoder_tokenizer.add_special_tokens(
                additional_special_tokens_encoder)

        if additional_special_tokens_decoder is not None:
            self.decoder_tokenizer.add_special_tokens(
                additional_special_tokens_decoder)

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None

        self.args.model_type = 'bart'
        self.args.model_name = 'ExplainREGEX'

    def train_model(
        self,
        train_data,
        output_dir=None,
        show_running_loss=True,
        args=None,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model using 'train_data'

        Args:
            train_data: Pandas DataFrame containing the 2 columns - `input_text`, `target_text`.
                        - `input_text`: The input text sequence.
                        - `target_text`: The target text sequence.
            output_dir: The directory where model files will be saved. If not given, self.args.output_dir will be used.
            show_running_loss (optional): Set to False to prevent running loss from being printed to console. Defaults to True.
            args (optional): Optional changes to the args dict of the model. Any changes made will persist for the model.
            eval_data (optional): A DataFrame against which evaluation will be performed when evaluate_during_training is enabled. Is required if evaluate_during_training is enabled.
            **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use).
                        A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions. Both inputs
                        will be lists of strings. Note that this will slow down training significantly as the predicted sequences need to be generated.

        Returns:
            global_step: Number of global steps trained
            training_details: Average training loss if evaluate_during_training is False or full training progress scores if evaluate_during_training is True
        """  # noqa: ignore flake8"

        if args:
            self.args.update_from_dict(args)

        # if self.args.silent:
        #     show_running_loss = False

        if self.args.evaluate_during_training and eval_data is None:
            raise ValueError(
                "evaluate_during_training is enabled but eval_data is not specified."
                " Pass eval_data to model.train_model() if using evaluate_during_training."
            )

        if not output_dir:
            output_dir = self.args.output_dir

        if os.path.exists(output_dir) and os.listdir(
                output_dir) and not self.args.overwrite_output_dir:
            raise ValueError(
                "Output directory ({}) already exists and is not empty."
                " Set args.overwrite_output_dir = True to overcome.".format(
                    output_dir))

        self._move_model_to_device()

        train_dataset = self.load_and_cache_examples(train_data,
                                                     verbose=verbose)

        os.makedirs(output_dir, exist_ok=True)

        global_step, training_details = self.train(
            train_dataset,
            output_dir,
            show_running_loss=show_running_loss,
            eval_data=eval_data,
            verbose=verbose,
            **kwargs,
        )

        self.save_model(self.args.output_dir, model=self.model)

        # model_to_save = self.model.module if hasattr(self.model, "module") else self.model
        # model_to_save.save_pretrained(output_dir)
        # self.encoder_tokenizer.save_pretrained(output_dir)
        # self.decoder_tokenizer.save_pretrained(output_dir)
        # torch.save(self.args, os.path.join(output_dir, "training_args.bin"))

        if verbose:
            logger.info(" Training of {} model complete. Saved to {}.".format(
                self.args.model_name, output_dir))

        return global_step, training_details

    def train(
        self,
        train_dataset,
        output_dir,
        show_running_loss=True,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
            num_workers=self.args.dataloader_num_workers,
        )

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [
                p for n, p in model.named_parameters() if n in params
            ]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend([
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and not any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    args.weight_decay,
                },
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ])

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps

        # TODO: Use custom optimizer like with BertSum?
        if args.optimizer == "AdamW":
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        elif args.optimizer == "Adafactor":
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adafactor_eps,
                clip_threshold=args.adafactor_clip_threshold,
                decay_rate=args.adafactor_decay_rate,
                beta1=args.adafactor_beta1,
                weight_decay=args.weight_decay,
                scale_parameter=args.adafactor_scale_parameter,
                relative_step=args.adafactor_relative_step,
                warmup_init=args.adafactor_warmup_init,
            )
            print("Using Adafactor for T5")
        else:
            raise ValueError(
                "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead."
                .format(args.optimizer))

        if args.scheduler == "constant_schedule":
            scheduler = get_constant_schedule(optimizer)

        elif args.scheduler == "constant_schedule_with_warmup":
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=args.warmup_steps)

        elif args.scheduler == "linear_schedule_with_warmup":
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total)

        elif args.scheduler == "cosine_schedule_with_warmup":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup":
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "polynomial_decay_schedule_with_warmup":
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                lr_end=args.polynomial_decay_schedule_lr_end,
                power=args.polynomial_decay_schedule_power,
            )

        else:
            raise ValueError("{} is not a valid scheduler.".format(
                args.scheduler))

        if (args.model_name and os.path.isfile(
                os.path.join(args.model_name, "optimizer.pt"))
                and os.path.isfile(
                    os.path.join(args.model_name, "scheduler.pt"))):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(args.model_name, "optimizer.pt")))
            scheduler.load_state_dict(
                torch.load(os.path.join(args.model_name, "scheduler.pt")))

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args.num_train_epochs),
                                desc="Epoch",
                                disable=args.silent,
                                mininterval=0)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args.model_name and os.path.exists(args.model_name):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args.model_name.split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) // args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) // args.gradient_accumulation_steps)

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d",
                            epochs_trained)
                logger.info("   Continuing training from global step %d",
                            global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(
                **kwargs)

        if args.wandb_project:
            wandb.init(project=args.wandb_project,
                       config={**asdict(args)},
                       **args.wandb_kwargs)
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()

        for current_epoch in train_iterator:
            model.train()
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            train_iterator.set_description(
                f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue
                # batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)
                if args.fp16:
                    with amp.autocast():
                        outputs = model(**inputs)
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = outputs[0]
                else:
                    outputs = model(**inputs)
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = outputs[0]

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    batch_iterator.set_description(
                        f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}"
                    )

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    if args.optimizer == "AdamW":
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_last_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss
                        if args.wandb_project or self.is_sweeping:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_last_lr()[0],
                                "global_step": global_step,
                            })

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self.save_model(output_dir_current,
                                        optimizer,
                                        scheduler,
                                        model=model)

                    if args.evaluate_during_training and (
                            args.evaluate_during_training_steps > 0
                            and global_step %
                            args.evaluate_during_training_steps == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_data,
                            verbose=verbose
                            and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args.save_eval_checkpoints:
                            self.save_model(output_dir_current,
                                            optimizer,
                                            scheduler,
                                            model=model,
                                            results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args.output_dir,
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args.wandb_project or self.is_sweeping:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args.early_stopping_metric]
                            if args.save_best_model:
                                self.save_model(args.best_model_dir,
                                                optimizer,
                                                scheduler,
                                                model=model,
                                                results=results)
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                if args.save_best_model:
                                    self.save_model(args.best_model_dir,
                                                    optimizer,
                                                    scheduler,
                                                    model=model,
                                                    results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step if not self.
                                            args.evaluate_during_training else
                                            training_progress_scores,
                                        )
                        else:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                if args.save_best_model:
                                    self.save_model(args.best_model_dir,
                                                    optimizer,
                                                    scheduler,
                                                    model=model,
                                                    results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step if not self.
                                            args.evaluate_during_training else
                                            training_progress_scores,
                                        )
                        model.train()

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current,
                                optimizer,
                                scheduler,
                                model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results = self.eval_model(
                    eval_data,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=args.evaluate_during_training_silent,
                    **kwargs,
                )

                if args.save_eval_checkpoints:
                    self.save_model(output_dir_current,
                                    optimizer,
                                    scheduler,
                                    results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args.output_dir,
                                           "training_progress_scores.csv"),
                              index=False)

                if args.wandb_project or self.is_sweeping:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    if args.save_best_model:
                        self.save_model(args.best_model_dir,
                                        optimizer,
                                        scheduler,
                                        model=model,
                                        results=results)
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        if args.save_best_model:
                            self.save_model(args.best_model_dir,
                                            optimizer,
                                            scheduler,
                                            model=model,
                                            results=results)
                        early_stopping_counter = 0
                    else:
                        if args.use_early_stopping and args.early_stopping_consider_epochs:
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )
                else:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        if args.save_best_model:
                            self.save_model(args.best_model_dir,
                                            optimizer,
                                            scheduler,
                                            model=model,
                                            results=results)
                        early_stopping_counter = 0
                    else:
                        if args.use_early_stopping and args.early_stopping_consider_epochs:
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )

        return (
            global_step,
            tr_loss / global_step if not self.args.evaluate_during_training
            else training_progress_scores,
        )

    def eval_model(self,
                   eval_data,
                   output_dir=None,
                   verbose=True,
                   silent=False,
                   **kwargs):
        """
        Evaluates the model on eval_data. Saves results to output_dir.

        Args:
            eval_data: Pandas DataFrame containing the 2 columns - `input_text`, `target_text`.
                        - `input_text`: The input text sequence.
                        - `target_text`: The target text sequence.
            output_dir: The directory where model files will be saved. If not given, self.args.output_dir will be used.
            verbose: If verbose, results will be printed to the console on completion of evaluation.
            silent: If silent, tqdm progress bars will be hidden.
            **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use).
                        A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions. Both inputs
                        will be lists of strings. Note that this will slow down evaluation significantly as the predicted sequences need to be generated.
        Returns:
            results: Dictionary containing evaluation results.
        """  # noqa: ignore flake8"

        if not output_dir:
            output_dir = self.args.output_dir

        self._move_model_to_device()

        eval_dataset = self.load_and_cache_examples(eval_data,
                                                    evaluate=True,
                                                    verbose=verbose,
                                                    silent=silent)
        os.makedirs(output_dir, exist_ok=True)

        result = self.evaluate(eval_dataset,
                               output_dir,
                               verbose=verbose,
                               silent=silent,
                               **kwargs)
        self.results.update(result)

        if self.args.evaluate_generated_text:
            to_predict = eval_data["input_text"].tolist()
            preds = self.predict(to_predict)

            result = self.compute_metrics(eval_data["target_text"].tolist(),
                                          preds, **kwargs)
            self.results.update(result)

        if verbose:
            logger.info(self.results)

        return self.results
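For orientation, a hedged usage sketch of eval_model() follows; `model` is an assumed, already-built instance of the Seq2Seq model class above, and the DataFrame contents and the custom metric are purely illustrative.

# Hedged usage sketch for eval_model(); names below are assumptions, not part of the class.
import pandas as pd

eval_df = pd.DataFrame({
    "input_text": ["the cat sat on the mat ."],
    "target_text": ["a cat sat on a mat ."],
})

def exact_match(labels, preds):
    # Custom metric: share of predictions identical to their reference.
    return sum(l == p for l, p in zip(labels, preds)) / len(labels)

# The custom metric is only evaluated when args.evaluate_generated_text is True,
# because the predicted sequences have to be generated first.
results = model.eval_model(eval_df, exact_match=exact_match)
print(results)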

    def evaluate(self,
                 eval_dataset,
                 output_dir,
                 verbose=True,
                 silent=False,
                 **kwargs):
        """
        Evaluates the model on eval_dataset.

        Utility function to be used by the eval_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        eval_output_dir = output_dir

        results = {}

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_loss = 0.0
        nb_eval_steps = 0
        model.eval()

        if self.args.fp16:
            from torch.cuda import amp

        for batch in tqdm(eval_dataloader,
                          disable=args.silent or silent,
                          desc="Running Evaluation"):
            # batch = tuple(t.to(device) for t in batch)

            inputs = self._get_inputs_dict(batch)
            with torch.no_grad():
                if self.args.fp16:
                    with amp.autocast():
                        outputs = model(**inputs)
                        tmp_eval_loss = outputs[0]
                else:
                    outputs = model(**inputs)
                    tmp_eval_loss = outputs[0]
                if self.args.n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()
                eval_loss += tmp_eval_loss.item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps

        results["eval_loss"] = eval_loss

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

        return results

    def predict(self, to_predict):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A python list of text (str) to be sent to the model for prediction. Note that the prefix should be prepended to the text.

        Returns:
            preds: A python list of the generated sequences.
        """  # noqa: ignore flake8"

        self._move_model_to_device()

        all_outputs = []
        # Batching
        for batch in tqdm(
            [
                to_predict[i:i + self.args.eval_batch_size]
                for i in range(0, len(to_predict), self.args.eval_batch_size)
            ],
                desc="Generating outputs",
                disable=self.args.silent,
        ):
            input_ids = self.encoder_tokenizer.batch_encode_plus(
                batch,
                max_length=self.args.max_seq_length,
                padding="max_length",
                return_tensors="pt",
                truncation=True,
            )["input_ids"]
            input_ids = input_ids.to(self.device)

            outputs = self.model.generate(
                input_ids=input_ids,
                num_beams=self.args.num_beams,
                max_length=self.args.max_length,
                length_penalty=self.args.length_penalty,
                early_stopping=self.args.early_stopping,
                repetition_penalty=self.args.repetition_penalty,
                do_sample=self.args.do_sample,
                top_k=self.args.top_k,
                top_p=self.args.top_p,
                num_return_sequences=self.args.num_return_sequences,
            )

            all_outputs.extend(outputs.cpu().numpy())

        if self.args.use_multiprocessed_decoding:
            if self.args.multiprocessing_chunksize == -1:
                chunksize = max(
                    len(all_outputs) // (self.args.process_count * 2), 500)
            else:
                chunksize = self.args.multiprocessing_chunksize

            self.model.to("cpu")
            with Pool(self.args.process_count) as p:
                outputs = list(
                    tqdm(
                        p.imap(self._decode, all_outputs, chunksize=chunksize),
                        total=len(all_outputs),
                        desc="Decoding outputs",
                        disable=self.args.silent,
                    ))
            self._move_model_to_device()
        else:
            outputs = [
                self.decoder_tokenizer.decode(
                    output_id,
                    skip_special_tokens=self.args.skip_special_tokens,
                    clean_up_tokenization_spaces=True)
                for output_id in all_outputs
            ]

        data_list = []

        for data in outputs:
            if isinstance(data, str):
                data = data.replace('. ', '.')
                data = data.replace(' .', '.')
                if data.endswith('.'):
                    data = data.replace('.', ' .')
                if data.endswith('?'):
                    data = data.replace('?', ' ?')
            data_list.append(data)

        if self.args.num_return_sequences > 1:
            return [
                data_list[i:i + self.args.num_return_sequences] for i in range(
                    0, len(data_list), self.args.num_return_sequences)
            ]
        else:
            return data_list
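A minimal, hedged sketch of predict() in use; `model` is again an assumed instance and the input strings are made up.

# Hedged usage sketch for predict().
to_predict = [
    "The motorway was closed after the accident, delaying travel to the ground.",
    "Northern Ireland have three points from their first two qualifiers.",
]
preds = model.predict(to_predict)
# With num_return_sequences == 1 this is a flat list of strings;
# with more return sequences it becomes a list of lists, one per input.
for source, pred in zip(to_predict, preds):
    print(source, "->", pred)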

    def _decode(self, output_id):
        return self.decoder_tokenizer.decode(
            output_id,
            skip_special_tokens=self.args.skip_special_tokens,
            clean_up_tokenization_spaces=True)

    def compute_metrics(self, labels, preds, **kwargs):
        """
        Computes the evaluation metrics for the model predictions.

        Args:
            labels: List of target sequences
            preds: List of model generated outputs
            **kwargs: Custom metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use).
                        A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions. Both inputs
                        will be lists of strings. Note that this will slow down evaluation significantly as the predicted sequences need to be generated.

        Returns:
            result: Dictionary containing evaluation results.
        """  # noqa: ignore flake8"
        # assert len(labels) == len(preds)

        results = {}
        for metric, func in kwargs.items():
            results[metric] = func(labels, preds)

        return results

    def load_and_cache_examples(self,
                                data,
                                evaluate=False,
                                no_cache=False,
                                verbose=True,
                                silent=False):
        """
        Creates a SimpleSummarizationDataset (or the custom dataset class from args) from data.

        Utility function for train() and eval() methods. Not intended to be used directly.
        """

        encoder_tokenizer = self.encoder_tokenizer
        decoder_tokenizer = self.decoder_tokenizer
        args = self.args

        if not no_cache:
            no_cache = args.no_cache

        if not no_cache:
            os.makedirs(self.args.cache_dir, exist_ok=True)

        mode = "dev" if evaluate else "train"

        if args.dataset_class:
            CustomDataset = args.dataset_class
            return CustomDataset(encoder_tokenizer, decoder_tokenizer, args,
                                 data, mode)
        else:
            return SimpleSummarizationDataset(encoder_tokenizer, self.args,
                                              data, mode)

    def _create_training_progress_scores(self, **kwargs):
        extra_metrics = {key: [] for key in kwargs}
        training_progress_scores = {
            "global_step": [],
            "eval_loss": [],
            "train_loss": [],
            **extra_metrics,
        }

        return training_progress_scores

    def _get_last_metrics(self, metric_values):
        return {metric: values[-1] for metric, values in metric_values.items()}

    def save_model(self,
                   output_dir=None,
                   optimizer=None,
                   scheduler=None,
                   model=None,
                   results=None):
        if not output_dir:
            output_dir = self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)

        logger.info(f"Saving model into {output_dir}")

        if model and not self.args.no_save:
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, "module") else model
            self.save_model_args(output_dir)

            os.makedirs(os.path.join(output_dir), exist_ok=True)
            model_to_save.save_pretrained(output_dir)
            self.config.save_pretrained(output_dir)
            self.encoder_tokenizer.save_pretrained(output_dir)

            torch.save(self.args, os.path.join(output_dir,
                                               "training_args.bin"))
            if optimizer and scheduler and self.args.save_optimizer_and_scheduler:
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))

        if results:
            output_eval_file = os.path.join(output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                for key in sorted(results.keys()):
                    writer.write("{} = {}\n".format(key, str(results[key])))

    def _move_model_to_device(self):
        self.model.to(self.device)

    def _get_inputs_dict(self, batch):
        device = self.device
        pad_token_id = self.encoder_tokenizer.pad_token_id
        source_ids, source_mask, y = batch["source_ids"], batch[
            "source_mask"], batch["target_ids"]
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone()
        labels[y[:, 1:] == pad_token_id] = -100

        inputs = {
            "input_ids": source_ids.to(device),
            "attention_mask": source_mask.to(device),
            "decoder_input_ids": y_ids.to(device),
            "labels": labels.to(device),
        }

        return inputs

    def save_model_args(self, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        self.args.save(output_dir)

    def _load_model_args(self, input_dir=None):
        args = Seq2SeqArgs()
        if input_dir is not None:
            args.load(input_dir)
        return args

    def get_named_parameters(self):
        return [n for n, p in self.model.named_parameters()]
Example #13
0
    print(args)
    if args.t5_type == "e2e" and args.is_fquad:
        print(
            "WARNING: e2e is meant to generate questions from context. The output of the script will be a csv instead of a json."
        )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    model_created = False
    print("Loading model and tokenizer...", end="", flush=True)
    if args.checkpoint is not None:
        model_created = True
        if args.bart:
            config = BartConfig.from_json_file(args.checkpoint +
                                               "/config.json")
            model = BartForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        if args.t5:
            config = T5Config.from_json_file(args.checkpoint + "/config.json")
            model = T5ForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        elif not args.bart and not args.t5:
            config = EncoderDecoderConfig.from_json_file(args.checkpoint +
                                                         "/config.json")
            model = EncoderDecoderModel.from_pretrained(args.checkpoint +
                                                        "/pytorch_model.bin",
                                                        config=config)
        model_name = args.checkpoint

    if args.bart:
        if args.checkpoint is None:
            model_name = "WikinewsSum/bart-large-multi-fr-wiki-news" if args.model_name == "" else args.model_name
Example #14
0
def run(args):
    save_args(args, with_tensorboard=True)

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    tokenizer.bos_token = BOS_OUTPUT_TOKEN  # For decoding specifically

    train_dataset, eval_dataset, test_dataset = [
        SequentialJSONIterableDataset(
            os.path.join(args.datadir, f"{split}_*.clf.jsonl"),
            args=args,
            process_lines=False,
            reservoir_shuffle=shuffle,
            repeat=repeat,
            reservoir_size=args.reservoir_shuffle_size,
        )
        for (split, shuffle, repeat) in [
            ("train", True, True),
            ("valid", False, True),
            ("test", False, False),
        ]
    ]

    # Multiple inputs. Use Informativeness
    if args.input_type == "all":
        # ControlCode or generic Bart
        model = MultiInputBartForConditionalGeneration.from_pretrained(
            args.model_name_or_path
        )
        # MultiHead
        if args.use_multi_head:
            model = MultiInputMultiHeadBartForConditionalGeneration.from_pretrained_multi(
                args.model_name_or_path
            )

    elif args.use_multi_head:
        # MultiHead
        model = MultiHeadBartForConditionalGeneration.from_pretrained_multi(
            args.model_name_or_path
        )
    else:
        # ControlCode or generic Bart
        model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path)

    # Set special token IDs for eval function
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id  # keep padding consistent with the tokenizer

    if args.cuda:
        model = model.to("cuda")
    if args.distributed:
        model = utils_dist.wrap(model, args)

    optimizer = get_optimizer(args, model)
    if args.use_apex and HAS_APEX:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    collate_fn_filled = functools.partial(
        collate_fn,
        input_type=args.input_type,
        modify_prefix=(not args.no_modify_prefix),
        target_type=args.target_type,
    )

    if args.test_only:  # run on test set
        print("=== TEST/EVAL ONLY, no training")

        named_splits = {
            "train": train_dataset,
            "valid": eval_dataset,
            "test": test_dataset,
        }

        selected_split = named_splits[args.test_split]
        eval_iter = DataLoader(
            selected_split,
            batch_size=args.eval_batch_size,
            collate_fn=collate_fn_filled,
            num_workers=1,
            worker_init_fn=worker_init_fn,
        )
        results = evaluation(args, model, 0, tokenizer, eval_iter, write_summary=False)

        print(results["rouge_scores"])

        # Save results in JSON file
        results_filename = Path(args.logdir) / f"{args.test_split}_results.json"

        with results_filename.open("w") as f:
            json.dump(results, f, indent=2, sort_keys=True)
        return

    model.train()

    global_step = 0
    grad_acc_step = 0
    loss_tensor_log = []

    train_iter = DataLoader(
        train_dataset,
        batch_size=args.per_unit_batch_size,
        collate_fn=collate_fn_filled,
        num_workers=args.num_data_workers,
        worker_init_fn=worker_init_fn,
    )

    eval_iter = DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn_filled,
        num_workers=args.num_data_workers,
        worker_init_fn=worker_init_fn,
    )

    for _, (_, input_texts, output_texts) in enumerate(train_iter):
        if len(input_texts) == 0:
            continue

        # Prohibit batches with no contribution summaries at all
        if sum(len(out) for out in output_texts) == 0:
            continue

        # MultiHead + Auxiliary loss (Informativeness)
        if args.target_type == "both" and args.use_multi_head:
            if args.input_type == "paper":
                ips = [i for ip in input_texts for i in ip]
            elif args.input_type == "all":
                ips = [list(ip[0]) for ip in input_texts]

            ops = [o for op in output_texts for o in op]
            tok_input, tok_output, labels = tokenize_batch(
                ips, ops, model, tokenizer, args
            )

            # Prepare inputs
            if args.input_type == "paper":
                tok_input["input_ids"] = tok_input["input_ids"].view(
                    args.per_unit_batch_size, 2, -1
                )[:, 0, :]
                tok_input["attention_mask"] = tok_input["attention_mask"].view(
                    args.per_unit_batch_size, 2, -1
                )[:, 0, :]
                additional_kwargs = {}
            elif args.input_type == "all":
                new_tok_input = {}
                new_tok_input["input_ids"] = [t["input_ids"] for t in tok_input]
                new_tok_input["attention_mask"] = [
                    t["attention_mask"] for t in tok_input
                ]
                tok_input = new_tok_input
                additional_kwargs = {
                    "final_layer": [None, None, None],
                    "input_modes": ["LogL", "MI_inbound", "MI_outbound"],
                    "informativeness": args.use_informativeness,
                }

            # b x [cont, ctx] x seq_len
            tok_output["input_ids"] = tok_output["input_ids"].view(
                args.per_unit_batch_size, 2, -1
            )
            tok_output["attention_mask"] = tok_output["attention_mask"].view(
                args.per_unit_batch_size, 2, -1
            )
            labels = labels.view(args.per_unit_batch_size, 2, -1)

            # Fixing the strange behavior of torch.distributed where some values
            # are overwritten when the sequence length is just one.
            for b in range(args.per_unit_batch_size):
                tok_output["input_ids"][b][tok_output["input_ids"][b][:, 0] == 1, 0] = 2
                tok_output["attention_mask"][b][
                    tok_output["attention_mask"][b][:, 0] == 0, 0
                ] = 1
                labels[b][labels[b][:, 0] == -100, 0] = 2

            all_labels = []
            all_dec_inputs = []
            for idx in range(2):
                all_dec_inputs.append(
                    dict(
                        input_ids=tok_output["input_ids"][:, idx, :],
                        attention_mask=tok_output["attention_mask"][:, idx, :],
                    )
                )
                all_labels.append(labels[:, idx, :])

            # Disable sync except at the beginning and the end of gradient accumulation
            if args.distributed:
                if (grad_acc_step == 0) or (
                    (grad_acc_step + 1) % args.gradient_accumulation_steps == 0
                ):
                    model.require_forward_param_sync = True
                    model.require_backward_grad_sync = True
                else:
                    model.require_forward_param_sync = False
                    model.require_backward_grad_sync = False

            outs = model(
                input_ids=tok_input["input_ids"],
                attention_mask=tok_input["attention_mask"],
                decoder_input_ids=[
                    shift_left(tok_output["input_ids"], tokenizer.bos_token_id)
                    for tok_output in all_dec_inputs
                ],
                decoder_attention_mask=[
                    tok_output["attention_mask"] for tok_output in all_dec_inputs
                ],
                lm_labels=all_labels,
                **additional_kwargs,
            )

            # MultiHead + Informativeness
            if args.input_type == "all":
                # losses for generating both contrib & context
                if args.use_informativeness:
                    # MI_outbound: informativeness
                    contrib_loss = (
                        outs["LogL"][0][0] + args.aux_scale * outs["MI_outbound"][0][0]
                    )
                    context_loss = (
                        outs["LogL"][1][0] + args.aux_scale * outs["MI_inbound"][1][0]
                    )
                else:
                    contrib_loss = (
                        outs["LogL"][0][0]
                        - args.aux_scale * outs["MI_inbound"][0][0]
                        + (
                            args.aux_scale * outs["MI_outbound"][0][0]
                            if not args.use_adaptive_scale
                            else 0
                        )
                    )
                    context_loss = (
                        outs["LogL"][1][0]
                        + (
                            args.aux_scale * outs["MI_inbound"][1][0]
                            if not args.use_adaptive_scale
                            else 0
                        )
                        - args.aux_scale * outs["MI_outbound"][1][0]
                    )
                loss = (contrib_loss + context_loss) / 2
                losses = [
                    outs["LogL"][0][0],
                    outs["MI_inbound"][0][0],
                    outs["MI_outbound"][0][0],
                    outs["LogL"][1][0],
                    outs["MI_inbound"][1][0],
                    outs["MI_outbound"][1][0],
                ]

            # multihead
            else:
                # contrib, context
                losses = [o[0] for o in outs]
                loss = sum(losses) / len(losses)

            check_nan(loss)

            # reporting logL only
            loss_tensor_log.append(
                (losses[0] if args.input_type == "all" else loss).detach()
            )

            global_step, grad_acc_step, loss_tensor_log = update_step(
                args,
                model,
                tokenizer,
                optimizer,
                loss,
                losses,
                eval_iter,
                global_step,
                grad_acc_step,
                loss_tensor_log,
            )

        else:
            # For compatibility of training loop
            if args.target_type != "both":
                input_texts, output_texts = ([input_texts], [output_texts])

            input_texts, output_texts = zip(*input_texts), zip(*output_texts)
            heads = ["contrib", "context"]
            losses = []

            # loop over the two targets
            for input_text, output_text, head in zip(input_texts, output_texts, heads):
                tok_input, tok_output, labels = tokenize_batch(
                    input_text, output_text, model, tokenizer, args
                )

                if args.distributed:
                    if (grad_acc_step == 0) or (
                        (grad_acc_step + 1) % args.gradient_accumulation_steps == 0
                    ):
                        model.require_forward_param_sync = True
                        model.require_backward_grad_sync = True
                    else:
                        model.require_forward_param_sync = False
                        model.require_backward_grad_sync = False

                # Auxiliary loss: informativeness
                if args.input_type == "all":
                    outs = model(
                        input_ids=[t["input_ids"] for t in tok_input],
                        attention_mask=[t["attention_mask"] for t in tok_input],
                        decoder_input_ids=shift_left(
                            tok_output["input_ids"], tokenizer.bos_token_id
                        ),
                        decoder_attention_mask=tok_output["attention_mask"],
                        lm_labels=labels,
                    )
                else:
                    outs = model(
                        input_ids=tok_input["input_ids"],
                        attention_mask=tok_input["attention_mask"],
                        decoder_input_ids=shift_left(
                            tok_output["input_ids"], tokenizer.bos_token_id
                        ),
                        decoder_attention_mask=tok_output["attention_mask"],
                        lm_labels=labels,
                    )

                if args.input_type == "all":
                    losses += outs[0]
                    if args.target_type == "contrib":
                        if args.use_informativeness:
                            coeff = [
                                1,
                                0,
                                args.aux_scale,
                            ]
                        else:
                            coeff = [
                                1,
                                -args.aux_scale,
                                args.aux_scale if not args.use_adaptive_scale else 0,
                            ]
                    elif args.target_type == "context":
                        if args.use_informativeness:
                            coeff = [
                                1,
                                args.aux_scale,
                                0,
                            ]
                        else:
                            coeff = [
                                1,
                                args.aux_scale if not args.use_adaptive_scale else 0,
                                -args.aux_scale,
                            ]

                    loss = sum(l * c for l, c in zip(outs[0], coeff))

                elif args.use_multi_head:
                    loss = outs[0 if head == "contrib" else 1][0]

                else:
                    loss = outs[0]

                check_nan(loss)

                loss_tensor_log.append(
                    (losses[0] if args.input_type == "all" else loss).detach()
                )

                global_step, grad_acc_step, loss_tensor_log = update_step(
                    args,
                    model,
                    tokenizer,
                    optimizer,
                    loss,
                    losses,
                    eval_iter,
                    global_step,
                    grad_acc_step,
                    loss_tensor_log,
                )
Example #15
0
class KoBartModel(object):
    """KoBart Model from SKT"""

    def __init__(self, model: str, device: str):
        config = BartConfig.from_pretrained("hyunwoongko/kobart")
        self.model = BartForConditionalGeneration(config).half().eval().to(
            device)
        self.model.model.load_state_dict(torch.load(
            model,
            map_location=device,
        ))
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "hyunwoongko/kobart")
        self.device = device

    @classmethod
    def from_pretrained(
        cls,
        device: str,
        model_path: str = "path/to/model.pt",
    ):
        """
        load pretrained model from disk.
        this method is equivalent with constructor.

        Args:
            device (str): device
            model_path (str): full model path

        Returns:
            (KoBartModel): object of KoBartModel

        """
        return cls(model=model_path, device=device)

    def tokenize(
        self,
        texts: List[str],
        max_len: int = 1024,
    ) -> Dict:
        if isinstance(texts, str):
            texts = [texts]

        texts = [f"<s> {text}" for text in texts]
        eos = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)
        eos_list = [eos for _ in range(len(texts))]

        tokens = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            add_special_tokens=False,
            max_length=max_len - 1,  # reserve one position for the <eos> appended below
        )

        return self.add_bos_eos_tokens(tokens, eos_list)

    def add_bos_eos_tokens(self, tokens, eos_list):
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]
        token_added_ids, token_added_masks = [], []

        for input_id, atn_mask, eos in zip(
                input_ids,
                attention_mask,
                eos_list,
        ):
            maximum_idx = [
                i for i, val in enumerate(input_id)
                if val != self.tokenizer.convert_tokens_to_ids("<pad>")
            ]

            if len(maximum_idx) == 0:
                idx_to_add = 0
            else:
                idx_to_add = max(maximum_idx) + 1

            eos = torch.tensor([eos], requires_grad=False)
            additional_atn_mask = torch.tensor([1], requires_grad=False)

            input_id = torch.cat([
                input_id[:idx_to_add],
                eos,
                input_id[idx_to_add:],
            ]).long()

            atn_mask = torch.cat([
                atn_mask[:idx_to_add],
                additional_atn_mask,
                atn_mask[idx_to_add:],
            ]).long()

            token_added_ids.append(input_id.unsqueeze(0))
            token_added_masks.append(atn_mask.unsqueeze(0))

        tokens["input_ids"] = torch.cat(token_added_ids, dim=0)
        tokens["attention_mask"] = torch.cat(token_added_masks, dim=0)
        return tokens

    @torch.no_grad()
    def translate(
        self,
        text: str,
        beam: int = 5,
        sampling: bool = False,
        temperature: float = 1.0,
        sampling_topk: int = -1,
        sampling_topp: float = -1,
        length_penalty: float = 1.0,
        max_len_a: int = 1,
        max_len_b: int = 50,
        no_repeat_ngram_size: int = 4,
        return_tokens: bool = False,
        bad_words_ids=None,
    ):
        """
        generate sentence from input sentence.

        See Also:
            1. method and argument names follow fairseq.models.transformer.TransformerModel
            >>> from fairseq.models.transformer import TransformerModel

            2. language codes follow fairseq language codes
            >>> from transformers.tokenization_mbart import FAIRSEQ_LANGUAGE_CODES

        Args:
            text (str): input string
            beam (int): beam size
            sampling (bool): sampling or not
            temperature (float): temperature value
            sampling_topk (int): topk sampling
            sampling_topp (float): topp sampling probs
            return_tokens (bool): return tokens or not

        Returns:
            (str): generated sentence string (if return_tokens=False)
            (List[str]): list of generated tokens (if return_tokens=True)

        """

        if isinstance(text, str):
            texts = [text]
        else:
            texts = text

        tokenized = self.tokenize(texts)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        generated = self.model.generate(
            input_ids.to(self.device),
            attention_mask=attention_mask.to(self.device),
            use_cache=True,
            early_stopping=False,
            decoder_start_token_id=self.tokenizer.bos_token_id,
            num_beams=beam,
            do_sample=sampling,
            temperature=temperature,
            top_k=sampling_topk if sampling_topk > 0 else None,
            top_p=sampling_topp if sampling_topp > 0 else None,
            no_repeat_ngram_size=no_repeat_ngram_size,
            bad_words_ids=[[self.tokenizer.convert_tokens_to_ids("<unk>")]]
            if not bad_words_ids else bad_words_ids +
            [[self.tokenizer.convert_tokens_to_ids("<unk>")]],
            length_penalty=length_penalty,
            max_length=max_len_a * len(input_ids[0]) + max_len_b,
        )

        if return_tokens:
            output = [
                self.tokenizer.convert_ids_to_tokens(_)
                for _ in generated.tolist()
            ]

            return output[0] if isinstance(text, str) else output

        else:
            output = self.tokenizer.batch_decode(
                generated.tolist(),
                skip_special_tokens=True,
            )

            return (output[0].strip() if isinstance(text, str)
                    else [o.strip() for o in output])
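A hedged usage sketch of the class above; the model path is a placeholder and the Korean inputs are illustrative only.

# Hypothetical usage of KoBartModel; "path/to/model.pt" is a placeholder path.
kobart = KoBartModel.from_pretrained(device="cuda", model_path="path/to/model.pt")
print(kobart.translate("안녕하세요. 반갑습니다.", beam=5, max_len_b=64))
# A list input returns a list of outputs instead of a single string.
print(kobart.translate(["첫 번째 문장", "두 번째 문장"], sampling=True, temperature=0.8))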
Example #16
0
 def __init__(self):
     self.model = BartForConditionalGeneration.from_pretrained("./models/bart-coder")
     self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
 def __init__(self, args):
     """
     currently we use the model `bart-large-cnn`
     """
     self.model = BartForConditionalGeneration.from_pretrained(args['model_id'])
     self.tokenizer = BartTokenizer.from_pretrained(args['model_id'])
Example #18
0
    raise NotImplementedError

  def get_iter():
    data = read_qa_data(args.data, eval(args.domains.upper()), split=args.split, has_ret=args.has_ret, use_inp=args.use_inp, load=args.num_options > 0)
    iter = string_to_tensor(tokenizer, data, max_input_len=max_input_len, max_target_len=max_target_len, max_token_per_batch=args.max_token_per_batch,
                            append_bos=append_bos, append_eos=append_eos, num_options=args.num_options, qa_merge=qa_merge, device=device)
    return iter

  print('loading models ...')
  if args.model == 'facebook/bart-large':
    model = MyBart.from_pretrained(base_model).to(device)
  else:
    if args.model_type == 'bart':
      model = MyBart.from_pretrained(base_model, state_dict=torch.load(args.model)).to(device)
    elif args.model_type == 'bart_raw':
      model = BartForConditionalGeneration.from_pretrained(base_model, state_dict=torch.load(args.model)).to(device)
    elif args.model_type == 'gpt2':
      model = MyGPT2.from_pretrained(base_model, state_dict=torch.load(args.model)).to(device)
    else:
      raise NotImplementedError

  os.makedirs(os.path.dirname(args.output), exist_ok=True)
  if args.task == 'generate':
    model.eval()
    with open(args.data, 'r') as fin, open(args.output, 'w') as fout:
      for l in tqdm(fin):
        question = l.strip()
        res = model.generate(torch.LongTensor([[0] + tokenizer.encode(question)]).to(device),
                             num_beams=1, min_length=1, max_length=50, early_stopping=False, num_return_sequences=1)
        res = [tokenizer.decode(x, skip_special_tokens=True).strip() for x in res]
        fout.write('{}\n'.format(res[0]))
Example #19
0
def get_kobart_for_conditional_generation():
    return BartForConditionalGeneration.from_pretrained("hyunwoongko/kobart")
Example #20
0
    def __init__(
        self,
        model_name_or_path, # teacher
        tokenizer_name,
        model_cache_dir,
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
        student_encoder_layers,
        student_decoder_layers,
        **kwargs,
    ):
        super().__init__(
            input_max_length,
            target_max_length,
            summary_column_name,
            document_column_name,
            wandb_project,
            wandb_run_name,
        )
        self.tokenizer = BartTokenizer.from_pretrained(
            tokenizer_name if tokenizer_name else model_name_or_path,
            cache_dir=model_cache_dir,
        )
        teacher = BartForConditionalGeneration.from_pretrained(
            model_name_or_path, cache_dir=model_cache_dir,
        ).eval()

        student_updates = {
            "decoder_layers": student_decoder_layers,
            "encoder_layers": student_encoder_layers,
        }
        d_layers_to_copy = self._get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
        e_layers_to_copy: List = self._get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
        kw = teacher.config.to_diff_dict()
        kw.update(student_updates)
        # Copy weights
        student_cfg = BartConfig(**kw)
        student = BartForConditionalGeneration(student_cfg)
        student, _ = self._init_student(student, teacher)
        self._copy_to_student(d_layers_to_copy, e_layers_to_copy, student_encoder_layers, student_decoder_layers, student, teacher)
        self.model = student
        print(student)
        inputs = self.tokenizer.encode_plus("TEXT TO SUMMARIZE", max_length=1024, return_tensors="pt")

        # Summarize
        outputs = self.model.generate(
            input_ids=inputs['input_ids'], 
            attention_mask=inputs['attention_mask'], 
            max_length=400, 
            min_length=150, 
            length_penalty=2.0, 
            num_beams=4, 
            early_stopping=True
        )

        # Decode
        summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        print(summary)
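The helpers _get_layers_to_copy, _init_student and _copy_to_student are not shown above; as a hedged sketch (not the author's implementation), a layer-selection helper of this kind typically picks roughly evenly spaced teacher layers for the student:

# Hypothetical sketch of a layer-selection helper for BART distillation:
# choose `n_student` roughly evenly spaced layer indices out of `n_teacher`.
def get_layers_to_copy(n_student: int, n_teacher: int):
    if n_student == n_teacher:
        return list(range(n_teacher))
    step = n_teacher / n_student  # spread the student layers across the teacher's depth
    return sorted({min(int(i * step), n_teacher - 1) for i in range(n_student)})

print(get_layers_to_copy(3, 12))   # e.g. [0, 4, 8]
print(get_layers_to_copy(12, 12))  # identity mapping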
Example #21
0
from flask import Flask, request, render_template
from regression import model1, tokenizer_new, tokenize_new
import numpy as np

from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
import torch
config = BartConfig.from_json_file('output_model/hate/config.json')
model = BartForConditionalGeneration.from_pretrained('output_model/hate/')
tok = BartTokenizer.from_pretrained('output_model/hate/')

app = Flask(__name__)
app.debug = True


@app.route("/", methods=['GET', 'POST'])
def index():
    if request.method == "POST":
        name = request.form["name"]
        hate = " "
        if (len(name) > 0):
            if name.split(" ")[-1] == '':
                a, b, c = tokenize_new([name], tokenizer_new)
                out = np.round(model1.predict([a, b])[0][0])
                if out <= 3:
                    hate = "No Hate detected"
                elif out > 3 and out <= 5:
                    hate = "LOW"
                elif out > 5 and out <= 7:
                    hate = "MEDIUM"
                else:
                    hate = "HIGH"
Example #22
0
import argparse
from pathlib import Path

import torch
from tqdm import tqdm

from transformers import BartForConditionalGeneration, BartTokenizer

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

SUMMARY_TOKENIZER = BartTokenizer.from_pretrained('bart-large-cnn')
SUMMARY_MODEL = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
SUMMARY_MODEL.to(torch_device)
SUMMARY_MODEL.eval()


def clean_text(text):
    tokens = text.split(" ")
    if '[SEP]' in tokens:
        sepind = tokens.index('[SEP]')
        tokens = tokens[sepind + 1:]
    txt = ' '.join(tokens)
    txt = txt.replace(' ##', '')
    txt = txt.replace('##', '')
    txt = txt.strip()
    txt = " ".join(txt.split())
    txt = txt.replace(' .', '.')
    txt = txt.replace('( ', '(')
    txt = txt.replace(' )', ')')
Example #23
0
 def test_generate_fp16(self):
     config, input_ids, batch_size = self._get_config_and_data()
     attention_mask = input_ids.ne(1).to(torch_device)
     model = BartForConditionalGeneration(config).eval().to(torch_device).half()
     model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
Example #24
0
def Seq2Seq(df):
    model_type = 'facebook/bart-large'

    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)

    sep_token = '</s>'
    mask_token = '<mask>'

    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)
    
    df['mask_text'] = 0
    df['auxiliary_text'] = 0

    for i in range(len(df)):
        sentiment = df['sentiment'].iloc[i]
        
        aspect = 'sentence'

        if DPM_type == 'Senti':
            mask_sent = 'the polarity of the ' + aspect + ' is ' + mask_token + ' ' + sep_token  + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token  + ' '
   
        df['mask_text'].iloc[i] = mask_sent + df['text'].iloc[i]
        df['auxiliary_text'].iloc[i] = auxiliary_sent + df['text'].iloc[i]
        
    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)): 

        tokenized = df['mask_text'][i:i+1].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=MAX_LEN, truncation=True)))

        sep_index = tokenized[i].index(sep_id)
        mask_index = tokenized[i].index(mask_id)

        padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", 
                              value=0, truncating="post", padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)  
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu().numpy()


        distance = []

        for pertubed_index in range(sep_index+1, MAX_LEN):
            padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long", 
                              value=0, truncating="post", padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance
    
    
    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold:
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index
    
    
    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        tokenized = torch.Tensor(tokenized).unsqueeze(0).to(torch.int64).to(device)
        augment_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])

        mask_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        sep_index = mask_tokenized.index(sep_id)

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            mask_tokenized[perturbed_mask_index] = mask_id

        mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(torch.int64).to(device)
        logits = mask_model(mask_tokenized).logits

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            probs = logits[0, perturbed_mask_index].softmax(dim=0)
            values, predictions = probs.topk(1)
            augment_tokenized[perturbed_mask_index] = int(predictions.cpu().numpy())

        df['augment_token_id'].iloc[i] = augment_tokenized
    
    
    df['augment_text'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        sep_index = df['augment_token_id'].iloc[i].index(sep_id)
        df['augment_text'].iloc[i] = tokenizer.decode(df['augment_token_id'].iloc[i][sep_index+1:-1])

    return df
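The helpers mask_embedding, dist and calculate_threshold used above are not included in this excerpt; the following is only a plausible sketch consistent with how they are called (a forward pass on the perturbed ids, a Euclidean distance between mask embeddings, and a mean-minus-k-std threshold), not the author's code.

# Hypothetical implementations of the helpers assumed by the function above.
import numpy as np
import torch

def mask_embedding(model, padded, mask_index):
    # Re-run the encoder on the perturbed ids and return the hidden state
    # at the <mask> position as a numpy array.
    device = next(model.parameters()).device
    input_ids = torch.tensor(padded).to(device)
    attention_mask = torch.tensor(np.where(padded != 0, 1, 0)).to(device)
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask)
    return out[0][:, mask_index, :].cpu().numpy()

def dist(a, b):
    # Euclidean distance between two mask embeddings.
    return float(np.linalg.norm(a - b))

def calculate_threshold(distances, std_strength):
    # One plausible choice: tokens whose distance falls below mean - k * std
    # are treated as weakly tied to the sentiment and become re-mask candidates.
    return distances.mean() - std_strength * distances.std()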
Example #25
0
 def test_default_generate_kwargs(self):
     config, input_ids, _ = self._get_config_and_data()
     model = BartForConditionalGeneration(config).eval().to(torch_device)
     model.generate(input_ids)
     model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
Example #26
0
    def load_class(self):
        # Load the tokenizer.
        if self.verbose:
            print('Loading {} class...'.format(self.model_name))
        if self.model_name == 'bert':
            # Load BertForSequenceClassification, the pretrained BERT model with a single
            # linear classification layer on top.
            self.model = BertForSequenceClassification.from_pretrained(
                self.model_type,  # Use the 12-layer BERT model, with an uncased vocab.
                num_labels=self.num_labels,  # You can increase this for multi-class tasks.
                output_attentions=False,  # Whether the model returns attention weights.
                output_hidden_states=False,  # Whether the model returns all hidden-states.
            )
        if self.model_name == 'distilbert':
            self.model = DistilBertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'albert':
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'bart':
            if self.task == 'classification':
                self.model = BartForSequenceClassification.from_pretrained(
                    self.model_type,
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False,
                )
            if self.task == 'summarize':
                self.model = BartForConditionalGeneration.from_pretrained(
                    self.model_type)

        if self.model_name == 'xlnet':
            self.model = XLNetForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'roberta':
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'camenbert':
            self.model = CamembertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'flaubert':
            self.model = FlaubertForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.model_name == 'gpt2':
            self.model = GPT2LMHeadModel.from_pretrained(self.model_type)
Example #27
0
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}

HOSTNAME = '83.212.102.161'

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
#app.secret_key = "super secret key"

#app.config['JSON_AS_ASCII'] = False
bootstrap = Bootstrap(app)

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')

results = []
articles = []
id = 0


def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def submit(LONG_TEXT, id, max_words):
    article_input_ids = tokenizer.batch_encode_plus(
        [LONG_TEXT], return_tensors='pt',
        max_length=1024)['input_ids'].to(torch_device)
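    # (Hypothetical continuation, not the original code.) A typical next step would
    # generate summary ids from the encoded article and decode them; the generation
    # parameter values below are illustrative only.
    summary_ids = model.generate(
        article_input_ids,
        num_beams=4,
        length_penalty=2.0,
        max_length=142,
        min_length=56,
        no_repeat_ngram_size=3,
    )
    summary = tokenizer.decode(summary_ids[0],
                               skip_special_tokens=True,
                               clean_up_tokenization_spaces=True)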
Example #28
0
# Original index
## Base BERT
from transformers import BertTokenizer, BertForMaskedLM
## Base XLNet
from transformers import XLNetTokenizer, XLNetLMHeadModel
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()
## Base XLMRoberta
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()
## Base Bart
from transformers import BartTokenizer, BartForConditionalGeneration
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
bart_model = BartForConditionalGeneration.from_pretrained(
    'facebook/bart-large').eval()
## Base Eletra
from transformers import ElectraTokenizer, ElectraForMaskedLM
electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()
## Base Roberta
from transformers import RobertaTokenizer, RobertaForMaskedLM
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

## Bert VNese base uncased
from transformers import AutoTokenizer, AutoModelForMaskedLM
phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModelForMaskedLM.from_pretrained("vinai/phobert-base")
Example #29
0
def make_model(model_pth=None):
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
    if model_pth is not None:
        print("model loaded from {}".format(model_pth))
        model.load_state_dict(torch.load(model_pth))
    return model
Example #30
0
                              'a',
                              encoding='utf8') as fw:
                        fw.write(' .'.join(i.split('.')).replace('<n>', ' '))
                        fw.write('\n')

    print('inference_time: {}'.format(time.time() - start))


if __name__ == '__main__':
    assert args.dataset in ['cnndm','xsum','newsroom','multi-news','billsum','reddit','wikihow','arxiv','pubmed'],\
        '--dataset should be cnndm, xsum, newsroom, multi-news, billsum, reddit, wikihow, arxiv, pubmed'
    summ_use = None
    if args.dataset == 'cnndm':
        if not args.peg:
            path = 'facebook/bart-large-cnn'
            summ = BartForConditionalGeneration.from_pretrained(
                path, use_cache=False)
            tokenizer = BartTokenizer.from_pretrained(
                'facebook/bart-large-cnn')
    if args.dataset == 'xsum':
        if not args.peg:
            path = 'facebook/bart-large-xsum'
            summ = BartForConditionalGeneration.from_pretrained(
                path, use_cache=False)
            summ_use = BartForConditionalGeneration.from_pretrained(path)
            tokenizer = BartTokenizer.from_pretrained(
                'facebook/bart-large-xsum')
        else:
            path = 'google/pegasus-xsum'
            summ = PegasusForConditionalGeneration.from_pretrained(
                path, use_cache=False)
            if not args.train: