Example #1
    def load_data_to(self, ctxs: Dict[object, BiEncoderPassage], date):

        # Build a "_<year>_" key from the date; it is matched against file paths below.
        year = "_" + str(datetime.strptime(date, "%b-%d-%Y").year) + "_"

        tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

        print(f"Creating bi-encoder dict for {date}...")
        for file_path in tqdm(self.file_paths):

            if year in file_path:
                with open(file_path, 'rb') as f:
                    items = ijson.kvitems(f, '')
                    ocr_text_generators = []
                    for k, v in items:
                        if date in k:
                            ocr_text_generators.append(self.ocr_text_iter(v))

                if len(ocr_text_generators) == 0:
                    continue

                for gen in ocr_text_generators:
                    for layobj in gen:
                        title, passage, object_id = layobj
                        uid = object_id
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                        ctxs[uid] = BiEncoderPassage(passage, title)
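A hypothetical call, assuming an instance of the surrounding class and a date in the "%b-%d-%Y" format parsed above:

# Hypothetical usage sketch; `source` stands in for an instance of the class.
ctxs = {}
source.load_data_to(ctxs, "Jan-15-1923")
print(f"Loaded {len(ctxs)} passages")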
Example #2
 def __init__(
     self,
     model_name_or_path,
     tokenizer_name,
     model_cache_dir,
     max_length,
     with_title,
     wandb_project,
     wandb_run_name,
     **kwargs,
 ):
     super().__init__(
         max_length,
         with_title,
         wandb_project,
         wandb_run_name,
     )
     self.tokenizer = BartTokenizerFast.from_pretrained(
         tokenizer_name if tokenizer_name else model_name_or_path,
         cache_dir=model_cache_dir,
     )
     self.model = BartForSequenceOrderingWithMultiPointer.from_pretrained(
         model_name_or_path,
         cache_dir=model_cache_dir,
     )
Example #3
 def __init__(self):
     super(BartTokenizerWithMapping, self).__init__(
         huggingface_tokenizer=BartTokenizerFast.from_pretrained(
             'facebook/bart-large-cnn'),
         truncate_left=1,
         truncate_right=1,
         starting_tokens_ids=[0],
         ending_tokens_ids=[2])
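The hard-coded ids correspond to BART's special tokens; a quick check (not part of the original) shows that 0 and 2 are `<s>` and `</s>`:

# Sanity check: BART's bos/eos token ids are 0 and 2.
tok = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')
print(tok.bos_token_id, tok.eos_token_id)    # 0 2
print(tok.convert_ids_to_tokens([0, 2]))     # ['<s>', '</s>']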
Example #4
 def __init__(self):
     self.model = BartForConditionalGeneration.from_pretrained(
         "facebook/bart-large-cnn"
     )
     self.model.half()
     self.model.to(device)
     self.model.eval()
     self.tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large-cnn")
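A minimal summarization sketch built on the objects initialized above, assuming they are reachable as `model` and `tokenizer`, that `device` is a CUDA device (the model was cast to fp16), and that `article_text` is a hypothetical input string:

# Minimal sketch; `article_text` is a hypothetical input document.
inputs = tokenizer(article_text, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=142)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))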
Example #5
def short_cnn_bart_encoding(data):
    tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')
    texts = [a['content'] for a in data]
    encodings = tokenizer(texts, truncation=True, max_length=128, padding=True, return_attention_mask=True, return_token_type_ids=True)
    for idx, article in enumerate(tqdm(data)):
        article['content'] = encodings.data['input_ids'][idx]
        article['attention_mask'] = encodings.data['attention_mask'][idx]
        article['token_type_ids'] = encodings.data['token_type_ids'][idx]
    return data
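A hypothetical invocation; note that the function overwrites each article's 'content' field in place with its token ids:

# Hypothetical input data; 'content' is replaced by input_ids.
articles = [{'content': 'Stocks rallied on Friday.'},
            {'content': 'The committee met to discuss the budget.'}]
encoded = short_cnn_bart_encoding(articles)
print(encoded[0]['content'][:5], encoded[0]['attention_mask'][:5])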
Example #6
    def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):

        tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

        if self.n_random_papers:
            print("Random newspaper subset...")
            scan_names = []
            for file_path in tqdm(self.file_paths):
                with open(file_path, 'rb') as f:
                    items = ijson.kvitems(f, '')
                    for k, v in items:
                        scan_names.append(k)
            papers = list(set([self.get_paper_name(scan) for scan in scan_names]))
            papers.sort()
            print(f"{len(papers)} total papers...")

            random.seed(789)
            random_papers = random.sample(papers, self.n_random_papers)
            print(f"Selected random papers: {random_papers}")

        print("Creating bi-encoder dict...")
        for file_path in tqdm(self.file_paths):

            with open(file_path, 'rb') as f:
                items = ijson.kvitems(f, '')
                ocr_text_generators = []
                for k, v in items:
                    # Keep a scan only if it passes the optional month and
                    # random-paper filters.
                    if self.month_str and self.month_str not in k:
                        continue
                    if self.n_random_papers and self.get_paper_name(k) not in random_papers:
                        continue
                    ocr_text_generators.append(self.ocr_text_iter(v))

            if len(ocr_text_generators) == 0:
                continue

            for gen in ocr_text_generators:
                for layobj in gen:
                    title, passage, object_id = layobj
                    uid = object_id
                    if self.normalize:
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                    ctxs[uid] = BiEncoderPassage(passage, title)
Example #7
    def __init__(self,
                 data_dir: str,
                 batch_size=8,
                 pre_trained='',
                 with_answers=False):
        super().__init__()
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.with_answers = with_answers

        if pre_trained == 't5':
            self.tokenizer = T5TokenizerFast.from_pretrained(
                't5-base',
                extra_ids=0,
                additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
        elif pre_trained == 'bart':
            self.tokenizer = BartTokenizerFast.from_pretrained(
                'facebook/bart-base',
                extra_ids=0,
                additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
        else:
            raise Exception(
                f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')
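Note that `extra_ids` is a T5-specific argument; `BartTokenizerFast` defines no such parameter, and unknown keyword arguments to `from_pretrained` are kept rather than applied, so in the BART branch only `additional_special_tokens` should have any effect.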
Example #8
 def default_tokenizer_fast(self):
     return BartTokenizerFast.from_pretrained("facebook/bart-large")
Example #9
from collections import defaultdict

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import transformers
from sklearn import metrics
from torch.utils.data import (DataLoader, Dataset, RandomSampler,
                              SequentialSampler, random_split)
from transformers import BartConfig, BartModel, BartTokenizerFast

tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')

moral_foundations = [
    'AuthorityVice', 'AuthorityVirtue', 'FairnessVice', 'FairnessVirtue',
    'HarmVice', 'HarmVirtue', 'IngroupVice', 'IngroupVirtue', 'PurityVice',
    'PurityVirtue'
]


def get_target_moral_names(targets):
    r = []
    for idx, t in enumerate(targets):
        if t:
            r.append(moral_foundations[idx])
    return r
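For instance, a binary target vector maps back to foundation names by index:

print(get_target_moral_names([1, 0, 0, 1, 0, 0, 0, 0, 0, 0]))
# ['AuthorityVice', 'FairnessVirtue']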
Example #10
    def __init__(self,
                 lr=0.001,
                 discriminator=None,
                 bart_decoder=True,
                 freeze_encoder=True,
                 freeze_decoder=True,
                 contextual_injection=True,
                 n_contextual_linear=2,
                 moral_vec_size=10,
                 use_content_loss=False,
                 content_loss_type='cosine',
                 feed_moral_tokens_to='encoder',
                 use_moral_loss=False,
                 content_loss_weighting=1,
                 moral_loss_weighting=1):
        super().__init__()
        assert n_contextual_linear >= 1
        self.lr = lr
        self.contextual_injection = contextual_injection
        self.feed_moral_tokens_to = feed_moral_tokens_to
        self.use_moral_loss = use_moral_loss
        self.use_content_loss = use_content_loss
        self.content_loss_type = content_loss_type
        self.content_loss_weighting = content_loss_weighting
        self.moral_loss_weighting = moral_loss_weighting

        self.loss_history = []
        self.training_epoch_count = 10
        self.use_original_morals = False

        self.tokenizer = BartTokenizerFast.from_pretrained(
            'facebook/bart-large-cnn')
        self.bart_scorer = BartScorer()

        # Load the pretrained model; its encoder and decoder are reused below.
        print('Loading pretrained bart-large-cnn...')
        self.pretrained = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn').to(device)
        print('Pretrained bart-large-cnn loaded')

        self.encoder = self.pretrained.model.encoder
        self.embedding = self.pretrained.model.shared

        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False

        self.n_vocab = self.embedding.num_embeddings
        self.n_encoder_features = self.encoder.layernorm_embedding.normalized_shape[
            0]

        # Linear layers to combine encodings and moral features
        self.linears = nn.ModuleList([
            nn.Linear(self.n_encoder_features + moral_vec_size,
                      self.embedding.embedding_dim)
        ])
        for i in range(n_contextual_linear - 1):
            self.linears.append(
                nn.Linear(self.embedding.embedding_dim,
                          self.embedding.embedding_dim))

        # Decoder
        self.decoder = self.pretrained.model.decoder
        if freeze_decoder:
            for param in self.decoder.parameters():
                param.requires_grad = False

        self.lm_head = self.pretrained.lm_head

        # NOTE: despite the `discriminator=None` default, a discriminator is
        # required: its parameters are frozen here and `build_lookups()` is
        # called below.
        self.discriminator = discriminator
        for param in self.discriminator.parameters():
            param.requires_grad = False

        self.vocab_size = 50264
        self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
        self.onehot_embeddings.weight = nn.Parameter(
            self.discriminator.build_lookups())
        # `requires_grad` must be set on the weight tensor; setting it on the
        # module itself is a no-op.
        self.onehot_embeddings.weight.requires_grad = False
Example #11
'''
load data
'''
train_contexts, train_questions, train_answers = read_squad(
    'data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('data/dev-v2.0.json')
'''
generate answer end indices
'''
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
'''
tokenizers and models
'''
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-base')
'''
tokenize
'''
train_encodings = tokenizer(train_contexts,
                            train_questions,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(val_contexts,
                          val_questions,
                          truncation=True,
                          padding=True)
'''
last step preparing model inputs
'''
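A common way to prepare the final model inputs (a sketch, assuming `add_end_idx` stored an `answer_end` character index) is to map character-level answer spans to token positions via the fast tokenizer's `char_to_token`:

# Sketch of the last step: char-level answer spans -> token positions.
def add_token_positions(encodings, answers):
    start_positions, end_positions = [], []
    for i, answer in enumerate(answers):
        start = encodings.char_to_token(i, answer['answer_start'])
        end = encodings.char_to_token(i, answer['answer_end'] - 1)
        # Spans lost to truncation fall back to the maximum position.
        start_positions.append(start if start is not None else tokenizer.model_max_length)
        end_positions.append(end if end is not None else tokenizer.model_max_length)
    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)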
Example #12
    def __init__(self, hparams):
        super(AbstractiveSummarizer, self).__init__()

        self.hparams = hparams

        if len(self.hparams.dataset) <= 1:
            self.hparams.dataset = self.hparams.dataset[0]

        if "longformer-encdec" in self.hparams.model_name_or_path.lower():
            self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(
                self.hparams.model_name_or_path, gradient_checkpointing=True
            )

            self.tokenizer = BartTokenizerFast.from_pretrained(
                self.hparams.model_name_or_path, add_prefix_space=True
            )
        else:
            if self.hparams.decoder_model_name_or_path:
                self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                    self.hparams.model_name_or_path,
                    (
                        self.hparams.decoder_model_name_or_path
                        if self.hparams.decoder_model_name_or_path
                        else self.hparams.model_name_or_path
                    ),
                    gradient_checkpointing=self.hparams.gradient_checkpointing,
                    tie_encoder_decoder=self.hparams.tie_encoder_decoder,
                )
            else:
                self.model = AutoModelForSeq2SeqLM.from_pretrained(
                    self.hparams.model_name_or_path,
                    gradient_checkpointing=self.hparams.gradient_checkpointing,
                )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.hparams.model_name_or_path, use_fast=True
            )

        self.rouge_sentence_split_token = "<q>"
        self.tokenizer.add_tokens(self.rouge_sentence_split_token)
        self.rouge_sentence_split_token_id = self.tokenizer.convert_tokens_to_ids(
            self.rouge_sentence_split_token
        )

        # bo = beginning of
        # eo = ending of
        # seq = sequence (not using 's' because 's' stands for sentence in other places)
        # Use `bos_token` for boseq if `bos_token` is set, otherwise use "[unused0]"
        # Use `pad_token` for eoseq if `pad_token` is set, otherwise use "[unused1]"
        do_seq_special_add = False
        if self.tokenizer.bos_token:
            self.target_boseq_token = self.tokenizer.bos_token
        else:
            self.target_boseq_token = "[unused0]"
            do_seq_special_add = True

        if self.tokenizer.pad_token:
            self.target_eoseq_token = self.tokenizer.pad_token
        else:
            self.target_eoseq_token = "[unused1]"
            do_seq_special_add = True

        # Convert `target_boseq_token` and `target_eoseq_token` to IDs
        self.target_boseq_token_id = self.tokenizer.convert_tokens_to_ids(
            self.target_boseq_token
        )
        self.target_eoseq_token_id = self.tokenizer.convert_tokens_to_ids(
            self.target_eoseq_token
        )

        # If the `*oseq` tokens are not already "special" then add them as special
        # tokens so that they are ignored when decoding.
        if do_seq_special_add:
            special_tokens_dict = {
                "additional_special_tokens": [
                    self.target_boseq_token,
                    self.target_eoseq_token,
                ]
            }
            self.tokenizer.add_special_tokens(special_tokens_dict)

        if self.hparams.label_smoothing > 0:
            self.loss_func = LabelSmoothingLoss(
                self.hparams.label_smoothing,
                self.tokenizer.vocab_size,
                ignore_index=self.tokenizer.pad_token_id,
            )
        else:
            self.loss_func = nn.CrossEntropyLoss(
                ignore_index=self.tokenizer.pad_token_id
            )

        self.train_dataloader_object = None  # not created yet
        self.rouge_metrics = None
        self.rouge_scorer = None
        self.dataset = {}

        self.tokenized_data_file_paths = {}
        for split in ["train", "validation", "test"]:
            features_cache_file = os.path.join(
                self.hparams.cache_file_path, (split + "_tokenized")
            )
            self.tokenized_data_file_paths[split] = features_cache_file

        if "longformer" in self.hparams.model_name_or_path:
            longformer_modifier_ = partial(
                longformer_modifier,
                tokenizer=self.tokenizer,
                attention_window=self.model.config.attention_window,
            )
            self.collate_fn = partial(
                self.abs_collate_fn, modifier=longformer_modifier_
            )
        else:
            self.collate_fn = self.abs_collate_fn
Example #13
 def get_rust_tokenizer(self, **kwargs):
     kwargs.update(self.special_tokens_map)
     return BartTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
Example #14
import sys

import timm
import torch
from torchvision import transforms
from transformers import BartForConditionalGeneration as BCD, BartTokenizerFast as BTF

import dataset

batch_size = int(sys.argv[1])

vit = timm.create_model('vit_base_patch32_384', pretrained=True, num_classes=0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

bart = BCD.from_pretrained('facebook/bart-base')
tokenizer = BTF.from_pretrained('facebook/bart-base')

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Lambda(dataset.make_img_rgb),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = dataset.NarrativesDataset(root='./data/images/',
                                     file='./data/dataset.jsonl',
                                     transform=transform)

trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=batch_size)