Example #1
    def __init__(self, args, t5_type='t5-base'):
        """
        R1 = Raw 1
        
        Training:
        R1 + R2 + R3 -> M3
        """
        super().__init__()
        self.lr = getattr(args, "lr")
        self.epochs = getattr(args, "epochs")
        self.warmup_steps = getattr(args, "warmup_steps")
        self.gpu_id = getattr(args, "gpu_id")
        self.transformer = T5_Cond_Gen_Wrapper.from_pretrained(t5_type)
        self.tokenizer = T5TokenizerFast.from_pretrained(t5_type)
        self.EM_accuracy = CategoricalAccuracy()
        self.to('cpu' if self.gpu_id == -1 else f"cuda:{self.gpu_id}")

        self.decoder_tokenizer = T5TokenizerFast.from_pretrained(t5_type)
        self.decoder_tokenizer.padding_side = 'left'  # necessary since initial decoding sequences could have different lengths

        self.validation_scores = []

        self.encoder = self.transformer.encoder
        self.decoder = self.transformer.decoder
        self.lm_head = self.transformer.lm_head
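The decoder tokenizer above is switched to left padding because decoder prefixes of different lengths all need to end at the same position before generation continues. A minimal sketch (not from the original project) illustrating the difference:

from transformers import T5TokenizerFast

# Sketch: compare right padding (the default) with the left padding used above.
tok = T5TokenizerFast.from_pretrained("t5-base")
prefixes = ["short prefix", "a noticeably longer decoding prefix"]

right_padded = tok(prefixes, padding=True)["input_ids"]   # pad ids (0) at the end
tok.padding_side = "left"
left_padded = tok(prefixes, padding=True)["input_ids"]    # pad ids (0) at the start

print(right_padded[0])
print(left_padded[0])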
Example #2
def test_RumorPadaDataset():
    split = "train"
    data_processor = RumorPadaDataProcessor(["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
                                            "sydneysiege",  DATA_DIR, EXP_DIR)
    print()
    print(data_processor.data[split].keys())
    print(len(list(data_processor.data[split].values())[0]))
    for k, v in data_processor.data[split].items():
        if type(v) is not int:
            print(k, v[0])
        else:
            print(k, v[0], len(v[0]))
    dataset = RumorPadaDataset(split, data_processor, T5TokenizerFast.from_pretrained("t5-base"), 64, 0.5, 0.5)
    print(len(dataset))
    for i, example in enumerate(dataset):
        for k, v in example.items():
            print(k, v)
        if i == 8:
            break
    print()
    dataloader = DataLoader(dataset, 8)
    for batch in dataloader:
        for k, v in batch.items():
            print(k, v)
        break
Example #3
File: pada.py Project: eyalbd2/PADA
def test_AbsaSeq2SeqPadaDataProcessor():
    data_processor = AbsaSeq2SeqPadaDataProcessor(
        ["device", "laptops", "rest", "service"], "service", DATA_DIR, EXP_DIR)
    print()
    print(data_processor.data["dev"].keys())
    print(len(list(data_processor.data["dev"].values())[0]))
    for k, v in data_processor.data["dev"].items():
        if type(v) is not int:
            print(k, v[-80])
        else:
            print(k, v[-80], len(v[-80]))
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    percent = 99.8
    for split in AbsaSeq2SeqPadaDataProcessor.ALL_SPLITS:
        print(split, len(data_processor.data[split]["example_id"]))
        print(split, max(data_processor.data[split]["input_tokens_len"]))
        print()
        tokenizer_lens = tokenizer(data_processor.data[split]["input_tokens"],
                                   is_split_into_words=True,
                                   return_length=True)["length"]
        print(split, max(tokenizer_lens))
        print(split, percentile(tokenizer_lens, percent))
        print()
        tokenizer_lens = tokenizer(
            data_processor.data[split]["output_labels_tokens"],
            is_split_into_words=True,
            return_length=True)["length"]
        print(split, max(tokenizer_lens))
        print(split, percentile(tokenizer_lens, percent))
        break
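The percentile checks above use the tokenizer's return_length=True output to pick a sensible max sequence length. A small self-contained sketch of the same pattern on toy data (assuming numpy's percentile, as in the test):

from numpy import percentile
from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained("t5-base")

# Pre-split word lists, mirroring data_processor.data[split]["input_tokens"].
input_tokens = [["the", "screen", "is", "great"],
                ["battery", "life", "could", "be", "much", "better"]]
lengths = tokenizer(input_tokens, is_split_into_words=True, return_length=True)["length"]

print(max(lengths))                # longest example in subword tokens
print(percentile(lengths, 99.8))   # the percentile used above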
Example #4
File: pada.py Project: eyalbd2/PADA
def test_AbsaSeq2SeqPadaDataset():
    split = "train"
    data_processor = AbsaSeq2SeqPadaDataProcessor(
        ["device", "laptops", "rest"], "service", DATA_DIR, EXP_DIR)
    print()
    print(data_processor.data[split].keys())
    print(len(list(data_processor.data[split].values())[0]))
    for k, v in data_processor.data[split].items():
        if type(v) is not int:
            print(k, v[0])
        else:
            print(k, v[0], len(v[0]))
    dataset = AbsaSeq2SeqPadaDataset(
        split, data_processor, T5TokenizerFast.from_pretrained("t5-base"), 64,
        0.5, 0.5)
    print(len(dataset))
    for i, example in enumerate(dataset):
        for k, v in example.items():
            print(k, v)
        if i == 8:
            break
    print()
    dataloader = DataLoader(dataset, 8)
    for batch in dataloader:
        for k, v in batch.items():
            print(k, v)
        break
Example #5
 def _setup_model_and_tokenizer(self):
     self.model = T5ForConditionalGeneration.from_pretrained(
         self.config.model_name)
     if self.config.fast_tokenizer:
         self.tokenizer = T5TokenizerFast.from_pretrained(
             self.config.model_name)
     else:
         self.tokenizer = T5Tokenizer.from_pretrained(
             self.config.model_name)
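A hedged, standalone sketch of the fast/slow switch above; the Config dataclass here is a hypothetical stand-in for self.config, not part of the original project:

from dataclasses import dataclass
from transformers import T5Tokenizer, T5TokenizerFast

@dataclass
class Config:                      # hypothetical stand-in for self.config
    model_name: str = "t5-base"
    fast_tokenizer: bool = True

config = Config()
tokenizer_cls = T5TokenizerFast if config.fast_tokenizer else T5Tokenizer
tokenizer = tokenizer_cls.from_pretrained(config.model_name)
print(type(tokenizer).__name__)    # -> T5TokenizerFast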
Example #6
 def from_pretrained(self, model_name="t5-base"):
     """
     Download the model and tokenizer from the HF hub.
     :param model_name: name of the T5 checkpoint to download (default "t5-base")
     :return: None; sets self.model and self.tokenizer
     """
     self.tokenizer = T5Tokenizer.from_pretrained(f"{model_name}")
     self.model = T5ForConditionalGeneration.from_pretrained(
         f"{model_name}", return_dict=True)
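Once the model and tokenizer are downloaded, the usual next step is generation. A hedged sketch of the standard generate/decode pattern with the same objects (not part of the original wrapper class):

from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")
output_ids = model.generate(inputs["input_ids"], max_length=40)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))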
Example #7
    def __init__(self, model_name, ckpt_path,
                 num_generations=10):
        device, gpu_ids = util.get_available_devices(assert_cuda=True)
        logging.info("device: %s, gpu_ids: %s", device, gpu_ids)

        self.device = device
        self.num_generations = num_generations
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = T5TokenizerFast.from_pretrained(model_name)

        load_ckpt(ckpt_path, self.model, map_location=device)
        self.model.to(device)
Example #8
    def __init__(self,
                 # model_args
                 t5_model_name: str,
                 eval_metrics: List[str],

                 # model_generate_args
                 beam_size: int,
                 repetition_penalty: float,
                 length_penalty: float,
                 num_beam_groups: int,
                 diversity_penalty: float,
                 skip_special_tokens: bool,
                 clean_up_tokenization_spaces: bool,

                 # model_optimizer_args
                 weight_decay: float,
                 learning_rate: float,
                 adam_epsilon: float,

                 # trainer_args
                 train_batch_size: int,
                 eval_batch_size: int,
                 gradient_accumulation_steps: int,
                 n_gpu: int,
                 num_train_epochs: int,
                 warmup_steps: int,
                 output_dir: str,

                 # dataset_args
                 dataset_obj: Any,
                 data_procesor_obj: Any,
                 src_domains: List[str],
                 trg_domain: str,
                 data_dir: str,
                 experiment_dir: str,
                 max_seq_len: int,
                 dataset_specific_kwargs: Namespace = None,
                 num_labels: int = 2):
        super().__init__()
        self.save_hyperparameters()
        self.tokenizer = T5TokenizerFast.from_pretrained(self.hparams.t5_model_name)
        self.data_processor, self.datasets = self._init_datasets()
        self.hparams.num_labels = len(self.data_processor.labels_dict)
        self.loss_fn = CrossEntropyLoss(ignore_index=T5TextClassifier.LOSS_IGNORE_ID)
        self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.t5_model_name)
        self.classifier = CnnClassifier(num_labels=self.hparams.num_labels,
                                        hidden_size=self.model.config.hidden_size,
                                        max_seq_length=self.hparams.max_seq_len)
        self.eval_metric_scorer = T5TextClassifier._init_eval_metric_scorer(self.hparams.eval_metrics)
        self.eval_predictions = dict()
Example #9
 def __init__(self, args, t5_type='t5-base'):
     """
     R1 = Raw 1
     
     Training:
     R1 + R2 + R3 -> M3
     """
     super().__init__()
     self.lr = getattr(args, "lr")
     self.epochs = getattr(args, "epochs")
     self.warmup_steps = getattr(args, "warmup_steps")
     self.gpu_id = getattr(args, "gpu_id")
     self.transformer = T5ForConditionalGeneration.from_pretrained(t5_type)
     self.tokenizer = T5TokenizerFast.from_pretrained(t5_type)
     self.EM_accuracy = CategoricalAccuracy()
     self.to('cpu' if self.gpu_id == -1 else f"cuda:{self.gpu_id}")
Example #10
def test_RumorPadaDataProcessor():
    data_processor = RumorPadaDataProcessor(["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
                                            "sydneysiege", DATA_DIR, EXP_DIR)
    print()
    print(data_processor.data["dev"].keys())
    print(len(list(data_processor.data["dev"].values())[0]))
    for k, v in data_processor.data["dev"].items():
        if type(v) is not int:
            print(k, v[-80])
        else:
            print(k, v[-80], len(v[-80]))
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    percent = 99.8
    for split in RumorPadaDataProcessor.ALL_SPLITS:
        print(split, len(data_processor.data[split]["example_id"]))
        print()
        tokenizer_lens = tokenizer(data_processor.data[split]["input_str"], is_split_into_words=False,
                                   max_length=128, return_length=True)["length"]
        print(split, max(tokenizer_lens))
        print(split, percentile(tokenizer_lens, percent))
        break
Example #11
    def load_model(self, model_dir: str = "outputs", use_gpu: bool = False):
        """
        Loads a checkpoint for inference/prediction.
        Args:
            model_dir (str, optional): path to the model directory. Defaults to "outputs".
            use_gpu (bool, optional): if True, the model runs inference/prediction on the GPU. Defaults to False.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(f"{model_dir}")
        self.tokenizer = T5Tokenizer.from_pretrained(f"{model_dir}")

        if use_gpu:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            else:
                raise Exception(
                    "No GPU found. Set use_gpu=False to run on the CPU."
                )
        else:
            self.device = torch.device("cpu")

        self.model = self.model.to(self.device)
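A hedged usage sketch of the inference path that load_model sets up; the enclosing class is not shown in the example, so the equivalent standalone pattern is given here (the "outputs" directory is the default checkpoint path assumed above):

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_dir = "outputs"                                  # default used by load_model
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()

inputs = tokenizer("summarize: a long article body ...", return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_length=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))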
Example #12
class QuestionGenerator:
    """Class loads pipeline for generating questions from text"""
    model = T5ForConditionalGeneration.from_pretrained(
        "ThomasSimonini/t5-end2end-question-generation")
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    tokenizer.sep_token = '<sep>'
    tokenizer.add_tokens(['<sep>'])

    @staticmethod
    def generate(text: str):
        """
        Generates questions for the given text.

        :param text: sentence or paragraph for question generation
        :return: list of questions
        """
        try:
            if len(text) < 50:
                raise Exception("input too small")
            generator_args = {
                'temperature': 1.02,
                'num_beams': 1,
                'max_length': 70
            }
            text = "generate questions: " + text + " </s>"
            input_ids = QuestionGenerator.tokenizer.encode(text,
                                                           return_tensors="pt")
            res = QuestionGenerator.model.generate(input_ids, **generator_args)
            output = QuestionGenerator.tokenizer.batch_decode(
                res, skip_special_tokens=True)
            output = output[0].split("<sep>")
            if len(output[-1]) == 0 or output[-1][-1] != "?":
                output.pop()
            output = [" ".join(i.split()) for i in output]
            return list(set(output))
        except Exception as ex:
            raise ex
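A short usage sketch for the class above; the input must be at least 50 characters, per the guard in generate:

passage = (
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars "
    "in Paris, France. It was named after the engineer Gustave Eiffel."
)
for question in QuestionGenerator.generate(passage):
    print(question)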
Example #13
    def __init__(self,
                 data_dir: str,
                 batch_size=8,
                 pre_trained='',
                 with_answers=False):
        super().__init__()
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.with_answers = with_answers

        if pre_trained == 't5':
            self.tokenizer = T5TokenizerFast.from_pretrained(
                't5-base',
                extra_ids=0,
                additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
        elif pre_trained == 'bart':
            self.tokenizer = BartTokenizerFast.from_pretrained(
                'facebook/bart-base',
                extra_ids=0,
                additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
        else:
            raise Exception(
                f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')
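A minimal sketch (assuming the t5 branch above) showing that the extra markers are registered as special tokens and kept whole during tokenization:

from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained(
    't5-base',
    extra_ids=0,
    additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])

print(tokenizer.additional_special_tokens)   # ['<A>', '<H>', '<R>', '<T>']
tokens = tokenizer.tokenize("<H> Alan Bean <R> occupation <T> astronaut")
print(tokens)                                # markers stay single tokens, e.g. '<H>'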
Example #14
File: base.py Project: eyalbd2/PADA
def test_AbsaSeq2SeqDataset():
    data_processor = AbsaSeq2SeqDataProcessor(["device", "laptops", "rest"],
                                              "service", DATA_DIR)
    print()
    print(data_processor.data["dev"].keys())
    print(len(list(data_processor.data["dev"].values())[0]))
    for k, v in data_processor.data["dev"].items():
        if type(v) is not int:
            print(k, v[0])
        else:
            print(k, v[0], len(v[0]))
    dataset = AbsaSeq2SeqDataset("dev", data_processor,
                                 T5TokenizerFast.from_pretrained("t5-base"),
                                 64)
    print(len(dataset))
    for example in dataset:
        for k, v in example.items():
            print(k, v)
        break
    dataloader = DataLoader(dataset, 4)
    for batch in dataloader:
        for k, v in batch.items():
            print(k, v)
        break
Example #15
File: base.py Project: eyalbd2/PADA
def test_RumorDataset():
    data_processor = RumorDataProcessor(
        ["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
        "sydneysiege", DATA_DIR)
    print()
    print(data_processor.data["dev"].keys())
    print(len(list(data_processor.data["dev"].values())[0]))
    for k, v in data_processor.data["dev"].items():
        if type(v) is not int:
            print(k, v[0])
        else:
            print(k, v[0], len(v[0]))
    dataset = RumorDataset("dev", data_processor,
                           T5TokenizerFast.from_pretrained("t5-base"), 64)
    print(len(dataset))
    for example in dataset:
        for k, v in example.items():
            print(k, v)
        break
    dataloader = DataLoader(dataset, 4)
    for batch in dataloader:
        for k, v in batch.items():
            print(k, v)
        break
Example #16
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.split = split
        if self.verbose:
            print('Data source: ', self.split)

        data = self.raw_dataset.data

        if topk > 0:
            data = data[:topk]
            if self.verbose:
                print(f"Use only {topk} data")

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        self.data = data

        if self.verbose:
            # if 'sent' not in self.data_out:
            #     print("# all images:", len(self.data))
            # else:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        self.source_to_h5 = {
            'train': nlvr_feature_dir.joinpath(f'train_obj36.h5'),
            'valid': nlvr_feature_dir.joinpath(f'valid_obj36.h5'),
            'test': nlvr_feature_dir.joinpath(f'test_obj36.h5'),
        }
Example #17
 def t5_base_tokenizer_fast(self):
     return T5TokenizerFast.from_pretrained("t5-base")
Example #18
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

config = T5Config(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

model = T5ForConditionalGeneration(config=config)
model.num_parameters()

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/train_texts.txt",
    block_size=128,
)

test_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/valid_texts.txt",
    block_size=128,
)
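Because the config above hard-codes vocab_size=52_000 while the tokenizer is loaded from a custom directory, a common safeguard (a sketch, not part of the original script) is to keep the embedding matrix in sync with the tokenizer before training:

# Sketch: align the model's embedding size with the custom tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
print(model.config.vocab_size, len(tokenizer))   # should now agree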
Example #19
    def __init__(self,
                 split='train,valid',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        self.img_ids_to_source = {}
        data_info_dicts = []
        for source in self.sources:
            data_info_path = dataset_dir.joinpath(f'GQA/{source}.json')
            with open(data_info_path) as f:
                _data_info_dicts = json.load(f)
                # source_img_ids.append([d['img_id'] for d in _data_info_dicts])
                for _d in _data_info_dicts:
                    self.img_ids_to_source[_d['img_id']] = source
                    _d['source'] = source

                data_info_dicts.extend(_data_info_dicts)
            if self.verbose:
                print(f"Loaded {len(_data_info_dicts)} data from", source)

        data = data_info_dicts

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        self.source_to_featname = {
            'train': 'others',
            'valid': 'others',
            'submit': 'others',
            'testdev': 'testdev'
        }

        self.featname_to_h5 = {
            'others': vg_dir.joinpath('features/vg_gqa_obj36.h5'),
            'testdev': gqa_dir.joinpath('features/gqa_testdev_obj36.h5'),
        }
Example #20
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (Adafactor, T5ForConditionalGeneration,
                          T5TokenizerFast as T5Tokenizer)

MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


class TranslationDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 110,
        translation_max_token_len: int = 100,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.translation_max_token_len = translation_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        #Read line of DataFrame
Example #21
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension,
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    if model_args.tokenizer_name:
        tokenizer = T5TokenizerFast.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = T5TokenizerFast.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(model_args.config_name,
                                          cache_dir=model_args.cache_dir,
Example #22
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.source = split
        if self.verbose:
            print('Data source: ', self.source)

        if self.args.tokenizer is None:
            self.args.tokenizer = self.args.backbone

        if 't5' in self.args.tokenizer:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
        elif 'bart' in self.args.tokenizer:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            if args.use_vis_order_embedding:
                additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                        [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
                special_tokens_dict = {
                    'additional_special_tokens': additional_special_tokens
                }
                num_added_toks = self.tokenizer.add_special_tokens(
                    special_tokens_dict)

        if self.args.oscar_tags:
            # Load VG Classes
            vg_classes = []
            with open(vg_dir.joinpath('objects_vocab.txt')) as f:
                for obj in f.readlines():
                    vg_classes.append(obj.split(',')[0].lower().strip())
            self.vg_classes = vg_classes

        with open(wmt_data_dir.joinpath(f'raw/{self.source}.en')) as f:
            source_text_list = f.readlines()

        with open(wmt_data_dir.joinpath(f'raw/{self.source}.de')) as f:
            target_text_list = f.readlines()

        with open(
                wmt_data_dir.joinpath(f'image_splits/{self.source}.txt')) as f:
            image_ids = f.readlines()

        assert len(source_text_list) == len(target_text_list)
        assert len(source_text_list) == len(image_ids)

        data = []
        for source_text, target_text, image_id in zip(source_text_list,
                                                      target_text_list,
                                                      image_ids):
            datum = {
                'img_id': image_id.strip().split('.')[0],
                'source_text': source_text.strip(),
                'target_text': target_text.strip()
            }
            data.append(datum)

        if self.verbose:
            print(f"Loaded {len(data)} data from", split)

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank
        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.source_to_h5 = {
            'train':
            flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
            'val':
            flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
            'test_2016_flickr':
            flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
            'test_2017_flickr':
            flickr30k_feature_dir.joinpath('test2017_boxes36.h5'),
            'test_2018_flickr':
            flickr30k_feature_dir.joinpath('test2018_boxes36.h5'),
        }
Example #23
    def __init__(self,
                 split='karpathy_train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.source = split
        if self.verbose:
            print('Data source: ', self.source)

        if self.args.tokenizer is None:
            self.args.tokenizer = self.args.backbone

        if 't5' in self.args.tokenizer:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
        elif 'bart' in self.args.tokenizer:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        if self.args.oscar_tags:
            # Load VG Classes
            vg_classes = []
            with open(vg_dir.joinpath('objects_vocab.txt')) as f:
                for obj in f.readlines():
                    vg_classes.append(obj.split(',')[0].lower().strip())
            self.vg_classes = vg_classes

        data_info_path = dataset_dir.joinpath('COCO/dataset_coco.json')
        with open(data_info_path) as f:
            karpathy_data = json.load(f)

        split_rename = {
            'train': 'train',
            'restval': 'train',
            'val': 'val',
            'test': 'test'
        }

        n_images = 0

        data = []
        for datum in karpathy_data['images']:
            re_split = split_rename[datum['split']]
            if re_split != self.source.split('_')[-1]:
                continue

            if re_split == 'train':
                for d in datum['sentences']:
                    img_id = datum['filename'].split('.')[0]
                    new_datum = {
                        'img_id': img_id,
                        'sent': d['raw'].strip(),
                        'targets':
                        [d['raw'].strip() for d in datum['sentences']],
                        'is_train': True,
                    }
                    data.append(new_datum)
            else:
                img_id = datum['filename'].split('.')[0]
                new_datum = {
                    'img_id': img_id,
                    # 'sent': d['raw'],
                    'targets': [d['raw'].strip() for d in datum['sentences']],
                    'is_train': False,
                }
                data.append(new_datum)

            n_images += 1

        if self.verbose:
            print(f"{self.source} has {n_images} images")
            print(f"Loaded {len(data)} data from", split)

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank
        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.source_to_h5 = {}

        if self.args.max_n_boxes == 36:
            self.source_to_h5.update({
                'train2014':
                coco_dir.joinpath('features').joinpath('train2014_obj36.h5'),
                'val2014':
                coco_dir.joinpath('features').joinpath('val2014_obj36.h5'),
            })
Example #24
    def __init__(self, split='train', rank=-1, topk=-1, verbose=True, args=None, is_train=True):

        self.topk = topk
        self.verbose = verbose
        self.args = args

        # Loading datasets to data
        self.source = split
        if self.verbose:
            print('Data sources: ', self.source)


        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            if args.use_vis_order_embedding:
                additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                        [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
                special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
                num_added_toks = self.tokenizer.add_special_tokens(special_tokens_dict)

        self.losses = args.losses.split(',')

        data_info_path = dataset_dir.joinpath(f'VCR/{self.source}.jsonl')
        with open(data_info_path) as f:
            data_info_dicts = [json.loads(s) for s in f]
            if self.topk > 0:
                data_info_dicts = data_info_dicts[:self.topk]
            for datum in data_info_dicts:
                datum['backbone'] = self.args.backbone
                datum['losses'] = self.losses

        with Pool(8) as pool:
            if self.verbose:
                data = [datum for _data in tqdm(
                    pool.imap(get_datum, data_info_dicts), total=len(data_info_dicts), ncols=100) for datum in _data]
            else:
                data = [datum for _data in pool.imap(
                    get_datum, data_info_dicts) for datum in _data]

        if self.verbose:
            print(f"Loaded {len(data)} data from", self.source)


        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose and is_train and ('t5' in self.args.backbone or 'bart' in self.args.backbone):
            from collections import Counter
            task_counter = Counter()
            for datum in data:
                try:
                    task_counter.update([datum['task']])
                except KeyError:
                    print(datum)
                    exit()

            print(task_counter)
            for k, v in task_counter.items():
                print(k, f'{v/len(data)*100:.1f}%')

        if self.verbose:
            print("# examples:", len(data))

        self.source_to_h5 = {
            'train': vcr_feature_dir.joinpath(f'train_boxes36.h5'),
            'val': vcr_feature_dir.joinpath(f'val_boxes36.h5'),
            'test': vcr_feature_dir.joinpath(f'test_boxes36.h5'),

            'train_GT': vcr_feature_dir.joinpath(f'train_boxes_GT.h5'),
            'val_GT': vcr_feature_dir.joinpath(f'val_boxes_GT.h5'),
            'test_GT': vcr_feature_dir.joinpath(f'test_boxes_GT.h5'),
        }

        self.n_boxes = args.n_boxes
Example #25
from torch.utils.data import Dataset, DataLoader
from transformers import T5TokenizerFast
import torch
import yaml

with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

TOKENIZER = T5TokenizerFast.from_pretrained(config['model']['model_name'],
                                            do_lower_case=True)


class ShapingDataset(Dataset):
    def __init__(self, texts, summaries):
        super().__init__()
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = TOKENIZER
        self.summary_length = config['model']['summary_length']
        self.token_length = config['model']['token_length']

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        texts = str(self.texts[item])
        summaries = str(self.summaries[item])

        texts_encoding = self.tokenizer(
            texts,
            padding='max_length',
Example #26
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.split = split
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(args.backbone)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        # mattnet_maskrcnn_detections_path = refcoco_dir.joinpath(
        #     'detections/refcocog_umd/res101_coco_minus_refer_notime_dets.json')
        # with open(mattnet_maskrcnn_detections_path) as f:
        #     mattnet_maskrcnn_detections = json.load(f)

        data = []
        self.refer = REFER('refcocog',
                           'umd',
                           img_dir=coco_img_dir,
                           ref_dir=refcoco_dir,
                           verbose=verbose)
        ref_ids = self.refer.getRefIds(split=split)

        for ref_id in ref_ids:
            ref = self.refer.Refs[ref_id]
            image_id = ref["image_id"]
            ref_id = ref["ref_id"]
            refBox = self.refer.getRefBox(ref_id)
            for sent, sent_id in zip(ref["sentences"], ref["sent_ids"]):
                caption = sent["raw"]
                data.append({
                    "caption": caption,
                    "sent_id": sent_id,
                    "image_id": image_id,
                    "refBox": refBox,
                    "ref_id": ref_id,
                })

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        self.source_to_h5 = {
            'train': refcocog_feature_dir.joinpath(f'train_boxes_GT.h5')
        }

        if self.args.RefCOCO_GT:
            self.source_to_h5['val'] = refcocog_feature_dir.joinpath(
                f'val_boxes_GT.h5')
            self.source_to_h5['test'] = refcocog_feature_dir.joinpath(
                f'test_boxes_GT.h5')
        else:
            self.source_to_h5['val'] = refcocog_feature_dir.joinpath(
                f'val_boxes_mattnet.h5')
            self.source_to_h5['test'] = refcocog_feature_dir.joinpath(
                f'test_boxes_mattnet.h5')
Example #27
    def __init__(self, split='vg', rank=-1, topk=-1, verbose=True, args=None, is_train=True):

        self.topk = topk
        self.verbose = verbose
        self.args = args


        # Loading datasets to data
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        # Answer Table from LXMERT (Could be removed)
        self.answer_table = AnswerTable()
        if self.verbose:
            print("Load an answer table of size %d." % (len(self.answer_table.ans2id_map())))

        self.img_ids_to_source = {}

        losses = args.losses.split(',')

        data = []
        for img_source in self.sources:
            data_info_path = dataset_dir.joinpath(f'lxmert/{img_source}.json')
            with open(data_info_path) as f:
                _data = json.load(f)
                if self.verbose:
                    print(f"Loaded {len(_data)} data from", img_source)
                # source_img_ids.append([d['img_id'] for d in _data])
                for datum in _data:
                    self.img_ids_to_source[datum['img_id']] = img_source
                    # datum['img_source'] = img_source
                    datum['args'] = args
                    datum['is_train'] = is_train
                    datum['caption_only'] = args.caption_only

                    datum['lm'] = 'lm' in losses
                    datum['qa'] = 'qa' in losses
                    datum['ground_caption'] = 'ground_caption' in losses
                    datum['refer'] = 'refer' in losses
                    datum['itm'] = 'itm' in losses
                    datum['caption'] = 'caption' in losses

                    datum['backbone'] = self.args.backbone

                data.extend(_data)

        # Modify the answers
        if 'qa' in args.losses:
            for datum in data:
                labelf = datum['labelf']
                for _qa_source, labels in labelf.items():
                    for label in labels:
                        for ans in list(label.keys()):
                            new_ans = self.answer_table.convert_ans(ans)
                            if self.answer_table.used(new_ans):
                                if ans != new_ans:
                                    label[new_ans] = label.pop(ans)
                            else:
                                label.pop(ans)

        if self.verbose:
            print("# images:", len(data))

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        if 'qa' in args.losses:
            self.evaluator = QAEvaluator(data)

        with Pool(8) as pool:
            if self.verbose:
                data = [datum for _data in tqdm(
                    pool.imap(get_datum, data), total=len(data), ncols=100, desc="Creating pretraining data examples") for datum in _data]
            else:
                data = [datum for _data in pool.imap(
                    get_datum, data) for datum in _data]

        if self.args.itm_cocoonly:
            caption_sources = ['mscoco']
        else:
            caption_sources = ['mscoco', 'vg']
        self.data_captions = [datum for datum in data if datum['text_source'] in caption_sources]
        self.n_data_captions = len(self.data_captions)

        if self.verbose:
            print('# itm data:', self.n_data_captions)

        self.data = data
        self.n_data = len(self.data)

        if self.verbose and is_train:
            from collections import Counter
            task_counter = Counter()
            for datum in data:
                try:
                    task_counter.update([datum['task']])
                except KeyError:
                    print(datum)
                    exit()

            print(task_counter)
            for k, v in task_counter.items():
                print(k, f'{v/len(data)*100:.1f}%')

        if self.verbose:
            print("# examples:", len(data))

        self.source_to_h5 = {
            'mscoco_resplit_train_train2014': coco_dir.joinpath('features').joinpath('train2014_obj36.h5'),
            'mscoco_resplit_train_val2014': coco_dir.joinpath('features').joinpath('val2014_obj36.h5'),
            'mscoco_resplit_val': coco_dir.joinpath('features').joinpath('resplit_val_obj36.h5'),
            'vgnococo': vg_dir.joinpath('features').joinpath('vg_gqa_obj36.h5'),

        }

        self.n_boxes = args.n_boxes

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                # self.tokenizer = VLT5Tokenizer.from_pretrained(
                #     args.backbone, do_lower_case=args.do_lower_case)
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone, do_lower_case=args.do_lower_case)
            else:
                # self.tokenizer = T5Tokenizer.from_pretrained(
                #     args.backbone, do_lower_case=args.do_lower_case)
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone, do_lower_case=args.do_lower_case)
        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(args.backbone)
            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
            self.tokenizer.add_special_tokens(special_tokens_dict)
Example #28
    def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1, verbose=True, args=None, mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            if args.use_vis_order_embedding:
                additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                        [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
                special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
                num_added_toks = self.tokenizer.add_special_tokens(special_tokens_dict)

        self.answer_normalizer = VQAEvaluator()

        self.img_ids_to_source = {}
        data_info_dicts = []
        for source in self.sources:
            data_info_path = dataset_dir.joinpath(f'vqa/{source}.json')
            with open(data_info_path) as f:
                _data_info_dicts = json.load(f)
                for _d in _data_info_dicts:
                    if 'vg_qa_full' == source:
                        self.img_ids_to_source[_d['img_id']] = 'vg'
                    elif 'train2014' in _d['img_id']:
                        self.img_ids_to_source[_d['img_id']] = 'train2014'
                    elif 'val2014' in _d['img_id']:
                        self.img_ids_to_source[_d['img_id']] = 'val2014'
                    else:
                        self.img_ids_to_source[_d['img_id']] = source
                        _d['source'] = source

                data_info_dicts.extend(_data_info_dicts)
            if self.verbose:
                print(f"Loaded {len(_data_info_dicts)} data from", source)

        data = data_info_dicts

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes
        self.source_to_h5 = {
            'train': coco_feature_dir.joinpath(f'train2014_obj36.h5'),
            'minival': coco_feature_dir.joinpath(f'val2014_obj36.h5'),
            'nominival': coco_feature_dir.joinpath(f'val2014_obj36.h5'),
            'test': coco_feature_dir.joinpath(f'test2015_obj36.h5'),

            'vg': dataset_dir.joinpath('VG/features').joinpath('vg_gqa_obj36.h5'),

            'train2014': coco_feature_dir.joinpath(f'train2014_obj36.h5'),
            'val2014': coco_feature_dir.joinpath(f'val2014_obj36.h5'),
        }
Example #29
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Loading datasets to data
        self.split = split
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        self.img_ids_to_source = {}
        data_info_dicts = []
        for source in self.sources:
            data_info_path = dataset_dir.joinpath(f'VCR/{source}.jsonl')
            with open(data_info_path) as f:
                _data_info_dicts = [json.loads(s) for s in f]
                for _d in _data_info_dicts:
                    self.img_ids_to_source[_d['img_id']] = source
                    _d['source'] = source

                data_info_dicts.extend(_data_info_dicts)
            if self.verbose:
                print(f"Loaded {len(_data_info_dicts)} data from", source)

        data = data_info_dicts

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        self.source_to_h5 = {
            'train': vcr_feature_dir.joinpath(f'train_boxes36.h5'),
            'val': vcr_feature_dir.joinpath(f'val_boxes36.h5'),
            'test': vcr_feature_dir.joinpath(f'test_boxes36.h5'),
            'train_GT': vcr_feature_dir.joinpath(f'train_boxes_GT.h5'),
            'val_GT': vcr_feature_dir.joinpath(f'val_boxes_GT.h5'),
            'test_GT': vcr_feature_dir.joinpath(f'test_boxes_GT.h5'),
        }
Example #30
 def get_tokenizer(self, opt):
     return T5TokenizerFast.from_pretrained(opt['t5_model_arch'],
                                            truncation=True)
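A hedged usage sketch for the helper above, with a hypothetical opt dictionary (the real keys come from the surrounding project's configuration):

from transformers import T5TokenizerFast

opt = {'t5_model_arch': 't5-base'}   # hypothetical options dict
tokenizer = T5TokenizerFast.from_pretrained(opt['t5_model_arch'], truncation=True)
encoded = tokenizer("a deliberately long input sentence for the tokenizer",
                    truncation=True, max_length=8)
print(len(encoded["input_ids"]))     # at most 8 token ids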