Example #1
def bert_fn():
    # Read train data and model
    train_dataset = pd.read_pickle(DATA['unbalanced_train'])
    train_lb_encoder = preprocessing.LabelEncoder()
    train_y = train_lb_encoder.fit_transform(train_dataset.label.values)

    xtrain, xvalid, ytrain, yvalid = train_test_split(
        train_dataset.preprocessed_tweets.values,
        train_y,
        stratify=train_y,
        random_state=42,
        test_size=.1,
        shuffle=True)

    # import BERT-base pretrained model
    bert = AutoModel.from_pretrained('bert-base-uncased')

    # Load the BERT tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    max_seq_len = 20

    # tokenize and encode sequences in the training set
    tokens_train = tokenizer.batch_encode_plus(xtrain.tolist(),
                                               max_length=max_seq_len,
                                               pad_to_max_length=True,
                                               truncation=True,
                                               return_token_type_ids=False)

    # tokenize and encode sequences in the validation set
    tokens_val = tokenizer.batch_encode_plus(xvalid.tolist(),
                                             max_length=max_seq_len,
                                             pad_to_max_length=True,
                                             truncation=True,
                                             return_token_type_ids=False)

    # for train set
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(ytrain.tolist())

    # for validation set
    val_seq = torch.tensor(tokens_val['input_ids'])
    val_mask = torch.tensor(tokens_val['attention_mask'])
    val_y = torch.tensor(yvalid.tolist())

    #define a batch size
    batch_size = 32

    # wrap tensors
    train_data = TensorDataset(train_seq, train_mask, train_y)

    # sampler for sampling the data during training
    train_sampler = RandomSampler(train_data)

    # dataLoader for train set
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)

    # wrap tensors
    val_data = TensorDataset(val_seq, val_mask, val_y)

    # sampler for sampling the data during training
    val_sampler = SequentialSampler(val_data)

    # dataLoader for validation set
    val_dataloader = DataLoader(val_data,
                                sampler=val_sampler,
                                batch_size=batch_size)

    # freeze all the parameters
    for param in bert.parameters():
        param.requires_grad = False

    # pass the pre-trained BERT to our define architecture
    model = BERT_Arch(bert)

    # define the optimizer
    optimizer = AdamW(model.parameters(), lr=1e-3)

    #compute the class weights
    class_wts = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(ytrain),
                                     y=ytrain)

    # convert class weights to tensor
    weights = torch.tensor(class_wts, dtype=torch.float)

    # loss function
    cross_entropy = nn.NLLLoss(weight=weights)

    # number of training epochs
    epochs = 10

    # set initial loss to infinite
    best_valid_loss = float('inf')

    # empty lists to store training and validation loss of each epoch
    train_losses = []
    valid_losses = []

    #for each epoch
    for epoch in range(epochs):

        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        #train model
        train_loss, _ = train(model, train_dataloader, cross_entropy,
                              optimizer)

        #evaluate model
        valid_loss, _ = evaluate(model, val_dataloader, cross_entropy,
                                 optimizer)

        # append training and validation loss
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'\nTraining Loss: {train_loss:.3f}')
        print(f'Validation Loss: {valid_loss:.3f}')
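
The example above relies on a BERT_Arch module and on train/evaluate helpers that are not shown in the snippet. A minimal sketch of what they might look like, assuming a two-class problem and the nn.NLLLoss defined above; layer sizes, dropout, and the gradient-clipping value are illustrative rather than taken from the original code, and evaluate would mirror train without the backward pass:

import numpy as np
import torch
import torch.nn as nn

class BERT_Arch(nn.Module):
    """Hypothetical classification head on top of the frozen BERT encoder."""
    def __init__(self, bert, num_classes=2):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(bert.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, sent_id, mask):
        # pooled [CLS] representation from the (frozen) BERT encoder
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = torch.relu(self.fc1(self.dropout(cls_hs)))
        # log-probabilities, to pair with the nn.NLLLoss used in bert_fn
        return torch.log_softmax(self.fc2(x), dim=1)

def train(model, dataloader, loss_fn, optimizer):
    """One training epoch; returns (mean loss, stacked predictions)."""
    model.train()
    total_loss, preds = 0.0, []
    for seq, mask, labels in dataloader:
        optimizer.zero_grad()
        out = model(seq, mask)
        loss = loss_fn(out, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
        preds.append(out.detach().cpu().numpy())
    return total_loss / len(dataloader), np.concatenate(preds, axis=0)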
Example #2
def transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: Path = None,
    mean_pool: bool = False,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Transformers library against the SentEval benchmark."""

    from transformers import AutoModel, AutoTokenizer

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Re-tokenize the input text using the pre-trained tokenizer
        batch = [" ".join(tokens) for tokens in batch]
        # HACK (John): This will save us in the case of tokenizers with no default max_length
        # Why does this happen? Open an issue on Transformers.
        max_length = params.tokenizer.max_length if hasattr(
            params.tokenizer, "max_length") else 512
        inputs = params.tokenizer.batch_encode_plus(batch,
                                                    pad_to_max_length=True,
                                                    max_length=max_length,
                                                    return_tensors="pt")
        # Place all input tensors on same device as the model
        inputs = {
            name: tensor.to(params.device)
            for name, tensor in inputs.items()
        }

        sequence_output, pooled_output = model(**inputs)[:2]

        # If mean_pool, we take the average of the token-level embeddings, accounting for pads.
        # Otherwise, we take the pooled output for this specific model, which is typically the
        # embedding of a special tokens embedding, like [CLS] or <s>, which is prepended to the
        # input during tokenization.
        if mean_pool:
            embeddings = torch.sum(
                sequence_output * inputs["attention_mask"].unsqueeze(-1),
                dim=1) / torch.clamp(torch.sum(
                    inputs["attention_mask"], dim=1, keepdims=True),
                                     min=1e-9)
        else:
            embeddings = pooled_output
        embeddings = embeddings.cpu().numpy()

        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Transformers tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    typer.secho(
        (f"{SUCCESS} Tokenizer '{pretrained_model_name_or_path}' from Transformers loaded"
         " successfully."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Load the Transformers model
    model = AutoModel.from_pretrained(pretrained_model_name_or_path)
    model.to(device)
    model.eval()
    typer.secho(
        f'{SUCCESS} Model "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config,
                                      verbose)
    params_senteval["tokenizer"] = tokenizer
    params_senteval["model"] = model
    params_senteval["device"] = device
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare,
                  output_filepath)

    return
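
The mean_pool branch above computes masked mean pooling inline. Pulled out as a standalone helper under the same assumptions (attention_mask is a 0/1 tensor and sequence_output has shape (batch, seq_len, hidden)), the computation is:

import torch

def masked_mean_pool(sequence_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over non-padded positions only."""
    mask = attention_mask.unsqueeze(-1).type_as(sequence_output)  # (batch, seq_len, 1)
    summed = torch.sum(sequence_output * mask, dim=1)             # (batch, hidden)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)               # avoid division by zero
    return summed / counts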
Example #3
# print(sample_list[0])
# print()
# print(tokenizer.decode(batch_input_ids[0].tolist()))
# print(batch_input_ids.size())
# print(batch_attention_mask.size())
# print(batch_token_type_ids.size())
# print(len(tok2char_span_list))
# print(batch_shaking_tag.size())


# # Model

# In[21]:


encoder = AutoModel.from_pretrained(model_path)


# In[22]:


fake_input = torch.zeros([batch_size, max_seq_len, encoder.config.hidden_size]).to(device)
shaking_type = hyper_parameters["shaking_type"]
ent_extractor = TPLinkerNER(encoder, len(tags), fake_input, shaking_type, visual_field)
ent_extractor = ent_extractor.to(device)


# In[23]:


metrics = Metrics(handshaking_tagger)
Example #4
    def __init__(self,
                 config=None,
                 *inputs,
                 args,
                 vocab_sets,
                 tasks,
                 save_directory=None,
                 **kwargs):
        """
        Relevant inputs should be provided using kwargs. This method is defined this way to match parent's and siblings' method signatures.
        If `save_directory` is None, will initialize a new model and numericalizer, otherwise, will load them from `save_directory`
        Inputs:
            args
            vocab_sets
            save_directory: The directory where numericalizer can be loaded from.
        """
        super().__init__(PretrainedConfig())  # dummy PretrainedConfig
        self.args = args

        encoder_embeddings = args.pretrained_model
        config = AutoConfig.from_pretrained(encoder_embeddings,
                                            cache_dir=args.embeddings)
        args.dimension = config.hidden_size

        # tasks is not passed during initialization only in server mode
        # call this function after task is recognized
        if tasks:
            self.set_generation_output_options(tasks)

        self.src_lang, self.tgt_lang = adjust_language_code(
            config, args.pretrained_model, kwargs.get('src_lang', 'en'),
            kwargs.get('tgt_lang', 'en'))

        self.numericalizer = TransformerNumericalizer(
            encoder_embeddings,
            args,
            max_generative_vocab=args.max_generative_vocab,
            save_dir=save_directory,
            config=config,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
            vocab_sets=vocab_sets,
            tasks=tasks,
        )

        logger.info('Initializing encoder and decoder embeddings')

        if args.do_ned:
            if type(config) == BertConfig:
                if save_directory is not None:
                    self.encoder_embeddings = BertModelForNER(
                        config, args.num_db_types, args.db_unk_id)
                else:
                    self.encoder_embeddings = BertModelForNER(
                        config, args.num_db_types,
                        args.db_unk_id).from_pretrained(
                            encoder_embeddings,
                            num_db_types=args.num_db_types,
                            db_unk_id=args.db_unk_id,
                            cache_dir=args.embeddings)
            elif type(config) == XLMRobertaConfig:
                if save_directory is not None:
                    self.encoder_embeddings = XLMRobertaModelForNER(
                        config, args.num_db_types, args.db_unk_id)
                else:
                    self.encoder_embeddings = XLMRobertaModelForNER(
                        config, args.num_db_types,
                        args.db_unk_id).from_pretrained(
                            encoder_embeddings,
                            num_db_types=args.num_db_types,
                            db_unk_id=args.db_unk_id,
                            cache_dir=args.embeddings)
            else:
                raise ValueError(
                    'Model is not supported for using entity embeddings for NER'
                )
        else:
            if save_directory is not None:
                self.encoder_embeddings = AutoModel.from_config(config)
            else:
                self.encoder_embeddings = AutoModel.from_pretrained(
                    encoder_embeddings,
                    config=config,
                    cache_dir=args.embeddings)

        self.encoder_embeddings.resize_token_embeddings(
            self.numericalizer.num_tokens)

        logger.info(f'Vocabulary has {self.numericalizer.num_tokens} tokens')

        self.encoder = IdentityEncoder(self.numericalizer, args, config,
                                       self.encoder_embeddings)
        self.decoder = MQANDecoder(self.numericalizer, args)
Example #5
    def __build_model(self) -> None:
        """ Init transformer model + tokenizer + classification head."""

        if self.hparams.transformer_type == 'roberta-long':
            self.transformer = RobertaLongForMaskedLM.from_pretrained(
                self.hparams.encoder_model,
                output_hidden_states=True,
                gradient_checkpointing=True)

        elif self.hparams.transformer_type == 'longformer':
            self.transformer = AutoModel.from_pretrained(
                self.hparams.encoder_model,
                output_hidden_states=True,
                gradient_checkpointing=True,  #critical for training speed.
            )

        else:  #BERT
            self.transformer = AutoModel.from_pretrained(
                self.hparams.encoder_model,
                output_hidden_states=True,
            )

        logger.warning(f'model is {self.hparams.encoder_model}')

        if self.hparams.transformer_type == 'longformer':
            logger.warning('Turning ON gradient checkpointing...')

            self.transformer = AutoModel.from_pretrained(
                self.hparams.encoder_model,
                output_hidden_states=True,
                gradient_checkpointing=True,  #critical for training speed.
            )

        else:
            self.transformer = AutoModel.from_pretrained(
                self.hparams.encoder_model,
                output_hidden_states=True,
            )

        # set the number of features our encoder model will return...
        self.encoder_features = 768

        # Tokenizer
        if self.hparams.transformer_type == 'longformer' or self.hparams.transformer_type == 'roberta-long':
            self.tokenizer = Tokenizer(
                pretrained_model=self.hparams.encoder_model,
                max_tokens=self.hparams.max_tokens_longformer)
            self.tokenizer.max_len = 4096

        else:
            self.tokenizer = Tokenizer(
                pretrained_model=self.hparams.encoder_model, max_tokens=512)

        #others:
        #'emilyalsentzer/Bio_ClinicalBERT' 'simonlevine/biomed_roberta_base-4096-speedfix'

        # Classification head
        if self.hparams.single_label_encoding == 'default':
            self.classification_head = nn.Sequential(
                nn.Linear(self.encoder_features, self.encoder_features * 2),
                nn.Tanh(),
                nn.Linear(self.encoder_features * 2, self.encoder_features),
                nn.Tanh(),
                nn.Linear(self.encoder_features,
                          self.data.label_encoder.vocab_size),
            )

        elif self.hparams.single_label_encoding == 'graphical':
            logger.critical('Graphical embedding not yet implemented!')
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", type=str, 
            help="Transformers' model name or path")
    parser.add_argument("--pooler", type=str, 
            choices=['cls', 'cls_before_pooler', 'avg', 'avg_top2', 'avg_first_last'], 
            default='cls', 
            help="Which pooler to use")
    parser.add_argument("--mode", type=str, 
            choices=['dev', 'test', 'fasttest'],
            default='test', 
            help="What evaluation mode to use (dev: fast mode, dev results; test: full mode, test results); fasttest: fast mode, test results")
    parser.add_argument("--task_set", type=str, 
            choices=['sts', 'transfer', 'full', 'na'],
            default='sts',
            help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'")
    parser.add_argument("--tasks", type=str, nargs='+', 
            default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
                     'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC',
                     'SICKRelatedness', 'STSBenchmark'], 
            help="Tasks to evaluate on. If '--task_set' is specified, this will be overridden")
    
    args = parser.parse_args()
    
    # Load transformers' model checkpoint
    model = AutoModel.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Set up the tasks
    if args.task_set == 'sts':
        args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
    elif args.task_set == 'transfer':
        args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']
    elif args.task_set == 'full':
        args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
        args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']

    # Set params for SentEval
    if args.mode == 'dev' or args.mode == 'fasttest':
        # Fast mode
        params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
        params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
                                         'tenacity': 3, 'epoch_size': 2}
    elif args.mode == 'test':
        # Full mode
        params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
        params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                                         'tenacity': 5, 'epoch_size': 4}
    else:
        raise NotImplementedError

    # SentEval prepare and batcher
    def prepare(params, samples):
        return
    
    def batcher(params, batch, max_length=None):
        # Handle rare token encoding issues in the dataset
        if len(batch) >= 1 and len(batch[0]) >= 1 and isinstance(batch[0][0], bytes):
            batch = [[word.decode('utf-8') for word in s] for s in batch]

        sentences = [' '.join(s) for s in batch]

        # Tokenization
        if max_length is not None:
            batch = tokenizer.batch_encode_plus(
                sentences,
                return_tensors='pt',
                padding=True,
                max_length=max_length,
                truncation=True
            )
        else:
            batch = tokenizer.batch_encode_plus(
                sentences,
                return_tensors='pt',
                padding=True,
            )

        # Move to the correct device
        for k in batch:
            batch[k] = batch[k].to(device)
        
        # Get raw embeddings
        with torch.no_grad():
            outputs = model(**batch, output_hidden_states=True, return_dict=True)
            last_hidden = outputs.last_hidden_state
            pooler_output = outputs.pooler_output
            hidden_states = outputs.hidden_states

        # Apply different poolers
        if args.pooler == 'cls':
            # There is a linear+activation layer after CLS representation
            return pooler_output.cpu()
        elif args.pooler == 'cls_before_pooler':
            return last_hidden[:, 0].cpu()
        elif args.pooler == "avg":
            return ((last_hidden * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1)).cpu()
        elif args.pooler == "avg_first_last":
            first_hidden = hidden_states[0]
            last_hidden = hidden_states[-1]
            pooled_result = ((first_hidden + last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1)
            return pooled_result.cpu()
        elif args.pooler == "avg_top2":
            second_last_hidden = hidden_states[-2]
            last_hidden = hidden_states[-1]
            pooled_result = ((last_hidden + second_last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(-1).unsqueeze(-1)
            return pooled_result.cpu()
        else:
            raise NotImplementedError

    results = {}

    for task in args.tasks:
        se = senteval.engine.SE(params, batcher, prepare)
        result = se.eval(task)
        results[task] = result
    
    # Print evaluation results
    if args.mode == 'dev':
        print("------ %s ------" % (args.mode))

        task_names = []
        scores = []
        for task in ['STSBenchmark', 'SICKRelatedness']:
            task_names.append(task)
            if task in results:
                scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100))
            else:
                scores.append("0.00")
        print_table(task_names, scores)

        task_names = []
        scores = []
        for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
            task_names.append(task)
            if task in results:
                scores.append("%.2f" % (results[task]['devacc']))    
            else:
                scores.append("0.00")
        task_names.append("Avg.")
        scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
        print_table(task_names, scores)

    elif args.mode == 'test' or args.mode == 'fasttest':
        print("------ %s ------" % (args.mode))

        task_names = []
        scores = []
        for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']:
            task_names.append(task)
            if task in results:
                if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
                    scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100))
                else:
                    scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100))
            else:
                scores.append("0.00")
        task_names.append("Avg.")
        scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
        print_table(task_names, scores)

        task_names = []
        scores = []
        for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
            task_names.append(task)
            if task in results:
                scores.append("%.2f" % (results[task]['devacc']))    
            else:
                scores.append("0.00")
        task_names.append("Avg.")
        scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
        print_table(task_names, scores)
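
print_table is called above but not defined in the snippet. A minimal sketch, assuming the tabulate package is an acceptable stand-in for the original helper:

from tabulate import tabulate

def print_table(task_names, scores):
    # One header row of task names, one row of scores.
    print(tabulate([scores], headers=task_names))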
Example #7
fpath = '/home/dpappas/bioasq_2021/BioASQ-task9bPhaseB-testset{}'.format(b)
ofpath = '/home/dpappas/bioasq_2021/batch{}_system_1_factoid.json'.format(b)
# model_name      = "ktrapeznikov/biobert_v1.1_pubmed_squad_v2"
# my_model_path   = '/home/dpappas/bioasq_factoid/snipBefAfter1_ktrapeznikov__biobert_v1.1_pubmed_squad_v2_MLP_100_9.pth.tar'
# hidden          = 768
model_name = "ktrapeznikov/albert-xlarge-v2-squad-v2"
my_model_path = "/home/dpappas/bioasq_factoid/albert_ktrapeznikov__albert-xlarge-v2-squad-v2_MLP_100_42_5e-05.pth.tar"
hidden = 2048
d = json.load(open(fpath))

use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if (use_cuda) else torch.device("cpu")

bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
pprint(bert_tokenizer.special_tokens_map)
bert_model = AutoModel.from_pretrained(model_name).to(device)
bert_model.eval()

my_model = Ontop_Modeler(hidden, 100).to(device)
load_model_from_checkpoint(my_model_path)
gb = my_model.eval()


def fix_phrase(phr):
    if len(phr) == 0:
        return ''
    while not phr[0].isalnum():
        phr = phr[1:]
    while not phr[-1].isalnum():
        phr = phr[:-1]
    return phr
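
load_model_from_checkpoint and Ontop_Modeler come from the surrounding project and are not shown here. A hypothetical version of the loader, assuming the checkpoint is a dict that stores the head's weights under a 'model_state_dict' key (the key name is an assumption):

def load_model_from_checkpoint(resume_from):
    # Restore the fine-tuned head weights onto the already-constructed my_model.
    checkpoint = torch.load(resume_from, map_location=device)
    my_model.load_state_dict(checkpoint['model_state_dict'])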
Example #8
 def setUp(self) -> None:
     """Set up for the tests--load tokenizer."""
     self.test_tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
     self.model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
     self.model.resize_token_embeddings(len(self.test_tokenizer))
     self.out_dir = 'tests/models/test_output'
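
A hypothetical follow-up test for this setUp, checking that the embedding matrix was actually resized to the tokenizer's vocabulary (the method name and assertion are illustrative):

 def test_resized_embeddings(self) -> None:
     """Embedding rows should match the tokenizer vocabulary size after resizing."""
     embeddings = self.model.get_input_embeddings()
     self.assertEqual(embeddings.num_embeddings, len(self.test_tokenizer))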
Example #9
import os
import ray
import glob
import time
import torch
from transformers import AutoModel
import torchfly_dev

ray.init(memory=12 * 1024**3,
         object_store_memory=8 * 1024**3,
         redis_max_memory=8 * 1024**3)

model = AutoModel.from_pretrained("roberta-large")

device = torch.device("cuda")
model = model.cuda()

for i in range(100):

    start = time.time()

    obj = torchfly_dev.async_save(model.state_dict(), f"tmp.pth")

    time.sleep(4)
    end = time.time()

    print(f"Time takes: {end-start-4}s")

time.sleep(100)
Example #10
def get(
    model_name: str,
    make_copy: bool,
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    **kwargs,
) -> transformers.PreTrainedModel:
    """
    Returns a transformer model from the cache.

    # Parameters

    model_name : `str`
        The name of the transformer, for example `"bert-base-cased"`
    make_copy : `bool`
        If this is `True`, return a copy of the model instead of the cached model itself. If you want to modify the
        parameters of the model, set this to `True`. If you want only part of the model, set this to `False`, but
        make sure to `copy.deepcopy()` the bits you are keeping.
    override_weights_file : `str`, optional
        If set, this specifies a file from which to load alternate weights that override the
        weights from huggingface. The file is expected to contain a PyTorch `state_dict`, created
        with `torch.save()`.
    override_weights_strip_prefix : `str`, optional
        If set, strip the given prefix from the state dict when loading it.
    """
    global _model_cache
    spec = TransformerSpec(model_name, override_weights_file,
                           override_weights_strip_prefix)
    transformer = _model_cache.get(spec, None)
    if transformer is None:
        if override_weights_file is not None:
            from allennlp.common.file_utils import cached_path
            import torch

            override_weights_file = cached_path(override_weights_file)
            override_weights = torch.load(override_weights_file)
            if override_weights_strip_prefix is not None:

                def strip_prefix(s):
                    if s.startswith(override_weights_strip_prefix):
                        return s[len(override_weights_strip_prefix):]
                    else:
                        return s

                valid_keys = {
                    k
                    for k in override_weights.keys()
                    if k.startswith(override_weights_strip_prefix)
                }
                if len(valid_keys) > 0:
                    logger.info("Loading %d tensors from %s", len(valid_keys),
                                override_weights_file)
                else:
                    raise ValueError(
                        f"Specified prefix of '{override_weights_strip_prefix}' means no tensors "
                        f"will be loaded from {override_weights_file}.")
                override_weights = {
                    strip_prefix(k): override_weights[k]
                    for k in valid_keys
                }

            transformer = AutoModel.from_pretrained(
                model_name,
                state_dict=override_weights,
                **kwargs,
            )
        else:
            transformer = AutoModel.from_pretrained(
                model_name,
                **kwargs,
            )
        _model_cache[spec] = transformer
    if make_copy:
        import copy

        return copy.deepcopy(transformer)
    else:
        return transformer
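
A brief usage sketch for get(); the calls assume this function is importable (in AllenNLP it lives in allennlp.common.cached_transformers):

# Shared, cached encoder: fine for read-only use.
frozen_bert = get("bert-base-cased", make_copy=False)

# Independent copy whose parameters can be fine-tuned without touching the cache.
tuned_bert = get("bert-base-cased", make_copy=True)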
Example #11
    def __init__(self,
        doc_definition,
        sent_definition,
        pretrained,
        num_workers,
        num_epochs,
        dropout_sent = 0.0,
        dropout_doc = 0.0,
        use_sent_objective = True,
        concat_sent_scores = True,
        dataset_class = DatasetBertTC,
        scorer_class = ScorerXray,
        grad_max_norm = 1.0,
        loss_reduction = 'sum',
        batch_size = 5,
        lr = 1e-5,
        lr_ratio = 1.0,
        attention_query_dim = 100,
        max_length = 50,
        max_sent_count = 50,
        linebreak_bound = True,
        keep_ws = False,
        project_sent = False,
        project_size = 200,
        optimizer_params = None,
        dataloader_params = None,
        hyperparams = None,
        dataset_params = None,


        ):

        super(ModelBertTC, self).__init__( \
            hyperparams = hyperparams,
            dataset_params = dataset_params,
            dataloader_params = dataloader_params,
            optimizer_params = optimizer_params,
            num_workers = num_workers,
            num_epochs = num_epochs,
            dataset_class = dataset_class,
            scorer_class = scorer_class
            )

        self.pretrained = pretrained
        self.use_sent_objective = use_sent_objective
        self.concat_sent_scores = concat_sent_scores
        self.grad_max_norm = grad_max_norm
        self.loss_reduction = loss_reduction

        self.doc_definition = doc_definition
        self.sent_definition = sent_definition
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.lr = lr
        self.lr_ratio = lr_ratio
        self.max_length = max_length
        self.max_sent_count = max_sent_count

        self.linebreak_bound = linebreak_bound
        self.keep_ws = keep_ws

        self.project_sent = project_sent
        self.project_size = project_size

        if self.concat_sent_scores:
            assert self.use_sent_objective

        self.bert = AutoModel.from_pretrained(self.pretrained)

        hidden_size = self.bert.config.hidden_size

        self.sent_attention = nn.ModuleDict(OrderedDict())
        self.doc_output_layers = nn.ModuleDict(OrderedDict())
        self.sent_ffnn = nn.ModuleDict(OrderedDict())
        self.sent_classifiers = nn.ModuleDict(OrderedDict())

        for k, label_set in doc_definition.items():


            self.sent_classifiers[k] = SentClassifiers( \
                                        input_dim = hidden_size,
                                        num_tags = 2,
                                        loss_reduction = self.loss_reduction,
                                        dropout = dropout_sent,
                                        sent_definition = sent_definition[k],
                                        )

            if self.concat_sent_scores:
                n = len(sent_definition[k]) * 2
            else:
                n = 0

            if self.project_sent:
                self.sent_ffnn[k] = FeedForward( \
                        input_dim = hidden_size+n,
                        num_layers = 1,
                        hidden_dims = self.project_size,
                        activations = get_activation('tanh'),
                        dropout = 0)

                out_dim = self.project_size
            else:
                out_dim = hidden_size + n


            self.sent_attention[k] = Attention( \
                                    input_dim = out_dim,
                                    dropout = dropout_doc,
                                    use_ffnn = True,
                                    activation = 'tanh',
                                    query_dim = attention_query_dim)

            self.doc_output_layers[k] = nn.Linear(out_dim, len(label_set))

        self.get_summary()
Example #12
 def __init__(self, config, gpu_list, *args, **params):
     super(XLNetEncoder, self).__init__()
     self.bert = AutoModel.from_pretrained(config.get("model", "bert_path"))
     self.max_seq_len = config.getint("model", "max_seq_len")
Example #13
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertModel, AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
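
A short, hypothetical follow-up showing how the loaded SciBERT tokenizer and encoder might be used on a single sentence, assuming a transformers version whose model outputs expose last_hidden_state (the example sentence and the use of the [CLS] vector are illustrative):

inputs = tokenizer("Graphene exhibits remarkable electronic properties.",
                   return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
cls_embedding = outputs.last_hidden_state[:, 0]  # (1, hidden_size) sentence representation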
Example #14
 def __init__(self, pretrained_model_name_or_path):
     BertModule.__init__(self)
     lm_model = AutoModel.from_pretrained(pretrained_model_name_or_path)
     PreTrainedModel.__init__(self, config=lm_model.config)
     self.lm_model = lm_model
     self.type = type(lm_model)
Example #15
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        *model_args,
        **kwargs
    ) -> PreTrainedModel:
        r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.


        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
        To train the model, you need to first set it back in training mode with `model.train()`.

        Params:
            encoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
                information necessary to initiate the encoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            decoder_pretrained_model_name_or_path (:obj: `str`, `optional`, defaults to `None`):
                information necessary to initiate the decoder. Either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method

            kwargs: (`optional`) Remaining dictionary of keyword arguments.
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:

        Examples::

            >>> from transformers import EncoderDecoderModel
            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
        """

        kwargs_encoder = {
            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
        }

        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        # Load and initialize the encoder and decoder
        # The distinction between encoder and decoder at the model level is made
        # by the value of the flag `is_decoder` that we need to set correctly.
        encoder = kwargs_encoder.pop("model", None)
        if encoder is None:
            assert (
                encoder_pretrained_model_name_or_path is not None
            ), "If `model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has to be defined"
            from .modeling_auto import AutoModel

            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
        encoder.config.is_decoder = False

        decoder = kwargs_decoder.pop("model", None)
        if decoder is None:
            assert (
                decoder_pretrained_model_name_or_path is not None
            ), "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has to be defined"
            from .modeling_auto import AutoModelForCausalLM

            if "config" not in kwargs_decoder:
                from transformers import AutoConfig

                decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
                if decoder_config.is_decoder is False:
                    logger.info(
                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
                    )
                    decoder_config.is_decoder = True

                kwargs_decoder["config"] = decoder_config

            if kwargs_decoder["config"].is_decoder is False:
                logger.warning(
                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, make sure that the attribute `is_decoder` of `decoder_config` passed to `.from_encoder_decoder_pretrained(...)` is set to `True` or do not pass a `decoder_config` to `.from_encoder_decoder_pretrained(...)`"
                )

            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)

        return cls(encoder=encoder, decoder=decoder)
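
The prefix-stripping at the top of the method means keyword arguments can be routed to either side by name. A hypothetical call illustrating this behavior (the specific kwargs are only examples):

from transformers import EncoderDecoderModel

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",
    "bert-base-uncased",
    encoder_output_attentions=True,  # "encoder_" prefix stripped, forwarded to the encoder
    decoder_output_attentions=True,  # "decoder_" prefix stripped, forwarded to the decoder
)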
Example #16
    def __init__(self):
        super().__init__()

        # self.model = AutoModel.from_pretrained('allenai/longformer-base-4096')
        self.model = AutoModel.from_pretrained('roberta-base')
Example #17
def create_model(encoder_name="bert-base-uncased",
                 dec_vocabsize=None,
                 abs_dec_vocabsize=None,
                 dec_layers=6,
                 dec_dim=640,
                 dec_heads=8,
                 dropout=0.,
                 maxlen=20,
                 smoothing=0.,
                 numbeam=1,
                 tensor2tree=None,
                 abstensor2tree=None,
                 abs_id=-100,
                 entropycontrib=1.):
    if encoder_name != "bert-base-uncased":
        raise NotImplementedError(
            f"encoder '{encoder_name}' not supported yet.")
    pretrained = AutoModel.from_pretrained(encoder_name)
    encoder = pretrained

    class BertEncoderWrapper(torch.nn.Module):
        def __init__(self, model, dropout=0., **kw):
            super(BertEncoderWrapper, self).__init__(**kw)
            self.model = model
            self.proj = torch.nn.Linear(pretrained.config.hidden_size,
                                        dec_dim,
                                        bias=False)
            self.dropout = torch.nn.Dropout(dropout)

        def forward(self, input_ids, attention_mask=None):
            ret, _ = self.model(input_ids, attention_mask=attention_mask)
            if pretrained.config.hidden_size != dec_dim:
                ret = self.proj(ret)
            ret = self.dropout(ret)
            ret = (ret, None, None)
            return ret

    class DummyEncoder(torch.nn.Module):
        def __init__(self, dim, **kw):
            super(DummyEncoder, self).__init__(**kw)
            self.dim = dim

        def forward(self, input_ids, attention_mask=None):
            return torch.zeros(input_ids.size(0),
                               1,
                               self.dim,
                               device=input_ids.device)

    encoder = BertEncoderWrapper(encoder, dropout=dropout)

    decoder_config = BartConfig(
        d_model=dec_dim,
        pad_token_id=0,
        bos_token_id=1,
        vocab_size=abs_dec_vocabsize,
        decoder_attention_heads=dec_heads // 2,
        decoder_layers=dec_layers,
        dropout=dropout,
        attention_dropout=min(0.1, dropout / 2),
        decoder_ffn_dim=dec_dim * 4,
        encoder_attention_heads=dec_heads,
        encoder_layers=dec_layers,
        encoder_ffn_dim=dec_dim * 4,
    )
    adv_decoder_config = BartConfig(
        d_model=dec_dim,
        pad_token_id=0,
        bos_token_id=1,
        vocab_size=dec_vocabsize,
        decoder_attention_heads=dec_heads // 2,
        decoder_layers=dec_layers,
        dropout=dropout,
        attention_dropout=min(0.1, dropout / 2),
        decoder_ffn_dim=dec_dim * 4,
        encoder_attention_heads=dec_heads,
        encoder_layers=dec_layers,
        encoder_ffn_dim=dec_dim * 4,
    )

    decoder_lm_config = BartConfig(
        d_model=dec_dim,
        pad_token_id=0,
        bos_token_id=1,
        vocab_size=dec_vocabsize,
        decoder_attention_heads=dec_heads // 2,
        decoder_layers=dec_layers,
        dropout=dropout,
        attention_dropout=min(0.1, dropout / 2),
        decoder_ffn_dim=dec_dim * 4,
        encoder_attention_heads=dec_heads,
        encoder_layers=dec_layers,
        encoder_ffn_dim=dec_dim * 4,
    )

    model = BartGenerator(decoder_config)
    model.model.encoder = encoder

    advmodel = BartGenerator(adv_decoder_config)
    advmodel.model.encoder = encoder

    decoder_lm = BartGenerator(decoder_lm_config)
    decoder_lm.model.encoder = DummyEncoder(dec_dim)

    orderless = {"op:and", "SW:concat"}

    trainmodel = GeneratorTrain(model,
                                advmodel,
                                smoothing=smoothing,
                                tensor2tree=abstensor2tree,
                                orderless=orderless,
                                abs_id=abs_id,
                                entropycontrib=entropycontrib)
    advtrainmodel = AdversaryTrain(advmodel,
                                   decoder_lm,
                                   smoothing=smoothing,
                                   tensor2tree=tensor2tree,
                                   orderless=orderless)
    testmodel = BartGeneratorTest(model,
                                  maxlen=maxlen,
                                  numbeam=numbeam,
                                  tensor2tree=abstensor2tree,
                                  orderless=orderless)
    return trainmodel, advtrainmodel, testmodel
Example #18
            text2 = text[end:]
            word_id += 1


#     return wsd
    return pd.DataFrame([{'form': text, 'WSD': str(wsd)}])

if __name__ == "__main__":
    args = parser.parse_args()
    text = args.text
    multigpu = args.multigpu

    with open('Dict/processed_dictionary.json', 'rb') as f:
        urimal_dict = json.load(f)

    bert_model = AutoModel.from_pretrained("monologg/distilkobert")
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
    model = BiEncoderModel(bert_model)
    model.to('cuda')

    model_list = os.listdir(f"checkpoint/{args.model_date}")
    model_fname = 'saved_checkpoint_fin'

    #     model = torch.load(f"checkpoint/{args.model_date}/{model_fname}")
    model = torch.load(f"checkpoint/WSD_v2/{model_fname}", map_location='cuda')
    model.eval()

    batch_generator = BatchGenerator(tokenizer, 128)
    eval_df = text_process(text, urimal_dict)
    eval_ds = ContextDataset(eval_df)
    eval_dl = context_dataloader(eval_ds, batch_generator, 1)
Example #19
    def get_bert_embeddings(self, path, dynamic=True, static=False):
        """
        SciBERT embeddings computation using Transformers. It stores and transforms the texts into SciBERT embeddings. The embeddings are stored in CSV files.

        :param path: path to save the embeddings
        :param dynamic: boolean variable to compute the dynamic embeddings (by default: True).
        :param static: boolean variable to compute the static embeddings (by default: False).
        :returns: static embeddings if static=True
        """
        
        self.__data_preparation()
        
        data_stat=[]

        bert = AutoModel.from_pretrained(self.model).embeddings
        bert = bert.to(self.device)

        for idx_batch, sequence in enumerate(self.data_dataloader, 1):
            sequence=sequence.to(self.device)

            ids_tokens=np.where((self.indexed_tokens[idx_batch-1]!=102) &(self.indexed_tokens[idx_batch-1]!=103) &(self.indexed_tokens[idx_batch-1]!=101) )[0]
            tokens=np.array(self.tokenized_texts[idx_batch-1])[ids_tokens]
            index=[]
            index_num=[]
            for i in range(len(tokens)):
                if [idx for idx, x in enumerate(tokens[i]) if x=='#'] ==[]:
                    index.append(i)
                else:
                    index_num.append(i)

            bert_embeddings=bert(sequence)[0][:,ids_tokens].cpu().detach()

            embeddings=torch.tensor(np.zeros((bert_embeddings.shape[1]-len(index_num),bert_embeddings.shape[2])))
            count=0
            if index_num!=[]:
                for idx in range (len(ids_tokens)):
                     if np.where(index_num==np.array([idx]))[0].size!=0:
                         nums=bert_embeddings[0][idx]*bert_embeddings[0][idx-1]
                         embeddings[idx-count-1]=nums.cpu().detach()
                         count+=1
                     else:
                         embeddings[idx-count]=bert_embeddings[0][idx].cpu().detach()
            else:
                
                embeddings=bert_embeddings[0]
            
            if static:
                for emb in embeddings:
                    data_stat.append(emb)

            if dynamic:
                i=1
                data_csv=[]
                labelstf= []
                labelstf.append('Word')   
                for n in range (self.neurons):
                    labelstf.append('Neuron'+str(n+1))  
                for emb in embeddings:
                    data_csv.append(np.hstack((self.words[idx_batch-1][i-1], emb)))
                    i+=1
                with open(path+self.file_names+'.csv', 'w', newline='') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(labelstf)
                    writer.writerows(data_csv)

        if static:
            wordar=np.vstack(data_stat)
            del data_stat
            meanBERT=np.mean(wordar, axis=0)
            stdBERT=np.std(wordar, axis=0)
            kurtosisBERT=kurtosis(wordar, axis=0)
            skewnessBERT=skew(wordar, axis=0)
            minBERT=np.min(wordar, axis=0)
            maxBERT=np.max(wordar, axis=0)
            statisticalMeasures=np.hstack((meanBERT, stdBERT, kurtosisBERT, skewnessBERT,minBERT, maxBERT))
            
            del embeddings
            #del bert_embeddings
            del bert
            del self.data_dataloader
            del self.tokenized_texts
            del self.data

            return statisticalMeasures
        else:
            del embeddings
            #del bert_embeddings
            del bert
            del self.data_dataloader
            del self.tokenized_texts
            del self.data
            
            gc.collect()
Example #20
def main():
    parser = utils.ArgParser()
    parser.add_argument("dataset_name", type=str, help="dataset name")
    arguments.add_dataset_path_arg(parser)
    arguments.add_test_arg(parser)
    parser.add_argument("--metadata_name", type=str, default="text_data", help="change which metadata to load")
    parser.add_argument("--cuda", action="store_true", help="use cuda")
    parser.add_argument("--multi_gpu", action="store_true", help="use multiple gpus")
    parser.add_argument("--model_path", type=str, default=None,
                        help="Cache path for transformers package.")
    parser.add_argument("--model_name", type=str, default="bert-base-uncased", help="Which model to use.")
    parser.add_argument("--model_source", type=str, default="transformers", help="Where to get the models from.")
    parser.add_argument("--layers", type=str, default="-2,-1",
                        help="Read the features from these layers. Careful: Multiple layers must be specified like "
                             "this: --layers=-2,-1 because of argparse handling minus as new argument.")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size.")
    parser.add_argument("--workers", type=int, default=0, help="Dataloader workers.")
    parser.add_argument("--add_name", type=str, default="", help="Add additional identifier to output files.")
    parser.add_argument("-f", "--force", action="store_true", help="Overwrite embedding if exists.")
    parser.add_argument("--encoder_only", action="store_true",
                        help="Flag for hybrid models (BART: bilinear and unilinear) that return "
                             "both encoder and decoder output, if the decoder output should be discarded.")
    parser.add_argument("--set_tokenizer", type=str, default="",
                        help=f"Manually define the tokenizer instead of determining it from model name. "
                             f"Options: {nntrainer.data_text.TextPreprocessing.values()}")
    parser.add_argument("--add_special_tokens", action="store_true",
                        help=f"Set the tokenizer to add special tokens (like [CLS], [SEP] for BERT).")
    parser.add_argument("--token_stride", action="store_true",
                        help=f"If set, too long texts will be strided over instead of cut to max.")
    parser.add_argument("--token_stride_factor", type=int, default=2,
                        help=f"Default 2 means to stride half the window size. Set to 1 for non-overlapping windows.")
    parser.add_argument("--print_model", action="store_true", help=f"Print model and config")

    args = parser.parse_args()
    data_path = Path("data")
    dataset_path = data_path / args.dataset_name
    model_name = args.model_name
    token_stride = args.token_stride
    model_ident = f"{args.model_source}_{model_name.replace('/', '--')}_{args.layers}"
    full_ident = f"text_feat_{args.dataset_name}_meta_{args.metadata_name}_{model_ident}{args.add_name}"

    # setup paths
    text_features_path = dataset_path
    os.makedirs(text_features_path, exist_ok=True)
    lengths_file = text_features_path / f"{full_ident}_sentence_splits.json"
    data_file_only = f"{full_ident}.h5"
    data_file = text_features_path / data_file_only

    '''
    if data_file.exists() and lengths_file.exists() and not args.force:
        print(f"{data_file} already exists. nothing to do.")
        return
    '''
    
    # Load pretrained model
    print("*" * 20, f"Loading model {model_name} from {args.model_source}")
    if args.model_source == "transformers":
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=args.model_path)
        model: BertModel = AutoModel.from_pretrained(model_name, cache_dir=args.model_path)
        if args.print_model:
            print("*" * 40, "Model")
            print(f"{model}")
            print("*" * 40, "Config")
            print(model.config)
        # noinspection PyUnresolvedReferences
        max_text_len = model.config.max_position_embeddings
        model.eval()
    else:
        raise NotImplementedError(f"Model source unknown: {args.model_source}")
    if args.cuda:
        if args.multi_gpu:
            model = nn.DataParallel(model).cuda()
        else:
            model = model.cuda()
    print(f"Running model on device {next(model.parameters()).device}")
    print(f"Maximum input length {max_text_len}")

    # define preprocessor
    is_tp = False
    add_special_tokens = args.add_special_tokens
    if args.set_tokenizer != "":
        print(f"Set tokenizer via flag to {args.set_tokenizer}")
        preprocessor = get_text_preprocessor(args.set_tokenizer)
    elif model_name == "bert-base-uncased":
        # paper results
        preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.BERT_PAPER)
    elif model_name.startswith(TextModelConst.BERT) or model_name.startswith(TextModelConst.DISTILBERT):
        # new results bert-large-cased
        preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.BERT_NEW)
    elif model_name.startswith(TextModelConst.GPT2):
        # new results with gpt2
        preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.GPT2)
    else:
        print(f"WARNING: no text preprocessing defined for model {model_name}, using default preprocessing which "
              f"does not add any special tokens.")
        preprocessor = get_text_preprocessor(nntrainer.data_text.TextPreprocessing.SIMPLE)
    # else:
    #     raise NotImplementedError(f"No preprocessing defined for model {model_name}")

    # define feature layers to extract
    layer_list_int = [int(layer.strip()) for layer in args.layers.strip().split(",")]

    # load metadata
    meta_file = dataset_path / f"{args.metadata_name}.json"
    print(f"Loading meta file of {meta_file.stat().st_size // 1024 ** 2:.0f} MB")
    timer_start = timer()
    meta_dict = json.load(meta_file.open("rt", encoding="utf8"))
    print(f"Took {timer() - timer_start:.1f} seconds for {len(meta_dict)}.")
    text_dict: Dict[str, List[str]] = {}
    for key, meta in meta_dict.items():
        text_dict[key] = [item for key, item in meta.items()]
        #text_dict[key] = [seg["text"] for seg in meta["segments"]]

    # get max number of words length
    total_words = 0
    max_words = 0
    for key, val in tqdm(text_dict.items(), desc="Compute total_words and max_words"):
        num_words = sum(len(text.split(" ")) for text in val)
        total_words += num_words
        max_words = max(num_words, max_words)
    print(f"Total {total_words} average {total_words / len(meta_dict):.2f} max {max_words}")

    # create dataset and loader
    print("*" * 20, "Loading and testing dataset.")
    dataset = TextConverterDataset(tokenizer, text_dict, preprocessor, max_text_len=max_text_len,
                                   token_stride=token_stride,
                                   add_special_tokens=add_special_tokens)
    dataloader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers,
                                 collate_fn=dataset.collate_fn)

    # print first datapoint
    for key, value in dataset[0].items():
        print(f"{key}: {value}\n")

    if args.test:
        # print first datapoint
        for point in dataset:
            for key, value in dict(point).items():
                print(f"{key}: {value}\n")
        print("Test, stopping here.")
        return

    # loop videos and encode features
    print("*" * 20, "Running the encoding.")
    print(f"Encoding text with model: {model_name}, layers: {layer_list_int}, "
          f"batch size: {args.batch_size}, workers: {args.workers}")
    temp_file = text_features_path / f"TEMP_{utils.get_timestamp_for_filename()}_{data_file_only}"
    data_h5 = h5py.File(temp_file, "w")
    lengths = {}
    total_feat_dim = None
    printed_warning = False
    pbar = tqdm(desc="compute text features", total=maths.ceil(len(dataset) / args.batch_size))
    for i, batch in enumerate(dataloader):  # type: TextDataBatchPoint
        if args.cuda:
            batch.to_cuda(non_blocking=True)
        batch_size = len(batch.key)

        total_max_seq_len = batch.tokens.shape[1]
        if total_max_seq_len <= max_text_len:
            # everything is fine
            # compute model output and read hidden states
            model_outputs = model(input_ids=batch.tokens, attention_mask=batch.mask, output_hidden_states=True)
            hidden_states = model_outputs["hidden_states"]
            # pbar.write(f"tokens {batch.tokens.shape[1]}")
            # pbar.write(f"outputs {list(state.shape[1] for state in hidden_states)}")
            # concatenate the features from the requested layers of the hidden state (-1 is the output layer)
            features = []
            for layer_num in layer_list_int:
                layer_features = hidden_states[layer_num]
                features.append(layer_features.detach().cpu().numpy())
            # concatenate features of individual hidden layers
            features = np.concatenate(features, axis=-1)  # shape (batch_size, max_sent_len, num_layers * feat_dim)
            # pbar.write(f"features {features.shape}")
        else:
            # the batch's token sequence is longer than max_text_len: process it in
            # several overlapping windows whose size depends on the stride
            stride = max_text_len // args.token_stride_factor
            positions = list(range(0, total_max_seq_len - stride, stride))
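            # illustration of the window arithmetic (not part of the original code):
            # with window max_text_len = W and stride = W // token_stride_factor, windows start at
            # 0, stride, 2*stride, ... ; for every window after the first, the first `stride`
            # positions overlap the previous window and their features are dropped below, so with
            # token_stride_factor == 2 the kept slices tile the sequence exactly once.
            # e.g. W = 512, stride = 256, sequence length 1000 -> positions [0, 256, 512],
            # kept feature ranges [0:512), [512:768), [768:1000)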
            all_model_outputs = []
            pbar.write(f"Length {total_max_seq_len}! Split with window {max_text_len} stride {stride} "
                       f"into {len(positions)} batches at positions {positions} ")
            for pos in positions:
                end_pos = pos + max_text_len
                these_tokens = batch.tokens[:, pos:end_pos]
                these_masks = batch.mask[:, pos:end_pos]
                these_model_outputs = model(input_ids=these_tokens, attention_mask=these_masks,
                                            output_hidden_states=True)
                these_hidden_states = these_model_outputs["hidden_states"]
                # pbar.write(f"tokens {these_tokens.shape[1]}")
                # pbar.write(f"outputs {list(state.shape[1] for state in these_hidden_states)}")
                # concatenate the features from the requested layers of the hidden state (-1 is the output layer)
                features = []
                for layer_num in layer_list_int:
                    layer_features = these_hidden_states[layer_num]
                    if pos != 0:
                        layer_features = layer_features[:, stride:]
                    features.append(layer_features.detach().cpu().numpy())
                # concatenate features of individual hidden layers
                features = np.concatenate(features, axis=-1)  # shape (batch_size, max_sent_len, num_layers * feat_dim)
                # pbar.write(f"features {features.shape}")
                all_model_outputs.append(features)
            # concatenate outputs back together
            features = np.concatenate(all_model_outputs, axis=1)

        # compute total output size, need to know this for model architecture
        if total_feat_dim is None:
            total_feat_dim = features.shape[-1]

        # extract single datapoint information from the batch
        for batch_num in range(batch_size):
            key = batch.key[batch_num]
            length = batch.lengths[batch_num]

            # given length (number of tokens), cut off the padded tokens
            feature = features[batch_num, :length]

            # store sentence lengths so features can be mapped to sentences later
            sentence_lengths = batch.sentence_lengths[batch_num]

            if is_tp:
                sentence_lengths = [int(np.round(length / 4)) for length in sentence_lengths]

            # make sure the correspondence between paragraph features and sentence lengths still holds
            if feature.shape[0] != sum(sentence_lengths) and not printed_warning:
                pbar.write("*" * 40)
                pbar.write(f"WARNING: Feature sequence length {feature.shape[0]} does not equal the sum of the "
                           f"sentence lengths: {sum(sentence_lengths)}")
                pbar.write(f"{sentence_lengths}")
                pbar.write(f"It may be hard to recover the token-to-feature correspondence and the correct "
                           f"hierarchical sentence structure from these features.")
                printed_warning = True

            # write features
            data_h5[key] = feature
            lengths[key] = sentence_lengths
        pbar.update()
    pbar.close()
    data_h5.close()

    print(f"Wrote data to {temp_file}, moving to {data_file}")
    if data_file.is_file():
        os.remove(data_file)
        time.sleep(0.1)
    shutil.move(temp_file, data_file)

    # write lengths file
    json.dump(lengths, lengths_file.open("wt", encoding="utf8"))

    print(f"Wrote sentence splits to {lengths_file}")
    print(f"Total feature dim of {len(layer_list_int)} is {total_feat_dim}")

class BERTAttentionClasswiseWeighted(nn.Module):
    # LinearBlock and MultiHeadAttention used below are custom helper modules assumed to be
    # defined elsewhere in the original code
    def __init__(self,
                 freeze_bert_params=True,
                 dropout_prob=0.1,
                 num_heads=3,
                 base='bert-base-uncased'):
        print("BERTAttentionClasswiseWeighted Being Used!\n\n\n")

        super(BERTAttentionClasswiseWeighted, self).__init__()
        self.embeddings = AutoModel.from_pretrained(
            base)  #, output_hidden_states = True)

        if freeze_bert_params:
            for param in self.embeddings.parameters():
                param.requires_grad = False

        self.num_heads = num_heads

        self.dropout_common = nn.Dropout(dropout_prob)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.dropout3 = nn.Dropout(dropout_prob)
        self.dropout4 = nn.Dropout(dropout_prob)
        self.dropout5 = nn.Dropout(dropout_prob)
        self.dropout6 = nn.Dropout(dropout_prob)
        self.dropout7 = nn.Dropout(dropout_prob)

        self.fc1 = LinearBlock(768, 512)
        self.fc2 = LinearBlock(512, 512)

        self.fc_out1 = LinearBlock(512, 512)
        self.fc_out2 = LinearBlock(512, 512)
        self.fc_out3 = LinearBlock(512, 512)
        self.fc_out4 = LinearBlock(512, 512)
        self.fc_out5 = LinearBlock(512, 512)
        self.fc_out6 = LinearBlock(512, 512)
        self.fc_out7 = LinearBlock(512, 512)

        self.fc_out1_2 = LinearBlock(512, 256)
        self.fc_out2_2 = LinearBlock(512, 256)
        self.fc_out3_2 = LinearBlock(512, 256)
        self.fc_out4_2 = LinearBlock(512, 256)
        self.fc_out5_2 = LinearBlock(512, 256)
        self.fc_out6_2 = LinearBlock(512, 256)
        self.fc_out7_2 = LinearBlock(512, 256)

        self.attn1 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn2 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn3 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn4 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn5 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn6 = MultiHeadAttention(self.num_heads, self.num_heads * 256)
        self.attn7 = MultiHeadAttention(self.num_heads, self.num_heads * 256)

        # Penultimate layers
        # Variance prediction layers
        self.log_var1 = nn.Linear(self.num_heads * 256, 1)
        self.log_var2 = nn.Linear(self.num_heads * 256, 1)
        self.log_var3 = nn.Linear(self.num_heads * 256, 1)
        self.log_var4 = nn.Linear(self.num_heads * 256, 1)
        self.log_var5 = nn.Linear(self.num_heads * 256, 1)
        self.log_var6 = nn.Linear(self.num_heads * 256, 1)
        self.log_var7 = nn.Linear(self.num_heads * 256, 1)

        self.out1 = nn.Linear(self.num_heads * 256, 2)
        self.out2 = nn.Linear(self.num_heads * 256, 3)
        self.out3 = nn.Linear(self.num_heads * 256, 3)
        self.out4 = nn.Linear(self.num_heads * 256, 3)
        self.out5 = nn.Linear(self.num_heads * 256, 3)
        self.out6 = nn.Linear(self.num_heads * 256, 2)
        self.out7 = nn.Linear(self.num_heads * 256, 2)
예제 #22
0
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Import our models. The package will take care of downloading the models automatically
tokenizer = AutoTokenizer.from_pretrained("/data/project/learn_code/data/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("/data/project/learn_code/data/sup-simcse-bert-base-uncased")

# Tokenize input texts
texts = [
    "There's a kid on a skateboard.",
    "A kid is skateboarding.",
    "A kid is inside the house."
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

# SimCSE has two core ideas:
# 1. the loss function: pull similar samples closer together, push dissimilar samples apart
# 2. use dropout as data augmentation
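
# A minimal sketch of the unsupervised SimCSE objective summarized above (an illustration,
# not part of the original example): the same batch is encoded twice with dropout active,
# the two [CLS] embeddings of each sentence form a positive pair, and all other sentences
# in the batch act as negatives (an in-batch contrastive / InfoNCE loss).
import torch.nn.functional as F

def simcse_unsup_loss(model, tokenizer, sentences, temperature=0.05):
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    model.train()  # keep dropout active so the two forward passes differ slightly
    z1 = model(**batch, return_dict=True).last_hidden_state[:, 0]  # [CLS], first pass
    z2 = model(**batch, return_dict=True).last_hidden_state[:, 0]  # [CLS], second pass
    z1 = F.normalize(z1, dim=-1)
    z2 = F.normalize(z2, dim=-1)
    sim = z1 @ z2.t() / temperature      # (batch, batch) cosine similarities
    labels = torch.arange(sim.size(0))   # the positive pair sits on the diagonal
    return F.cross_entropy(sim, labels)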
예제 #23
0
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

race_sampled_train['label'] = race_sampled_train['label'].replace(label_dict)
race_sampled_val['label'] = race_sampled_val['label'].replace(label_dict)

torch.set_grad_enabled(True)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"

# Creating the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize Data
encoded_train = tokenizer.batch_encode_plus(race_sampled_train['name'].apply(get_name_pair).values.tolist(),
                                            return_attention_mask=True,
                                            padding=True,
                                            return_tensors='pt')

encoded_val = tokenizer.batch_encode_plus(race_sampled_val['name'].apply(get_name_pair).values.tolist(),
                                            return_attention_mask=True,
                                            padding=True,
                                            return_tensors='pt')

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']
예제 #24
0
 def _load_transformer(self):
     pretrained_transformer = self.hparams.pretrained_transformer
     if not pretrained_transformer:
         raise Exception("no transformer identifier specified")
     return AutoModel.from_pretrained(pretrained_transformer)
예제 #25
0
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torchvision import datasets
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

# we add 2 to the longest sentence length because it will be used to size inputs for
# the tokenizer, which additionally adds a start token and an end token

def longest_sentence_size(sentences):
    current_max = 0
    for sentence in sentences:
        if len(sentence) > current_max:
            current_max = len(sentence)
    return current_max + 2
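
# A quick check of the "+2" above (illustrative, not from the original example): BERT-style
# tokenizers wrap every input in a [CLS] ... [SEP] pair, so encoding adds two positions
# beyond the word pieces themselves.
ids = tokenizer("example sentence").input_ids
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens[0], tokens[-1])  # '[CLS]' '[SEP]' for bert-base-multilingual-cased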

# expects the sentences in the first column as lists, already split into words/punctuation
def text_tokenizing(text_dataframe):
    input_ids = []
    attention_masks = []
    targets = []
    for_preparing = tokenizer("")
    start_symbol = for_preparing.input_ids[0]
예제 #26
0
from transformers import AutoTokenizer, AutoModel

from roberta_ast_label_pretrain import FinetuningRoberta, RobertaPretrain
from roberta_mask_pretrain import RobertaMaskPretrain, FinetuningMaskRoberta
import pandas as pd
import xarray as xa
from scipy import stats

if __name__ == '__main__':
    test_data = pd.read_pickle("data_cache/java_train_0.pkl")
    # model_all = RobertaPretrain.load_from_checkpoint(
    #     "pretrained_module/roberta_ast_label_pretrain_on_java_all-type_label/model.ckpt")
    #
    tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1",
                                              resume_download=True)
    model = AutoModel.from_pretrained("huggingface/CodeBERTa-small-v1",
                                      resume_download=True)

    encoded = tokenizer.encode_plus(test_data.docstring.array[0],
                                    test_data.code.array[0],
                                    return_tensors="pt")
    # ask for a plain tuple so the three values can be unpacked
    # (transformers v4+ returns a ModelOutput object by default)
    hiddens, first_output, attentions = model(**encoded,
                                              output_attentions=True,
                                              return_dict=False)

    attn_array = xa.DataArray(
        data=[attn.detach().numpy() for attn in attentions],
        dims=["layer", "batch", "head", "query_pos", "key_pos"])
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    df = attn_array.to_dataframe("attn")
    filter = df["attn"] > (1 / attn_array.sizes["key_pos"])
    df = df[filter]
    df["relative_pos"] = df.index.get_level_values("query_pos")
예제 #27
0
def main():
    #print("MODEL NAME, BATCH SIZE, AVG LATENCY (ms), AVG MEM USAGE (MiB)")
    #parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str)
    parser.add_argument('--num_inference', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--gpu', action="store_true", default=False)
    args = parser.parse_args()
    model_name = args.model_name
    num_inference = args.num_inference
    batch_size = args.batch_size
    use_gpu = args.gpu and torch.cuda.is_available()
    # stores latency / memory usage values
    l_inference_latency = list()
    l_memory_capacity = list()
    # call corresponding DNN model...
    # TODO: ADD OTHER MODELS - RESNET50, ...
    # TODO: FIX NLP MODELS' SEQUENCE LENGTH
    if (model_name == "resnet18"):
        with torch.no_grad():
            model = models.resnet18(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "RESNET18",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "wide_resnet101_2"):
        with torch.no_grad():
            model = models.wide_resnet101_2(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "WIDE-RESNET101-2",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "mobilenet"):
        with torch.no_grad():
            model = models.mobilenet_v2(pretrained=True, progress=True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "MOBILENET_V2",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "bert"):
        with torch.no_grad():
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            model = AutoModel.from_pretrained("bert-base-uncased")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # BERT maximum sequence length 512
                sample_text = "BERT" * int(512 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "BERT-BASE-UNCASED",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "gpt2"):
        with torch.no_grad():
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            model = GPT2Model.from_pretrained("gpt2")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # GPT2 maximum sequence length 1024
                sample_text = "GPT2" * int(1024 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join(
                ["GPT2",
                 str(batch_size), str_avg_inf_time, str_avg_mem_usage]))

    elif (model_name == "dlrm"):
        print("Unimplemented model: DLRM")
        # TODO: MAKE IT WORK... PLEASE
        '''
        with torch.no_grad():
            model = DLRM_Net()
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = ????
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["GPT2", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
        '''
    else:
        print("Unidentified model name: {}".format(model_name))
        return
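
# The script above relies on sec_to_ms, bytes_to_mib and average_90_percent, which are not
# shown in this example. A minimal sketch consistent with how they are used (returning
# formatted strings for the CSV-style print); the exact originals are unknown and these are
# assumptions:
def average_90_percent(values):
    # average after discarding the largest 10% of measurements (assumed behaviour)
    values = sorted(values)
    kept = values[:max(1, int(len(values) * 0.9))]
    return sum(kept) / len(kept)

def sec_to_ms(seconds):
    return "{:.3f}".format(seconds * 1000.0)

def bytes_to_mib(num_bytes):
    return "{:.1f}".format(num_bytes / (1024 ** 2))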
예제 #28
0
df = pd.read_sql_query("select distinct(title), yes_no from giant_jobs", conn)
# won't be needing this anymore
conn.close()
#print(df.head) # -> everything looks good

# create the test, train, and validation sets
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['title'], df['yes_no'], test_size=0.3, stratify=df['yes_no'])

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, test_size=0.3, stratify=temp_labels)

# print("test_text", type(test_text)) # Series

# import the bert model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# lengths (in words) of the messages in the training set
set_len = [len(i.split()) for i in train_text]
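# one way set_len could inform the max_length used below (an assumption; the original
# simply hard-codes 25): look at a high percentile of the observed lengths
print(pd.Series(set_len).quantile(0.95))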

# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(train_text.tolist(),
                                           max_length=25,
                                           padding='max_length',
                                           truncation=True)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(val_text.tolist(),
                                         max_length=25,
                                         padding='max_length',
예제 #29
0
def load_embeddings(cachedir,
                    encoder_emb_names,
                    decoder_emb_names,
                    max_generative_vocab=50000,
                    logger=_logger):
    logger.info(f'Getting pretrained word vectors and pretrained models')

    encoder_emb_names = encoder_emb_names.split('+')
    decoder_emb_names = decoder_emb_names.split('+')

    all_vectors = {}
    encoder_vectors = []
    decoder_vectors = []

    numericalizer = None
    for emb_name in encoder_emb_names:
        if not emb_name:
            continue
        if _is_bert(emb_name) or _is_xlmr(emb_name):
            if numericalizer is not None:
                raise ValueError(
                    'Cannot specify multiple Transformer embeddings')

            config = AutoConfig.from_pretrained(emb_name, cache_dir=cachedir)
            config.output_hidden_states = True
            if _is_bert(emb_name):
                numericalizer = BertNumericalizer(
                    emb_name,
                    config=config,
                    max_generative_vocab=max_generative_vocab,
                    cache=cachedir)
            elif _is_xlmr(emb_name):
                numericalizer = XLMRobertaNumericalizer(
                    emb_name,
                    config=config,
                    max_generative_vocab=max_generative_vocab,
                    cache=cachedir)

            # load the tokenizer once to ensure all files are downloaded
            AutoTokenizer.from_pretrained(emb_name, cache_dir=cachedir)

            encoder_vectors.append(
                TransformerEmbedding(
                    AutoModel.from_pretrained(emb_name,
                                              config=config,
                                              cache_dir=cachedir)))

        else:
            if numericalizer is not None:
                logger.warning(
                    'Combining BERT embeddings with other pretrained embeddings is unlikely to work'
                )

            if emb_name in all_vectors:
                encoder_vectors.append(all_vectors[emb_name])
            else:
                vec = _name_to_vector(emb_name, cachedir)
                all_vectors[emb_name] = vec
                encoder_vectors.append(vec)

    for emb_name in decoder_emb_names:
        if not emb_name:
            continue
        if _is_bert(emb_name) or _is_xlmr(emb_name):
            raise ValueError(
                'Transformer embeddings cannot be specified in the decoder')

        if emb_name in all_vectors:
            decoder_vectors.append(all_vectors[emb_name])
        else:
            vec = _name_to_vector(emb_name, cachedir)
            all_vectors[emb_name] = vec
            decoder_vectors.append(vec)

    if numericalizer is None:
        numericalizer = SimpleNumericalizer(
            max_generative_vocab=max_generative_vocab, pad_first=False)

    return numericalizer, encoder_vectors, decoder_vectors
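
# Example of how load_embeddings might be called (names and values here are illustrative
# assumptions, not taken from the original project):
# numericalizer, encoder_vectors, decoder_vectors = load_embeddings(
#     cachedir='.embeddings',
#     encoder_emb_names='bert-base-uncased',
#     decoder_emb_names='glove+char',
#     max_generative_vocab=50000)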
예제 #30
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--num_image_embeds",
        default=1,
        type=int,
        help="Number of Image Embeddings from the Image Encoder")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--patience",
                        default=5,
                        type=int,
                        help="Patience for Early Stopping.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--num_workers",
                        type=int,
                        default=8,
                        help="number of worker threads for dataloading")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1

    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
    transformer_config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir,
    )
    transformer = AutoModel.from_pretrained(args.model_name_or_path,
                                            config=transformer_config,
                                            cache_dir=args.cache_dir)
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
    model = MMBTForClassification(config, transformer, img_encoder)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_examples(args, tokenizer, evaluate=False)
        label_frequencies = train_dataset.get_label_frequencies()
        label_frequencies = [label_frequencies[l] for l in labels]
        label_weights = (torch.tensor(
            label_frequencies, device=args.device, dtype=torch.float) /
                         len(train_dataset))**-1
        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     criterion)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, WEIGHTS_NAME))
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if not args.do_train:
            # criterion is only defined in the training branch above; fall back to an
            # unweighted loss so evaluation-only runs do not fail
            criterion = nn.BCEWithLogitsLoss()
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))

        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = MMBTForClassification(config, transformer, img_encoder)
            model.load_state_dict(torch.load(checkpoint))
            model.to(args.device)
            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
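
# Example invocation (script name and paths are placeholders, not from the original):
# python run_mmimdb.py \
#     --data_dir /path/to/mmimdb \
#     --model_name_or_path bert-base-uncased \
#     --output_dir ./mmbt_output \
#     --do_train --do_eval --do_lower_case --num_image_embeds 1 --patience 5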