Example #1
    def __init__(
            self,
            model_name_or_path: str = 'albert-large-uncased',  # './vocab.txt'
            datasets_loader: str = 'race',  # 'RACELocalLoader.py'
            task_name: str = 'all',
            max_seq_length: int = 512,
            train_batch_size: int = 32,
            eval_batch_size: int = 32,
            num_workers: int = 8,
            num_preprocess_processes: int = 8,
            use_sentence_selection: bool = True,
            best_k_sentences: int = 5,
            **kwargs):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.dataset_loader = datasets_loader
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.num_workers = num_workers
        self.num_preprocess_processes = num_preprocess_processes
        self.use_sentence_selection = use_sentence_selection
        self.best_k_sentences = best_k_sentences

        self.tokenizer = AlbertTokenizerFast.from_pretrained(
            self.model_name_or_path, use_fast=True, do_lower_case=True)
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'],
                                               use_stemmer=True)
        self.dataset = None
def init_albert():
    albert_max_len = 512
    albert_path = "albert_base_turkish_uncased/"
    albert_model_name = "loodos-albert-base-turkish-uncased_seqlen512_batch64_epochs10/"
    albert_tokenizer = AlbertTokenizerFast.from_pretrained(albert_path,
                                                           do_lower_case=False,
                                                           keep_accents=True)
    albert_model_class = Model(albert_max_len, albert_path, albert_model_name,
                               albert_tokenizer, "albert")
    print("3. ALBERT LOADED")
    return albert_model_class
Example #3
from collections import defaultdict
from functools import partial
from multiprocessing import cpu_count
import random

import nltk
from datasets import load_dataset
from transformers import AlbertTokenizerFast


def tokenize_function(tokenizer, examples):
    # `examples` is a batch from datasets.map(); each entry of the "text" column
    # is split into fixed-length pretraining instances by the project-specific
    # create_instances_from_document helper (not shown in this snippet).
    texts = examples["text"]
    new_examples = defaultdict(list)

    for text in texts:
        instances = create_instances_from_document(tokenizer,
                                                   text,
                                                   max_seq_length=512)
        for instance in instances:
            for key, value in instance.items():
                new_examples[key].append(value)

    return new_examples


if __name__ == '__main__':
    random.seed(0)
    nltk.download('punkt')
    tokenizer = AlbertTokenizerFast.from_pretrained('albert-large-v2')
    wikitext = load_dataset('wikitext',
                            'wikitext-103-v1',
                            cache_dir='./data/cache')

    tokenized_datasets = wikitext.map(
        partial(tokenize_function, tokenizer),
        batched=True,
        num_proc=cpu_count(),
        remove_columns=["text"],
    )

    tokenized_datasets.save_to_disk('./data/albert_tokenized_wikitext')
    tokenizer.save_pretrained('./data/tokenizer')
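The artifacts written above are meant to be reloaded later for pretraining; a minimal reload sketch (paths are the ones saved above, and the collator mirrors the one used in Examples #10 and #11 below):

from datasets import load_from_disk
from transformers import AlbertTokenizerFast, DataCollatorForLanguageModeling

# Reload the tokenized corpus and tokenizer saved by the script above.
tokenized_datasets = load_from_disk('./data/albert_tokenized_wikitext')
tokenizer = AlbertTokenizerFast.from_pretrained('./data/tokenizer')

# Randomly masks tokens on the fly, as in the collaborative training examples below.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)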
Example #4
def fillmask(tweet):

    tokenizer = AlbertTokenizerFast.from_pretrained(save_directory)
    model = AlbertModel.from_pretrained("albert-base-v2")

    # With Bert
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    #model = BertModel.from_pretrained("bert-base-uncased")

    # find the words the tweet has in common with the hateful-words list
    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string.split())

    # load the hateful-words list
    with open("./model/fill-masking/hate_words.txt", "r") as text_file:
        lines = [line.rstrip("\n") for line in text_file]

    # normalise the tweet: separate words and punctuation with spaces
    tweet = tweet + " !!!"
    tweet_split = re.findall(r"[\w']+|[.,!?;]", tweet)
    tweet = " ".join(tweet_split)
    words = []

    # reset result
    result = []

    # collect the hateful words present in the tweet
    for word in words_in_string(lines, tweet):
        words.append(word)

    if len(words) > 0:
        if len(words) == 1:
            tweet = tweet.replace(words[0], "[MASK]")

            # encode the masked tweet with ALBERT (the output is not used below)
            encoded_input = tokenizer(tweet, return_tensors='pt')
            output = model(**encoded_input)

            # predict replacement tokens for the masked position
            unmasker = pipeline('fill-mask', model='bert-base-uncased')

            res = []
            for prediction in unmasker(tweet):
                res.append(prediction["token_str"])

            # keep the three candidates with the highest AFINN sentiment score
            score = [af.score(candidate) for candidate in res]
            top_words = sorted(range(len(score)),
                               key=lambda i: score[i],
                               reverse=True)[:3]
            top_3 = [res[i] for i in top_words]

            result = [words[0], top_3]

        else:
            # mask the word with the most negative AFINN score
            sentiment_scores = [af.score(word) for word in words]
            worst = words[sentiment_scores.index(min(sentiment_scores))]
            tweet = tweet.replace(worst, "[MASK]")

            # encode the masked tweet with ALBERT (the output is not used below)
            encoded_input = tokenizer(tweet, return_tensors='pt')
            output = model(**encoded_input)

            # predict replacement tokens for the masked position
            unmasker = pipeline('fill-mask', model='bert-base-uncased')

            res = []
            for prediction in unmasker(tweet):
                res.append(prediction["token_str"])

            # keep the three candidates with the highest AFINN sentiment score
            score = [af.score(candidate) for candidate in res]
            top_words = sorted(range(len(score)),
                               key=lambda i: score[i],
                               reverse=True)[:3]
            top_3 = [res[i] for i in top_words]

            result = [worst, top_3]

    else:
        return ""

    return result
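A brief usage sketch of `fillmask` (the input string is hypothetical): it returns an empty string when no word from `hate_words.txt` occurs in the tweet, otherwise a pair `[masked_word, top_3_replacements]`.

# Hypothetical call; the flagged word must appear in ./model/fill-masking/hate_words.txt
result = fillmask("this is an example tweet")
if result:
    masked_word, suggestions = result
    print(f"'{masked_word}' could be replaced by one of {suggestions}")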
Example #5
#from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, BertTokenizer, BertModel
from transformers import AlbertTokenizerFast, AlbertForSequenceClassification, AlbertModel
from transformers import pipeline
from scipy.special import softmax
from afinn import Afinn
af = Afinn()
import pandas as pd
import numpy as np
import torch
import re
import sentencepiece

save_directory = './model/classification/AlBERT'
tokenizer = AlbertTokenizerFast.from_pretrained(save_directory)
model = AlbertForSequenceClassification.from_pretrained(save_directory)


##################
# CLASSIFICATION #
##################
def predict(tweet):

    # Tokenize Tweet
    encoded_input = tokenizer.encode(tweet,
                                     truncation=True,
                                     padding=True,
                                     return_tensors="pt")

    # Predict Tweet Classes
    output = model(encoded_input)
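    # The fragment stops at the forward pass. Assumed continuation (not from the
    # original project): convert the logits to class probabilities using the
    # scipy.special softmax imported above.
    logits = output.logits.detach().numpy()
    probabilities = softmax(logits, axis=1)
    predicted_class = int(np.argmax(probabilities, axis=1)[0])
    return predicted_class, probabilities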
Example #6
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate whether FARM should load the fast (Rust-based) version
            of the tokenizer (True) or the pure-Python one (False).
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception(f"Unable to load tokenizer of type '{tokenizer_class}' "
                            f"for '{pretrained_model_name_or_path}'")
        return ret
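A short usage sketch of the interface described in the docstring, assuming the enclosing class is named `Tokenizer` as in FARM:

# Infer the tokenizer class from the model config (fast tokenizer by default).
tokenizer = Tokenizer.load("albert-base-v2")

# Or pin the class explicitly and fall back to the pure-Python implementation.
tokenizer = Tokenizer.load("bert-base-uncased",
                           tokenizer_class="BertTokenizer",
                           use_fast=False)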
Example #7
    def preprocess(tokenizer: AlbertTokenizerFast, scorer: rouge_scorer.RougeScorer,
                   max_seq_length: int, use_sentence_selection: bool,
                   best_k_sentences: int, x: Dict) -> Dict:
        choices_features = []
        label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
        question = x["question"]
        article = x['article']
        if use_sentence_selection:
            qa = [question + option for option in x["options"]]

            # question_tokens = np.array(tokenizer(qa, add_special_tokens=False, truncation=True, max_length=25,
            #                                      padding='max_length')['input_ids'])
            # question_tokens = np.array(tokenizer(qa, add_special_tokens=False)['input_ids'])
            sentences = article.split('.')
            sentences = [s for s in sentences if s != '']
            # sentences_tokens = np.array(tokenizer(sentences, add_special_tokens=False, truncation=True, max_length=25,
            #                                       padding='max_length')['input_ids'])
            # sentences_tokens = np.array(tokenizer(sentences, add_special_tokens=False)['input_ids'])
            question_len = len(qa)
            sentences_len = len(sentences)
            sentence_scores = np.empty((sentences_len, question_len))
            for (i, j) in product(range(sentences_len), range(question_len)):
                scores = scorer.score(sentences[i], qa[j])
                sentence_scores[i, j] = scores['rouge1'].precision + scores[
                    'rouge2'].precision
            # sentence_scores = np.dot(sentences_tokens, question_tokens.T) / (np.linalg.norm(
            #     sentences_tokens, axis=1).reshape(-1, 1) @ np.linalg.norm(
            #     question_tokens, axis=1).reshape(1, -1))
            max_sentence_score = np.max(sentence_scores, axis=1)
            best_sentence_indices = max_sentence_score.argsort()[-best_k_sentences:][::-1]
            final_indices = set()
            for index in best_sentence_indices:
                final_indices.add(index - 1)
                final_indices.add(index)
                final_indices.add(index + 1)
            final_indices.discard(-1)
            final_indices.discard(sentences_len)

            article = '.'.join([sentences[i] for i in sorted(final_indices)])

        question_len = len(tokenizer.tokenize(question))

        option: str
        for option in x["options"]:
            question_option = question + option
            # if question.find("_") != -1:
            #     # fill in the banks questions
            #     question_option = question.replace("_", option)
            # else:
            #     question_option = question + " [SEP] " + option
            option_len = len(tokenizer.tokenize(option))

            inputs = tokenizer(article,
                               question_option,
                               add_special_tokens=True,
                               max_length=max_seq_length,
                               truncation=True,
                               padding='max_length',
                               return_tensors='pt')
            token_type_ids = np.array(inputs['token_type_ids'])
            inputs['article_len'] = int(
                np.where(token_type_ids == 1)[1][0]) - 2
            # inputs['question_len'] = question_len
            inputs['option_len'] = option_len

            choices_features.append(inputs)

        labels = label_map.get(x["answer"], -1)
        label = torch.tensor(labels).long()

        return {
            "label": label,
            "input_ids": torch.cat(
                [cf["input_ids"] for cf in choices_features]).reshape(-1),
            "attention_mask": torch.cat(
                [cf["attention_mask"] for cf in choices_features]).reshape(-1),
            "token_type_ids": torch.cat(
                [cf["token_type_ids"] for cf in choices_features]).reshape(-1),
            "article_len": torch.tensor(
                [cf["article_len"] for cf in choices_features]).long(),
            "question_len": torch.tensor([question_len] * 4).long(),
            # "question_len": torch.Tensor([cf["question_len"] for cf in choices_features]),
            "option_len": torch.tensor(
                [cf["option_len"] for cf in choices_features]).long(),
        }
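Every tensor returned above is flattened over the four answer options; a sketch of how a batch from this preprocessing could be reshaped for a multiple-choice forward pass (`AlbertForMultipleChoice` and the helper below are assumptions, not part of the original project):

from transformers import AlbertForMultipleChoice

model = AlbertForMultipleChoice.from_pretrained('albert-large-v2')

def forward_batch(batch, max_seq_length=512):
    # Undo the reshape(-1) above: (batch, 4 * seq_len) -> (batch, 4, seq_len).
    input_ids = batch['input_ids'].view(-1, 4, max_seq_length)
    attention_mask = batch['attention_mask'].view(-1, 4, max_seq_length)
    token_type_ids = batch['token_type_ids'].view(-1, 4, max_seq_length)
    return model(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids,
                 labels=batch['label'])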
Example #8
    def prepare_data(self):
        # download and cache the dataset and tokenizer before training starts
        datasets.load_dataset(self.dataset_loader, self.task_name)
        AlbertTokenizerFast.from_pretrained(self.model_name_or_path,
                                            use_fast=True)
Example #9
with open('data/bypublisher/articles.txt') as f:
    articles = f.readlines()
articles = [x.strip().lower() for x in articles]

with open('data/bypublisher/labels.txt') as f:
    labels = f.readlines()
labels = [int(x.strip()) for x in labels]

train_texts, other_texts, train_labels, other_labels = train_test_split(
    articles, labels, test_size=.2)

val_texts = other_texts[:len(other_texts) // 2]
test_texts = other_texts[len(other_texts) // 2:]

val_labels = other_labels[:len(other_labels) // 2]
test_labels = other_labels[len(other_labels) // 2:]
tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v1")

train_encodings = tokenizer(train_texts, truncation=True, padding='max_length')
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length')
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length')


class HyperpartisanshipDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
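A brief usage sketch (not in the original fragment) wrapping the encodings above into dataset objects for training and evaluation:

train_dataset = HyperpartisanshipDataset(train_encodings, train_labels)
val_dataset = HyperpartisanshipDataset(val_encodings, val_labels)
test_dataset = HyperpartisanshipDataset(test_encodings, test_labels)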
Example #10
def main():
    parser = HfArgumentParser(
        (AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    logger.info(
        f"Found {len(collaboration_args.initial_peers)} initial peers: {collaboration_args.initial_peers}"
    )
    if len(collaboration_args.initial_peers) == 0:
        raise ValueError(
            "Please specify at least one network endpoint in initial peers.")

    collaboration_args_dict = asdict(collaboration_args)
    setup_logging(training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path,
                                          cache_dir=dataset_args.cache_dir)
    tokenizer = AlbertTokenizerFast.from_pretrained(
        dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)
    model = get_model(training_args, config, tokenizer)
    model.to(training_args.device)

    tokenized_datasets = load_from_disk(Path(dataset_args.dataset_path))
    # This data collator will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    opt, scheduler = get_optimizer_and_scheduler(training_args, model)

    validators, local_public_key = metrics_utils.make_validators(
        collaboration_args_dict['experiment_prefix'])
    dht = hivemind.DHT(
        start=True,
        initial_peers=collaboration_args_dict.pop('initial_peers'),
        listen=not collaboration_args_dict['client_mode'],
        listen_on=collaboration_args_dict.pop('dht_listen_on'),
        endpoint=collaboration_args_dict.pop('endpoint'),
        record_validators=validators)

    total_batch_size_per_step = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
    statistics_expiration = collaboration_args_dict.pop(
        'statistics_expiration')
    adjusted_target_batch_size = collaboration_args_dict.pop('target_batch_size') \
                                 - collaboration_args_dict.pop('batch_size_lead')

    collaborative_optimizer = hivemind.CollaborativeOptimizer(
        opt=opt,
        dht=dht,
        scheduler=scheduler,
        prefix=collaboration_args_dict.pop('experiment_prefix'),
        compression_type=hivemind.utils.CompressionType.Value(
            collaboration_args_dict.pop('compression')),
        batch_size_per_step=total_batch_size_per_step,
        throughput=collaboration_args_dict.pop('bandwidth'),
        target_batch_size=adjusted_target_batch_size,
        client_mode=collaboration_args_dict.pop('client_mode'),
        verbose=True,
        start=True,
        **collaboration_args_dict)

    class TrainerWithIndependentShuffling(Trainer):
        def get_train_dataloader(self) -> DataLoader:
            """ Shuffle data independently for each peer to avoid duplicating batches [important for quality] """
            torch.manual_seed(hash(local_public_key))
            return super().get_train_dataloader()

    trainer = TrainerWithIndependentShuffling(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        optimizers=(collaborative_optimizer,
                    NoOpScheduler(collaborative_optimizer)),
        callbacks=[
            CollaborativeCallback(dht, collaborative_optimizer, model,
                                  local_public_key, statistics_expiration)
        ])
    trainer.remove_callback(transformers.trainer_callback.PrinterCallback)
    trainer.remove_callback(transformers.trainer_callback.ProgressCallback)

    # Training
    if training_args.do_train:
        latest_checkpoint_dir = max(Path(training_args.output_dir).glob('checkpoint*'),
                                    default=None,
                                    key=os.path.getctime)

        trainer.train(model_path=latest_checkpoint_dir)
Example #11
def main():
    parser = HfArgumentParser((AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path, cache_dir=dataset_args.cache_dir)

    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)

    # find latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'), default=None, key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info('Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    tokenized_dataset_path = Path(dataset_args.dataset_path)

    tokenized_datasets = load_from_disk(tokenized_dataset_path)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = FusedLAMB(
        optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
    )

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
    )

    trainer = CollaborativeTrainer(
        model=model, args=training_args, collaboration_args=collaboration_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler)
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=latest_checkpoint_dir)