    def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
        model = BertForMaskedLM(config=config)
        model.eval()
        loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])
        self.check_loss_output(result)
Example #2
    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Load the Juman tokenizer so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the pre-trained BERT model
        self.model = BertModel.from_pretrained(bert_path)
        # Load the tokenizer of the pre-trained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False, do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)

        # Load the MaskedLM task model of the pre-trained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Header and other special tokens to exclude
        except_tokens = ["[MASK]", 
        #"[PAD]",
        "[UNK]", "[CLS]", "[SEP]",
        "(", ")", "・", "/", "、", "。", "!", "?", "「", "」", "…", "’", "』", "『", ":", "※"
        ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]

        # Use every id in vocab_size except those in except_ids
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]
Example #3
    def initialize_detector(self):
        t1 = time.time()
        try:
            import kenlm
        except ImportError:
            raise ImportError(
                'mypycorrector dependencies are not fully installed; '
                'kenlm is required for the statistical language model. '
                'Please run "pip install kenlm" to install it. '
                'On Windows, install kenlm under Cygwin.')

        self.lm = kenlm.Model(self.language_model_path)
        logger.debug('Loaded language model: %s, spend: %s s' %
                     (self.language_model_path, str(time.time() - t1)))

        # Word and character frequency dicts
        t2 = time.time()
        self.word_freq = self.load_word_freq_dict(self.word_freq_path)
        self.char_freq = self.load_char_freq_dict(self.char_freq_path)
        t3 = time.time()
        logger.debug(
            'Loaded word freq, char freq file: %s, size: %d, spend: %s s' %
            (self.word_freq_path, len(self.word_freq), str(t3 - t2)))
        # Custom confusion set
        self.custom_confusion = self._get_custom_confusion_dict(
            self.custom_confusion_path)
        t4 = time.time()
        logger.debug('Loaded confusion file: %s, size: %d, spend: %s s' %
                     (self.custom_confusion_path, len(
                         self.custom_confusion), str(t4 - t3)))
        # Custom word-segmentation dictionary
        self.custom_word_freq = self.load_word_freq_dict(
            self.custom_word_freq_path)
        self.person_names = self.load_word_freq_dict(self.person_name_path)
        self.place_names = self.load_word_freq_dict(self.place_name_path)
        self.stopwords = self.load_word_freq_dict(self.stopwords_path)
        # Merge the segmentation dictionary with the custom dictionaries
        self.custom_word_freq.update(self.person_names)
        self.custom_word_freq.update(self.place_names)
        self.custom_word_freq.update(self.stopwords)

        self.word_freq.update(self.custom_word_freq)
        t5 = time.time()
        logger.debug('Loaded custom word file: %s, size: %d, spend: %s s' %
                     (self.custom_word_freq_path, len(
                         self.custom_word_freq), str(t5 - t4)))
        self.tokenizer = Tokenizer(dict_path=self.word_freq_path,
                                   custom_word_freq_dict=self.custom_word_freq,
                                   custom_confusion_dict=self.custom_confusion)
        # Pre-trained BERT model
        t6 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t6))
        self.initialized_detector = True
Example #4
File: bert.py Project: github-chx/nlpaug
    def __init__(self, model_path, tokenizer_path):
        super(Bert, self).__init__()
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
Example #5
def main(args):

    # set tokenizer
    vocab = PreDefinedVocab(
        vocab_file=args.vocab_file,
        unk_token='[UNK]',
        sep_token='[SEP]',
        pad_token='[PAD]',
        mask_token='[MASK]',
        cls_token='[CLS]',
    )

    tokenizer = WordpieceTokenizer(vocab)

    to_word = False

    # select a sampling module
    if args.sampling_strategy == 'random':
        sampling_fn = sampler.UniformSampler()

    # select an augmentation module
    if args.augmentation_strategy == 'dropout':
        generator_fn = generator.DropoutGenerator()
    elif args.augmentation_strategy == 'blank':
        generator_fn = generator.BlankGenerator(
            mask_token=tokenizer.vocab.mask_token)
    elif args.augmentation_strategy == 'unigram':
        generator_fn = generator.UnigramGenerator(
            args.unigram_frequency_for_generation)
        to_word = True
    elif args.augmentation_strategy == 'bigramkn':
        generator_fn = generator.BigramKNGenerator(
            args.bigram_frequency_for_generation)
        to_word = True
    elif args.augmentation_strategy == 'wordnet':
        generator_fn = generator.WordNetGenerator(lang=args.lang_for_wordnet)
        to_word = True
    elif args.augmentation_strategy == 'word2vec':
        generator_fn = generator.Word2vecGenerator(args.w2v_file)
        to_word = True
    elif args.augmentation_strategy == 'ppdb':
        generator_fn = generator.PPDBGenerator(args.ppdb_file)
        to_word = True
    elif args.augmentation_strategy == 'bert':
        from pytorch_transformers import BertTokenizer, BertForMaskedLM
        bert = BertForMaskedLM.from_pretrained(args.model_name_or_path)
        generator_fn = generator.BertGenerator(tokenizer, bert,
                                               args.temparature)

    augmentor_fn = augmentor.ReplacingAugmentor(tokenizer,
                                                sampling_fn,
                                                generator_fn,
                                                to_word=to_word)

    with open(args.input, 'r') as f:
        for line in f:
            line = line.rstrip()
            augmented_line = augmentor_fn(line, args.augmentation_rate)
            print(augmented_line)
Example #6
def prepare_models():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased',
                                      output_attentions=True)
    model.eval()
    mask_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    mask_model.eval()
    return tokenizer, model, mask_model
Example #7
File: bert.py Project: wshBak/nlpaug
    def __init__(self, model_path='bert-base-uncased', tokenizer_path=None, device='cuda'):
        super().__init__()
        self.model_path = model_path
        self.device = device

        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
Example #8
    def initialize_bert_detector(self):
        t1 = time.time()
        self.bert_tokenizer = BertTokenizer(vocab_file=self.bert_model_vocab)
        self.MASK_TOKEN = "[MASK]"
        self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids(
            [self.MASK_TOKEN])[0]
        # Prepare model
        self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
        logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                     (self.bert_model_dir, time.time() - t1))
        self.initialized_bert_detector = True
Example #9
    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Load the Juman tokenizer so that Japanese text can be fed to BERT
        self.juman_tokenizer = JumanTokenizer()
        # Load the MaskedLM task model of the pre-trained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Load the tokenizer of the pre-trained BERT model
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Flag for whether to use a CUDA GPU
        self.use_cuda = use_cuda
Example #10
File: bertnlp.py Project: kohilin/ealm
def init(maxlen=512):
    global config, tokenizer, model, sim_model, MAX_LENGTH
    MAX_LENGTH = maxlen

    bert_model_name = 'bert-base-uncased'
    config = BertConfig.from_pretrained(bert_model_name)
    config.output_hidden_states = True
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertForMaskedLM.from_pretrained(bert_model_name, config=config)
    model.to(DEVICE)
    model.eval()

    sim_model = smodel.WebBertSimilarity(device=DEVICE)
Example #11
    def __init__(self, vocabulary, config):
        spacy.prefer_gpu()
        self.spacyNlp = spacy.load("en_core_web_lg")
        self.simThreshold = 0.70

        self.modelName = config["bertModel"]
        self.guesses = config["numberOfGuesses"]
        self.useSimilarity = config["useSimilarity"]
        self.useSynonyms = config["useSynonyms"]

        self.bertModel = BertForMaskedLM.from_pretrained(self.modelName).cuda()
        self.bertTokenizer = BertTokenizer.from_pretrained(self.modelName)
        self.maskingToken = "[MASK]"
        self.paddingToken = "[PAD]"
        self.startToken = "[CLS]"
        self.endToken = "[SEP]"

        self.vocabulary = vocabulary
Example #12
def sample_predict_token():
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize input
    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
    tokenized_text = tokenizer.tokenize(text)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_index = 8
    tokenized_text[masked_index] = '[MASK]'
    assert tokenized_text == [
        '[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]',
        'was', 'a', 'puppet', '##eer', '[SEP]'
    ]

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    # If you have a GPU, put everything on cuda
    #tokens_tensor = tokens_tensor.to('cuda')
    #segments_tensors = segments_tensors.to('cuda')
    #model.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    # confirm we were able to predict 'henson'
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)
    assert predicted_token == 'henson'
Example #13
    def __init__(self,
                 model_path='bert-base-uncased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 device='cuda'):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p)
        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--bert-model',
                        type=str,
                        default='bert-base-multilingual-uncased')
    args = parser.parse_args()

    model = BertForMaskedLM.from_pretrained(args.bert_model).cuda()
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    while True:
        with torch.no_grad():
            sentence = input('> ')
            bundle = SingleInputBundle([tokenizer.tokenize(sentence)],
                                       tokenizer.vocab)
            bundle.cuda()
            print(
                predict_top_k(model, tokenizer.vocab, tokenizer.ids_to_tokens,
                              bundle))
Example #15
    def __init__(self, model_directory, vocab_file, lower=False):

        # Load pre-trained model (weights)

        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=lower)

        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]
Example #16
    def __init__(self, tokenizer, bert_path):
        self.tokenizer = tokenizer
        self.vocab_size = len(self.tokenizer.vocab)

        # Load the MaskedLM task model of the pre-trained BERT model
        self.model = BertForMaskedLM.from_pretrained(bert_path)

        # Header and other special tokens to exclude
        except_tokens = [
            "[MASK]",
            #"[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]"
        ]
        self.except_ids = [
            self.tokenizer.vocab[token] for token in except_tokens
        ]

        # Use every id in vocab_size except those in except_ids
        self.candidate_ids = [
            i for i in range(self.vocab_size) if i not in self.except_ids
        ]
Example #17
#!/usr/bin/python3
import torch
from pytorch_transformers import BertForMaskedLM, BertTokenizer
import sys
import torch.nn.functional as F
import numpy as np

# Load pre-trained model and tokenizer
model = BertForMaskedLM.from_pretrained('bert-large-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Read items from file
with open('items_agr_punct.csv', encoding='utf8') as f:
    text = f.read().splitlines()

# Write to file
orig_stdout = sys.stdout
f = open('out_agr_punct.txt', 'w')
sys.stdout = f

# Write Column Headers
print("Surprisal, VerbCondition, FillerCondition, EmbeddingLevel")

for s in text:
    splits = s.split(',')
    item = "[CLS] " + splits[0] + " [SEP]"
    tokenized_text = tokenizer.tokenize(item)

    # Find index of the masked token
    words = splits[0].split(' ')
    masked_index = words.index('[MASK]') + 1
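    # The excerpt stops here; below is a minimal, hedged sketch of one way the loop
    # could continue to produce the "Surprisal" column named in the header. The
    # assumption that the target word sits in the second CSV column (splits[1]) and
    # the use of log-softmax are mine, not taken from the original script.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        predictions = model(tokens_tensor)[0]
    log_probs = F.log_softmax(predictions[0, masked_index], dim=-1)
    target_id = tokenizer.convert_tokens_to_ids([splits[1]])[0]  # assumed target-word column
    surprisal = -log_probs[target_id].item()
    print("%.4f, %s" % (surprisal, ", ".join(splits[1:])))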
Example #18
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--wp",
                        type=bool,
                        default=False,
                        help="if train on wp")
    parser.add_argument(
        '--from_scratch',
        action='store_true',
        help='do not load pretrained model, only randomly initialize')
    parser.add_argument("--output_step",
                        type=int,
                        default=100000,
                        help="Number of step to save model")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)
    # Set seed
    set_seed(args)

    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    while True:
        try:
            tokenizer = BertTokenizer.from_pretrained(
                args.bert_model, do_lower_case=args.do_lower_case)
            if tokenizer._noi_token is None:
                tokenizer._noi_token = '[NOI]'
                if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
                else:
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
                # else:
                #     raise ValueError("No clear choice for insert NOI for tokenizer type {}".format(args.model_name_or_path))
                tokenizer.ids_to_tokens[1] = '[NOI]'
                logger.info("Adding [NOI] to the vocabulary 1")
        except Exception:
            continue
        break

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.from_scratch:
        # Random initialization from the model's config only
        # (assumes BertConfig is imported alongside BertForMaskedLM)
        model = BertForMaskedLM(BertConfig.from_pretrained(args.bert_model))
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory,
            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(
                input_ids,
                segment_ids,
                input_mask,
                lm_label_ids,
            )
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1

            if global_step % args.output_step == 0 and args.local_rank in [
                    -1, 0
            ]:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

        if args.local_rank in [-1, 0]:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
        logger.info("PROGRESS: {}%".format(
            round(100 * (epoch + 1) / args.epochs, 4)))
        logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
Example #19
File: run_lama.py Project: kenanfa3/ebert
    model_emb = load_embedding(args.modelname)

    allowed_vocabulary = None
    if args.allowed_vocabulary:
        with open(args.allowed_vocabulary) as handle:
            lines = [line.strip() for line in handle]
        encoded = [
            model_emb.tokenizer.encode(token, add_special_tokens=False)
            for token in lines
        ]
        assert all([len(x) == 1 for x in encoded])
        allowed_vocabulary = set([x[0] for x in encoded if len(x) == 1])

    model = EmbInputBertModel.from_pretrained(args.modelname,
                                              output_attentions=True)
    language_model = BertForMaskedLM.from_pretrained(args.modelname).cls

    model = model.to(device=args.device)
    language_model = language_model.to(device=args.device)

    model.eval()
    language_model.eval()

    mappers = {
        method: load_mapper(f"{args.wikiname}.{args.modelname}.{method}")
        for method in args.methods
    }

    for pattern in tqdm(patterns):
        relation, template = pattern["relation"], pattern["template"]
Example #20
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir',
                        type=Path,
                        required=False,
                        default=None)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case",
                        type=boolean_string,
                        default=False,
                        action="store_true")
    parser.add_argument(
        "--reduce_memory",
        type=boolean_string,
        default=False,
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        type=boolean_string,
                        default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        type=boolean_string,
        default=False,
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--type",
                        default="greedy",
                        type=str,
                        help="greedy: greedy generation. sample: sampling")
    parser.add_argument('--noi_decay',
                        type=int,
                        default=3,
                        help="round number to decay NOI prob")
    parser.add_argument('--reduce_decay',
                        type=int,
                        default=1,
                        help="round number to decay reduce prob")
    parser.add_argument('--verbose', type=int, default=1, help="verbose level")
    parser.add_argument('--n_test',
                        type=int,
                        default=5000,
                        help="number of test examples")
    parser.add_argument('--prevent',
                        type=boolean_string,
                        default=True,
                        help="avoid generating several words")
    parser.add_argument('--reduce_stop',
                        type=boolean_string,
                        default=True,
                        help="reduce stopwords")
    parser.add_argument('--lessrepeat',
                        action='store_true',
                        help="reduce repetition (only for tokenwise)")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    if not args.output_dir:
        args.output_dir = args.bert_model

    epoch_file = args.pregenerated_data / f"test.key.txt"
    total_examples = 1000
    args.max_seq_length = 256

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    args.device = device
    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)
    # Set seed
    set_seed(args)

    args.output_mode = "classification"

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)

    sep_tok = tokenizer.vocab['[SEP]']
    cls_tok = tokenizer.vocab['[CLS]']
    pad_tok = tokenizer.vocab['[PAD]']

    model.to(device)
    model.eval()

    print(args)

    logging.info("***** Running generation *****")
    logging.info(f"  Num examples = {total_examples}")
    logging.info("  Batch size = %d", args.batch_size)

    epoch_dataset = PregeneratedDataset(epoch=0,
                                        training_path=args.pregenerated_data,
                                        tokenizer=tokenizer,
                                        num_data_epochs=1)
    epoch_sampler = SequentialSampler(epoch_dataset)
    generate_dataloader = DataLoader(epoch_dataset,
                                     sampler=epoch_sampler,
                                     batch_size=args.batch_size)
    file_name = os.path.join(args.output_dir, f"{args.type}.txt")
    f = open(file_name, "w", 1)
    print(file_name)

    prevent = [tokenizer.vocab.get(x)
               for x in PREVENT_LIST] if args.prevent else None
    reduce_list = (REDUCE_LIST | STOP_LIST) if args.reduce_stop else REDUCE_LIST
    reduce = None
    if args.prevent:
        reduce = [tokenizer.vocab.get(x) for x in reduce_list]
        reduce = [s for s in reduce if s]

    with tqdm(total=len(generate_dataloader), desc=f"Epoch {0}") as pbar:
        for step, batch in enumerate(generate_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            if args.type == "greedy":
                predict_ids = greedy_search(model,
                                            input_ids,
                                            segment_ids,
                                            input_mask,
                                            args=args,
                                            tokenizer=tokenizer,
                                            prevent=prevent,
                                            reduce=reduce)
            elif args.type == 'sampling':
                predict_ids = sample_generate(model,
                                              input_ids,
                                              segment_ids,
                                              input_mask,
                                              temperature=0.8,
                                              args=args,
                                              tokenizer=tokenizer,
                                              prevent=prevent,
                                              reduce=reduce)
            else:
                raise NotImplementedError
            output = " ".join([
                str(
                    tokenizer.ids_to_tokens.get(x, "noa").encode(
                        'ascii', 'ignore').decode('ascii'))
                for x in predict_ids[0].detach().cpu().numpy()
                if x != sep_tok and x != pad_tok and x != cls_tok
            ]) + "\n"
            output = output.replace(" ##", "")
            f.write(output)
            pbar.update(1)
Example #21
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
        self.model = BertForMaskedLM.from_pretrained(MODEL_PATH)
        self.model.eval()
        self.model.to(DEVICE)
Example #22
from flask import Flask, request
from flask_cors import CORS
import torch
import numpy as np
from pytorch_transformers import BertTokenizer, BertForMaskedLM
import nltk

app = Flask(__name__)
CORS(app)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased',
                                        output_attentions=True)
model.eval()


@app.route('/fillblanks', methods=['POST'])
def predict():
    sentence_orig = request.form.get('text')
    if '____' not in sentence_orig:
        return sentence_orig

    sentence = sentence_orig.replace('____', 'MASK')
    tokens = nltk.word_tokenize(sentence)
    sentences = nltk.sent_tokenize(sentence)
    sentence = " [SEP] ".join(sentences)
    sentence = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(sentence)
    masked_index = tokenized_text.index('mask')
    tokenized_text[masked_index] = "[MASK]"
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
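    # The excerpt ends here; below is a minimal, hedged sketch of how the route could
    # finish, greedily filling the blank. The argmax decoding and the plain-text
    # response are assumptions, not part of the original snippet.
    segments_ids = [0] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        predictions = model(tokens_tensor, token_type_ids=segments_tensors)[0]
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    return sentence_orig.replace('____', predicted_token)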
Example #23
torch.manual_seed(0)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
random.seed(0)

parser = argparse.ArgumentParser()
parser.add_argument('--file', type=str, default='../PlotExtraction/fairy.txt')
parser.add_argument('--outfile', type=str, default='bert_fairy.txt')
parser.add_argument('--model', type=str, default='./bert/fairy')

args = parser.parse_args()

tokenizer = BertTokenizer.from_pretrained(args.model)
model = BertForMaskedLM.from_pretrained(args.model, output_attentions=False)
model.eval()


def capitalizeFirst(phrase):
    words = phrase.split()
    words[0] = words[0].capitalize()
    return ' '.join(words)


def is_punctuation(s):
    return len(set(s).intersection(set(string.punctuation))) > 0


def getScore(sentence):
    tokenized_text = tokenizer.tokenize('[CLS] ' + "[MASK] " + sentence + ' [SEP]')
Example #24
                input_ids = torch.tensor(tokenizer.encode(s),
                                         device=device).unsqueeze(
                                             0)  # Batch size 1
                results.append(
                    clf.forward(input_ids)[0].detach().cpu().numpy().flatten())
        return np.array(results).reshape(-1, 2)


print('loading models and data...')
default = 'bert-base-uncased'
mdir = '/scratch/users/vision/chandan/pacmed/glue/SST-2-3epoch'  # '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/'
device = 'cpu'

tokenizer = BertTokenizer.from_pretrained(mdir)
clf = BertForSequenceClassification.from_pretrained(mdir).eval().to(device)
masked_predictor = BertForMaskedLM.from_pretrained(default).eval().to(device)

lines = open('data/stsa.binary.test', 'r').read()
lines = [line for line in lines.split('\n') if line != '']
classes = [int(line[0]) for line in lines]
reviews = [line[2:] for line in lines]

num_reviews = 1821  # 1821
save_freq = 1
scores_iid = {}
scores_conditional = {}
scores_remove = {}
scores_lime = {}

# loop over reviews
print('looping over dset...')
Example #25
from flask import Flask, request
from flask_cors import CORS
import torch
import random
import numpy as np
from pytorch_transformers import BertTokenizer, BertForMaskedLM
import nltk

app = Flask(__name__)
CORS(app)

base_dir = '/finetuned_lm-review/finetuned_lm'
tokenizer = BertTokenizer.from_pretrained(base_dir)
model = BertForMaskedLM.from_pretrained(base_dir, output_attentions=False)
model.eval()


def duplicates(lst, item):
    return [i for i, x in enumerate(lst) if x == item]


@app.route('/autocomplete', methods=['POST'])
def predict():
    sentence = ""
    sentence_orig = request.form.get('text')
    sentence_length = request.form.get('len')
    decoding_type = request.form.get('decoding_type')
    domain_type = request.form.get('domain_type')
    filler = ' '.join(['MASK' for _ in range(int(sentence_length))])

    if domain_type == 'review':
Example #26
def main():
    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()
    set_seed(args.seed)

    if args.prefix_file: prefix_sampler = torch.load(args.prefix_file)
    if args.transfo:
        tokenizer = TransfoXLTokenizer.from_pretrained(args.transfo_model)
        model = TransfoXLLMHeadModel.from_pretrained(args.transfo_model)
    elif args.bert:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_model)
        model = GPT2LMHeadModel.from_pretrained(args.gpt2_model)
        init_sos(model)
    if args.resume:
        model.load_state_dict(
            torch.load(args.resume, map_location=lambda s, l: s))
    if not args.simple_sample: model = nn.DataParallel(model)
    model.cuda()

    if args.bert:
        text_batches = list(split(list(sys.stdin), 128))
        for text_batch in tqdm(text_batches, desc='Augmenting'):
            for _ in range(args.num_samples):
                mtext_batch = [
                    ' '.join('[MASK]' if (
                        random.random() < 0.2 and '\t' not in x) else x
                             for x in sent.split(' ')) for sent in text_batch
                ]
                print('\n'.join(
                    x.replace('[SEP]', '\t').strip() for x in augment_texts(
                        model, tokenizer, mtext_batch, max_len=args.msl)))
                sys.stdout.flush()
        return

    sample_batches = [
        SampleBatch(model, tokenizer, prefix_sampler)
        for _ in range(args.num_buffers)
    ]
    if args.simple_sample:
        for _ in tqdm(range(args.num_samples)):
            print(sample_batches[0].simple_sample(pair=args.paired,
                                                  transfo=args.transfo))
            sys.stdout.flush()
        return

    n_output = 0
    pbar = tqdm(total=args.num_samples, desc='Generating')
    while n_output < args.num_samples:
        try:
            sample_batch = random.choice(sample_batches)
            sample_batch.try_add_sample()
            fin_texts = sample_batch.step(pair=args.paired)
        except ValueError:
            sample_batch.try_add_sample()
            continue
        for fin_text in fin_texts:
            if n_output >= args.num_samples:
                return
            print(fin_text.replace(EOS_TOKEN, '').replace('<eos>', '\t'))
            sys.stdout.flush()
            pbar.update(1)
            n_output += 1
            if (n_output + 1) % args.balance_every == 0:
                pbar.set_postfix(dict(last_balance=n_output))
                SampleBatch.balance(sample_batches)