Example #1
    def test_inference_masked_lm(self):
        model = RobertaForMaskedLM.from_pretrained("roberta-base")

        input_ids = torch.tensor(
            [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(output.shape, expected_shape)
        # compare the actual values for a slice.
        expected_slice = torch.tensor([[[33.8802, -4.3103, 22.7761],
                                        [4.6539, -2.8098, 13.6253],
                                        [1.8228, -3.6898, 8.8600]]])

        # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
        # roberta.eval()
        # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()

        self.assertTrue(
            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example #2
    def __init__(self,
                 model_path='roberta-base',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 device='cuda'):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p)
        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model = RobertaForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
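
    # Hedged usage sketch (not part of the original class): one way the
    # RobertaForMaskedLM / RobertaTokenizer loaded above could fill a single
    # <mask> token with temperature and top-k sampling. The method name
    # `fill_mask_sample` is hypothetical; it assumes `torch` is imported in
    # this module and that the base class stores `temperature` and `top_k`
    # as attributes of the same name (top_p handling omitted).
    def fill_mask_sample(self, text):
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)
        mask_positions = (
            inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)
        with torch.no_grad():
            logits = self.model(**inputs)[0]
        # logits at the first <mask> position, scaled by temperature
        mask_logits = logits[mask_positions][0] / self.temperature
        if self.top_k is not None:
            top = mask_logits.topk(self.top_k)
            probs = torch.softmax(top.values, dim=-1)
            token_id = top.indices[torch.multinomial(probs, 1)].item()
        else:
            token_id = torch.multinomial(
                torch.softmax(mask_logits, dim=-1), 1).item()
        return self.tokenizer.decode([token_id]).strip()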
Example #3
    def test_inference_masked_lm(self):
        model = RobertaForMaskedLM.from_pretrained('roberta-base')

        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 50265))
        self.assertEqual(
            output.shape,
            expected_shape
        )
        # compare the actual values for a slice.
        expected_slice = torch.Tensor(
            [[[33.8843, -4.3107, 22.7779],
              [ 4.6533, -2.8099, 13.6252],
              [ 1.8222, -3.6898,  8.8600]]]
        )
        self.assertTrue(
            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
        )
Example #4
    def test_tokenize(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
        tokenized_smiles = [
            12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17,
            16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17,
            16, 38, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16,
            16, 22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16,
            20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13
        ]

        model = RobertaForMaskedLM.from_pretrained(
            'seyonec/SMILES_tokenized_PubChem_shard00_50k')
        model.num_parameters()

        tokenizer = SmilesTokenizer(
            vocab_path, max_len=model.config.max_position_embeddings)

        assert tokenized_smiles == tokenizer.encode(
            "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
        )
Example #5
    def __init__(self, args):
        # self.dict_file = "{}/{}".format(args.roberta_model_dir, args.roberta_vocab_name)
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        if args.model_path is not None:
            print("Testing CoLAKE...")
            print('loading model parameters from {}...'.format(
                args.model_path))
            config = RobertaConfig.from_pretrained('roberta-base',
                                                   type_vocab_size=3)
            self.model = RobertaForMaskedLM(config=config)
            states_dict = torch.load(os.path.join(args.model_path,
                                                  'model.bin'))
            self.model.load_state_dict(states_dict, strict=False)
        else:
            print("Testing RoBERTa baseline...")
            self.model = RobertaForMaskedLM.from_pretrained('roberta-base')

        self._build_vocab()
        self._init_inverse_vocab()
        self._model_device = 'cpu'
        self.max_sentence_length = args.max_sentence_length
Example #6
def test_sequence():
    result_dir = "../results/"

    tok_dir = "tokenizer_model/"
    tokenizer = train.get_tok(tok_dir)

    csv_path = "../utils/test.csv"
    txt_name = "test.txt"
    utils.make_train_txt(csv_path, txt_name)

    dp = DP("../utils/test.csv")
    param = dp.param.to_numpy()[-7:]
    param = param.tolist()
    param.append("probability")

    mod_dir = "transformer_model/checkpoint-33000/"
    model = RobertaForMaskedLM.from_pretrained(mod_dir)
    fill_mask = pipeline("fill-mask",
                         model=model,
                         tokenizer=tokenizer,
                         device=0)

    PD, PB, ill, count = start(fill_mask)

    denorm = dp.denormalize(ill)

    l = []
    for x in range(len(PD)):
        tmp = []

        for j in range(7):
            tmp.append(random.randrange(denorm[x][j][0], denorm[x][j][1] + 1))
        tmp.append(PB[x])

        l.append(tmp)

    df = pd.DataFrame(l, columns=param)
    df.to_csv(result_dir + "result.csv", index=False, encoding="cp949")
Example #7
    def __init__(self, config):
        super().__init__()
        self.train_config = config

        self.roberta = RobertaForMaskedLM.from_pretrained('roberta-base')
        _ = self.roberta.eval()
        for param in self.roberta.parameters():
            param.requires_grad = False

        self.pred_model = self.roberta.roberta
        self.enc_model = self.pred_model.embeddings.word_embeddings

        # self.proj_head = DVProjectionHead()
        # self.proj_head = DVProjectionHead_ActiFirst()
        self.proj_head = DVProjectionHead_EmbActi()

        self.tkz = RobertaTokenizer.from_pretrained("roberta-base")
        self.collator = TokenizerCollate(self.tkz)

        self.lossfunc = nn.BCEWithLogitsLoss()

        self.acc = Accuracy(threshold=0.0)
        self.f1 = F1(threshold=0.0)
Example #8
def evaluate(args):
    """
    Args:
        ckpt: model checkpoints.
        hparams_file: the string should end with "hparams.yaml"
    """
    trainer = Trainer(gpus=args.gpus,
                      distributed_backend=args.distributed_backend,
                      deterministic=True)

    # reload test dataloader
    # print(trainer.test())
    print("path_to_model_checkpoint", args.path_to_model_checkpoint)
    # print(BertForQA)

    model = BertForQA.load_from_checkpoint(
        checkpoint_path=args.path_to_model_checkpoint,
        hparams_file=args.path_to_model_hparams_file,
        map_location=None,
        batch_size=args.eval_batch_size,
    )

    mlm_model = RobertaForMaskedLM.from_pretrained(
        './cached_models/roberta_squad1_covidmlm(train_and_dev)_3epoch/')
    model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())

    # mlm_model = RobertaForMaskedLM.from_pretrained('./cached_models/roberta_squad1_2epoch_covidmlm_3epoch/')
    # model.model.roberta.load_state_dict(mlm_model.roberta.state_dict())
    # # evaluate ner
    # model = BertForNERTask.load_from_checkpoint(
    #     checkpoint_path=args.path_to_model_checkpoint,
    #     hparams_file=args.path_to_model_hparams_file,
    #     map_location=None,
    #     batch_size=args.eval_batch_size
    # )

    trainer.test(model=model)
Example #9
    parser = argparse.ArgumentParser(
        description="Extract some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
    )
    parser.add_argument("--model_type",
                        default="roberta",
                        choices=["roberta", "gpt2"])
    parser.add_argument("--model_name", default="roberta-large", type=str)
    parser.add_argument("--dump_checkpoint",
                        default="serialization_dir/tf_roberta_048131723.pth",
                        type=str)
    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

    if args.model_type == "roberta":
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
        prefix = "roberta"
    elif args.model_type == "gpt2":
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
        prefix = "transformer"

    state_dict = model.state_dict()
    compressed_sd = {}

    # Embeddings #
    if args.model_type == "gpt2":
        for param_name in ["wte.weight", "wpe.weight"]:
            compressed_sd[f"{prefix}.{param_name}"] = state_dict[
                f"{prefix}.{param_name}"]
    else:
        for w in [
Example #10
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=50265,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
)
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base',
                                                 do_lower_case=False)
model = RobertaForMaskedLM.from_pretrained('roberta-base', config=config)
# 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path='IMDB_train.csv.txt',
                                block_size=128)
#dataset = load_dataset("./csv_for_ft_new.py", data_files=file_path)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    do_train=True,
    do_predict=True,
    output_dir=savedname,
    overwrite_output_dir=True,
Example #11
from typing import Optional, List, Callable, Any, Dict
import torch
from transformers import FillMaskPipeline
from transformers import RobertaTokenizerFast, RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("roberta-large")
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large")
device_number = torch.cuda.current_device() if torch.cuda.is_available() else -1

mask_predictor = FillMaskPipeline(model=model,
                                  tokenizer=tokenizer,
                                  device=device_number)

#
# def predict_mask(text, predictor=mask_predictor, options: Optional[List[str]] = None, num_results: Optional[int] = 1):
#
#
#     results = predictor(text, targets=options, top_k=num_results)
#     parsed_results = []
#     for result in results:
#         parsed_result = {"word": _postprocess_mask_prediction_token(result['token_str']),
#                          "softmax": result["score"]}
#         parsed_results.append(parsed_result)
#
#     return parsed_results


def _postprocess_mask_prediction_token(text):
    return text[1:] if text[0] == "Ġ" else text
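

# Hedged usage sketch: roughly what the commented-out predict_mask above does,
# using the FillMaskPipeline and the helper defined in this file. The function
# name `predict_mask_sketch` and the exact return format are assumptions.
def predict_mask_sketch(text: str,
                        options: Optional[List[str]] = None,
                        num_results: int = 1) -> List[Dict[str, Any]]:
    results = mask_predictor(text, targets=options, top_k=num_results)
    parsed_results = []
    for result in results:
        parsed_results.append({
            "word": _postprocess_mask_prediction_token(result["token_str"]),
            "softmax": result["score"],
        })
    return parsed_results


# e.g. predict_mask_sketch("The capital of France is <mask>.", num_results=3)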
Example #12
def gen_neighborhood(args):

    # initialize seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    # load model and tokenizer

    # slow tokenizer is for non-unk decoding
    if args.is_roberta:
        r_model = RobertaForMaskedLM.from_pretrained(args.model)
        tokenizer = RobertaTokenizerFast.from_pretrained(
            args.tokenizer, max_len=512
        )
        old_style_tokenizer = RobertaTokenizer.from_pretrained(
            args.tokenizer, max_len=512
        )
        mask_length = max(
            len(tokenizer.vocab), r_model.lm_head.decoder.out_features
        )
        start_ignore = min(
            len(tokenizer.vocab), r_model.lm_head.decoder.out_features
        )
    else:
        tokenizer = BertTokenizerFast.from_pretrained(
            args.tokenizer,
            clean_text=True,
            tokenize_chinese_chars=True,
            strip_accents=False,
            do_lower_case=False,
        )
        old_style_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer,
            do_lower_case=False,
            strip_accents=False,
            tokenize_chinese_chars=True,
        )
        r_model = BertForMaskedLM.from_pretrained(args.model)

        assert (
            len(tokenizer.vocab)
            == r_model.cls.predictions.decoder.out_features
        )
        mask_length = len(tokenizer.vocab)
        start_ignore = mask_length

    r_model.eval()
    if torch.cuda.is_available():
        r_model.cuda()

    # remove unused vocab and special ids from sampling
    softmax_mask = np.full(mask_length, False)
    softmax_mask[tokenizer.all_special_ids] = True
    for k, v in tokenizer.vocab.items():
        if "[unused" in k:
            softmax_mask[v] = True
    for i in range(start_ignore, mask_length):
        # this is what happens if your vocab is smaller than it claims to be
        # we'll never use the rest of the ids anyways so we should mask them
        softmax_mask[i] = True
        if not args.is_roberta:
            import pdb

            pdb.set_trace()

    # load the inputs and labels
    lines = [
        tuple(s.strip().split("\t")) for s in open(args.in_file).readlines()
    ]
    num_lines = len(lines)
    # lines[i] is a list of [s], where s is each sentence in the ith column
    # of the file
    lines = [[[s] for s in s_list] for s_list in list(zip(*lines))]

    # load label file if it exists
    if args.label_file:
        labels = [s.strip() for s in open(args.label_file).readlines()]
        output_labels = True
    else:
        labels = [0] * num_lines
        output_labels = False

    # shard the input and labels
    if args.num_shards > 0:
        shard_start = (int(num_lines / args.num_shards) + 1) * args.shard
        shard_end = (int(num_lines / args.num_shards) + 1) * (args.shard + 1)
        lines = [s_list[shard_start:shard_end] for s_list in lines]
        labels = labels[shard_start:shard_end]

    # open output files
    if args.num_shards != 1:
        s_rec_file = open(args.output_prefix + "_" + str(args.shard), "w")
        unk_rec_file = open(
            args.output_prefix + "_unks_" + str(args.shard), "w"
        )
        if output_labels:
            l_rec_file = open(
                args.output_prefix + "_" + str(args.shard) + ".label", "w"
            )
    else:
        s_rec_file = open(args.output_prefix, "w")
        unk_rec_file = open(args.output_prefix + "_unks", "w")
        if output_labels:
            l_rec_file = open(args.output_prefix + ".label", "w")

    # sentences and labels to process
    sents = []
    l = []

    # number sentences generated
    num_gen = []

    # sentence index to noise from
    gen_index = []

    # number of tries generating a new sentence
    num_tries = []

    # next sentence index to draw from
    next_sent = 0

    # indices and words corresponding to each instance of [UNK] / <unk> for the
    # sentences in sents
    unks = []

    sents, l, next_sent, num_gen, num_tries, gen_index, unks = fill_batch(
        args,
        tokenizer,
        sents,
        l,
        lines,
        labels,
        next_sent,
        num_gen,
        num_tries,
        gen_index,
        unks,
        old_style_tokenizer,
    )

    total_unks_in_base_corpus = 0

    # main augmentation loop
    while sents != []:

        # remove any sentences that are done generating and dump to file
        for i in range(len(num_gen))[::-1]:
            if num_gen[i] == args.num_samples or num_tries[i] > args.max_tries:

                # get sent info
                gen_sents = sents.pop(i)
                num_gen.pop(i)
                gen_index.pop(i)
                label = l.pop(i)
                unk_list = unks.pop(i)

                total_unks_in_base_corpus += len(unk_list[0][0])

                # write generated sentences
                for sg, unk in zip(gen_sents[1:], unk_list[1:]):
                    # the [1:-1] here refers to some weirdness that repr() does
                    # on strings -- namely, adding quotes at the start and end
                    de_unked = [
                        de_unk(repr(val)[1:-1], unk[i], tokenizer)
                        for i, val in enumerate(sg)
                    ]
                    orig = [repr(val)[1:-1] for val in sg]

                    s_rec_file.write("\t".join(de_unked) + "\n")
                    unk_rec_file.write("\t".join(orig) + "\n")
                    if output_labels:
                        l_rec_file.write(label + "\n")

        # fill batch
        sents, l, next_sent, num_gen, num_tries, gen_index, unks = fill_batch(
            args,
            tokenizer,
            sents,
            l,
            lines,
            labels,
            next_sent,
            num_gen,
            num_tries,
            gen_index,
            unks,
            old_style_tokenizer,
        )

        # break if done dumping
        if len(sents) == 0:
            print(f"Total unks in base corpus: {total_unks_in_base_corpus}")
            break

        # build batch
        toks = []
        masks = []

        for i in range(len(gen_index)):
            s = sents[i][gen_index[i]]
            tok, mask = hf_masked_encode(
                tokenizer,
                *s,
                noise_prob=args.noise_prob,
                random_token_prob=args.random_token_prob,
                leave_unmasked_prob=args.leave_unmasked_prob,
            )
            toks.append(tok)
            masks.append(mask)

        # pad up to max len input
        max_len = max([len(tok) for tok in toks])
        pad_tok = tokenizer.pad_token_id

        toks = [
            F.pad(tok, (0, max_len - len(tok)), "constant", pad_tok)
            for tok in toks
        ]
        masks = [
            F.pad(mask, (0, max_len - len(mask)), "constant", pad_tok)
            for mask in masks
        ]
        toks = torch.stack(toks)
        masks = torch.stack(masks)

        # load to GPU if available
        if torch.cuda.is_available():
            toks = toks.cuda()
            masks = masks.cuda()

        # predict reconstruction
        rec, rec_masks = hf_reconstruction_prob_tok(
            toks,
            masks,
            tokenizer,
            r_model,
            softmax_mask,
            reconstruct=True,
            topk=args.topk,
        )

        # decode reconstructions and append to lists
        for i in range(len(rec)):
            rec_work = rec[i].cpu().tolist()
            meaningful_ids = [
                val for val in rec_work if val != tokenizer.pad_token_id
            ][1:-1]
            # TODO (maybe) make this work with multiple sentences
            # this would involve splitting on tokenizer.sep_token_id and
            # doing this for each sentence
            curr_unk_data = get_current_unk_data(
                meaningful_ids, unks[i][0][0], tokenizer.unk_token_id
            )
            curr_unk_data = (curr_unk_data,)

            s_rec = [
                s.strip()
                for s in tokenizer.decode(meaningful_ids).split(
                    tokenizer.sep_token
                )
            ]
            s_rec = tuple(s_rec)

            # check if identical reconstruction or empty
            if s_rec not in sents[i] and "" not in s_rec:
                sents[i].append(s_rec)
                num_gen[i] += 1
                num_tries[i] = 0
                gen_index[i] = 0
                unks[i].append(curr_unk_data)

            # otherwise try next sentence
            else:
                num_tries[i] += 1
                gen_index[i] += 1
                if gen_index[i] == len(sents[i]):
                    gen_index[i] = 0

        # clean up tensors
        del toks
        del masks
Example #13
    def _get_masked_language_model(self):
        """
        Initializes the RoBERTaForMaskedLM transformer
        """
        self.mlm = RobertaForMaskedLM.from_pretrained(self.model)
        self.mlm.eval()
Example #14
def create_long_model(model_specified, attention_window, max_pos,
                      save_model_to):
    """Starting from the `roberta-base` (or similar) checkpoint, the following function converts it into an instance of `RobertaLong`.
     It makes the following changes:
        1)extend the position embeddings from `512` positions to `max_pos`. In Longformer, we set `max_pos=4096`
        2)initialize the additional position embeddings by copying the embeddings of the first `512` positions.
            This initialization is crucial for the model performance (check table 6 in [the paper](https://arxiv.org/pdf/2004.05150.pdf)
            for performance without this initialization)
        3) replaces `modeling_bert.BertSelfAttention` objects with `modeling_longformer.LongformerSelfAttention` with a attention window size `attention_window`

        The output of this function works for long documents even without pretraining.
        Check tables 6 and 11 in [the paper](https://arxiv.org/pdf/2004.05150.pdf) to get a sense of 
        the expected performance of this model before pretraining."""

    model = RobertaForMaskedLM.from_pretrained(
        model_specified)  #,gradient_checkpointing=True)

    tokenizer = RobertaTokenizer.from_pretrained(model_specified,
                                                 model_max_length=max_pos)

    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos
    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(
            k +
            step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step

    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(
        new_pos_embed.data)

    # # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is correct; it has to be 4096 (the default) or your desired length
    # model.roberta.embeddings.position_ids = torch.arange(
    #     0, model.roberta.embeddings.position_embeddings.num_embeddings
    # )[None]

    model.roberta.embeddings.position_ids.data = torch.tensor(
        [i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)

        longformer_self_attn.query_global = copy.deepcopy(
            layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(
            layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(
            layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
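
# Hedged usage sketch for create_long_model above. The output directory name is
# illustrative; max_pos=4096 and attention_window=512 follow the docstring's
# Longformer settings. It assumes the `logger` used inside the function is
# configured elsewhere in this file, and note that reloading the saved
# checkpoint needs the matching `RobertaLong` class described in the docstring.
if __name__ == '__main__':
    create_long_model(model_specified='roberta-base',
                      attention_window=512,
                      max_pos=4096,
                      save_model_to='roberta-base-4096')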
Example #15
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold > 0:
            break
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]#  + additional_tokens
        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)

        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='</s>',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        model = RobertaForMaskedLM.from_pretrained(MODEL_PRETRAIN)

        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [trn_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [trn_loss, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [trn_loss, ]

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
                )
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    send_line_notification('fini!')

    sel_log('now saving best checkpoints...', logger)
Example #16
import torch
from transformers import RobertaForMaskedLM, RobertaTokenizer

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: move these global variables to local as we add metrics.
# TODO: finetune `MODEL` on the QA/dialog dataset for a better metric
MODEL = RobertaForMaskedLM.from_pretrained('roberta-large').to(DEVICE)
TOKENIZER = RobertaTokenizer.from_pretrained('roberta-large')


def mlm_metric(prompt, response):
    """
    Masked language model metric.

    (Need to double-check implementation.)

    Parameters
    ----------
    prompt : str
        The user comment.
    response : str
        The bot's response.

    Returns
    -------
    Sum of negative probabilities (lower is better).
    """
    tokens = TOKENIZER([[prompt, response]], return_tensors='pt')
    input_ids = tokens['input_ids'][0]
    dividing_indices = [
Example #17
tokenizer2 = PreTrainedTokenizerFast.from_pretrained(
    dataset_path / 'my-pretrained-tokenizer-fast2', max_len=128)

# 4. Check that the LM actually trained


def to_gpu(x, *args, **kwargs):
    return x.cuda(*args, **kwargs) if USE_GPU else x


# load trained model

# os.system('tar xzvf PanTadeuszRoBERTa.tgz')

model = RobertaForMaskedLM.from_pretrained(str(model_path))
model = to_gpu(model)
model.device

# ## generate
model.eval()

max_length = 100

eval = Evaluator(text_tokenizer, tokenizer2)

# gen1 = evaluate('chwycił na taśmie przypięty', max_length=max_length, temperature=1.0)
# print_eval(gen1)
# gen1

# print_eval(evaluate('Litwo! Ojczyzno', max_length=max_length, temperature=1.0))
Example #18
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Prepare model
    class tempmodel(nn.Module):
        def __init__(self, roberta, insert_net, delete_net):
            super().__init__()
            self.roberta = roberta
            self.insert_net = insert_net
            self.delete_net = delete_net

    roberta = RobertaForMaskedLM.from_pretrained("roberta-base")
    insert_net = nn.Linear(768, 3)
    delete_net = nn.Linear(768, 3)
    roberta.load_state_dict(
        torch.load(os.path.join(args.from_dir, 'bert_model.bin')))
    # if args.delete:
    #     insert_net.load_state_dict(torch.load(os.path.join(args.from_dir, 'insert_model.bin')))
    # else:
    from weight_init import weight_init
    insert_net.apply(weight_init)
    delete_net.apply(weight_init)
    # init CTRL code
    roberta.roberta.embeddings.word_embeddings.weight[
        50261, :].data = roberta.roberta.embeddings.word_embeddings.weight[
            0, :].data
    roberta.roberta.embeddings.word_embeddings.weight[
Example #19
file_in = sys.argv[1]
file_out = sys.argv[2]

all_data_dict = dict()
max_length = 100
tail_hidd_list = list()
#device = "cpu"
device = "cuda"

pretrained_weights = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_weights)

fine_tuned_weight = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(pretrained_weights,
                                           output_hidden_states=True,
                                           return_dict=True)
#model.load_state_dict(torch.load(fine_tuned_weight), strict=False)

#model.to(device).half()
model.to(device)
model.eval()

num_samples = 1000000

old = torch.FloatTensor(768)
with open(file_in) as f:
    #data = json.load(f)
    for index, d in tqdm(enumerate(f)):
        if index == 1000000:
            break
Example #20
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    bos_token_id=tokenizer2._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer2._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer2._tokenizer.token_to_id("<pad>"),
    attention_probs_dropout_prob=0.0,
    hidden_dropout_prob=0.0,
)
print(config)

# model = RobertaForMaskedLM(config=config)
# !tar xzvf "PanTadeuszRoBERTa.tgz"
model = RobertaForMaskedLM.from_pretrained('runs/run_4/model')
print(f'model.num_parameters(): {model.num_parameters()}')

# Build training/eval Datasets
print(fn_corpus)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer2,
    file_path=fn_corpus,
    block_size=128,
)

eval_dataset = LineByLineTextDataset(
    tokenizer=tokenizer2,
    file_path=fn_corpus_eval,
    block_size=128,
)
Example #21
def main():
    # See all possible arguments in src/transformers/training_args.py, or pass the --help flag to this script.
    # We now keep distinct sets of arguments for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file, parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"输出目录({training_args.output_dir}) 以及存在,并且不为空"
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu} "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the logging verbosity for the main process:
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("训练/评估参数 %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently download the dataset.
    if data_args.dataset_name is not None:
        # Download and load a dataset from the hub.
        # First make sure the dataset loading script is cached locally.
        cache_script = os.path.join("data", data_args.dataset_name+".py")
        if not os.path.exists(cache_script):
            raise Exception("Please check that the dataset loading script exists locally")
        datasets = load_dataset(path=cache_script, name=data_args.dataset_config_name, data_dir=data_args.data_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained method guarantees that only one local process can concurrently download the model and vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("你正从头开始初始化一个新的config.")
    # tokenizer的设置
    if model_args.tokenizer_name:
        if model_args.tokenizer_name == "myroberta":
            tokenizer = BertTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "您正在从头实例化一个新的tokenizer。 此脚本不支持此功能。 "
            "您可以用其它形式训练好之后,在这里使用,使用方法:  using --tokenizer_name."
        )
    # Model setup
    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'myroberta':
            model = RobertaForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
        else:
            model = AutoModelForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("从头开始训练一个模型")
        model = AutoModelForMaskedLM.from_config(config)
    # Resize the token embeddings to match the tokenizer; this is necessary when training a new model from scratch
    model.resize_token_embeddings(len(tokenizer))

    # Preprocess the datasets
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # Process line by line: tokenize each nonempty line
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            print(f"Received line lengths: {[len(t) for t in examples['text']]}")
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            tokenizer_res = tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )
            print(f"tokenizer之后的数据长度: {print([len(t) for t in tokenizer_res['input_ids']])}")
            return tokenizer_res

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text and then concatenate them together before splitting them into smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
        # By default, this processes 1000 rows at a time
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"参数给定的 max_seq_length  ({data_args.max_seq_length}) 比模型的 ({tokenizer.model_max_length}) 最大长度长. 使用模型的最大长度 max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

        # Main data processing function that concatenates all texts from the dataset and generates chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; if the model supported it we could add padding instead of dropping. Customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher value might be slower to preprocess.
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This configures the random masking of tokens
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** 开始评估 ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Example #22
        'prediction': prediction,
        'word_distance': word_distance,
        'word_length': word_length,
        'wer': wer,
        'char_distance': char_distance,
        'char_length': char_length,
        'cer': cer
    }

    return result


from transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("jordimas/julibert")
julibert = RobertaForMaskedLM.from_pretrained("jordimas/julibert").to("cuda")

print(julibert.config)


# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
    inputs = processor(batch["speech"],
                       sampling_rate=16_000,
                       return_tensors="pt",
                       padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"),
                       attention_mask=inputs.attention_mask.to("cuda")).logits
Example #23
    intermediate_size=256,
    max_position_embeddings=256,
    num_attention_heads=1,
    num_hidden_layers=1,
    type_vocab_size=1,
    bos_token_id=tokenizer._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer._tokenizer.token_to_id("<pad>"),
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.3,
)

print(config)

# model = RobertaForMaskedLM(config=config)
model = RobertaForMaskedLM.from_pretrained('runs/esperberto/run_7/model')
print(f'model.num_parameters(): {model.num_parameters()}')

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=dataset_path / "oscar.eo.1000x10.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir=str(run_path),
    logging_dir=str(run_path),
Example #24
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(
    "trained_models/distilbako-base-akuapem-twi-cased",
    max_len=512,
    do_lower_case=True)
tokenizer.save_vocabulary(
    "trained_models/distilbako-base-asante-twi-uncased"
)  # saving pretrained tokenizer locally in case of asante

from transformers import RobertaForMaskedLM

#model = RobertaForMaskedLM(config=config) # from scratch, for Akuapem
model = RobertaForMaskedLM.from_pretrained(
    "trained_models/distilbako-base-akuapem-twi-cased"
)  # fine-tune from Akuapem weights, for Asante
print(model.num_parameters())

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="asante_twi_bible.txt",
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
Example #25
        '--evaluate_during_training',
        '--do_train',
        '--do_eval',
    ])

training_args.val_datapath = [
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/1_5001.txt',
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/5001_10001.txt'
]
training_args.train_datapath = [
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/10001_15001.txt',
    '/media/txguo/866225e9-e15c-2d46-aba5-1ce1c0452e49/download_pdf/another_samples/15001_20001.txt'
]

if __name__ == "__main__":
    roberta_base = RobertaForMaskedLM.from_pretrained(
        './pretrained_model/roberta_chinese_base')
    roberta_base_tokenizer = BertTokenizerFast.from_pretrained(
        './pretrained_model/roberta_chinese_base')
    # logger.info('Evaluating roberta-base (seqlen: 512) for reference ...')
    # pretrain_and_evaluate(training_args, roberta_base, roberta_base_tokenizer, eval_only=True, model_path=None)

    model_path = f'{training_args.output_dir}/roberta-base-{model_args.max_pos}'
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    logger.info(
        f'Converting roberta-base into roberta-base-{model_args.max_pos}')
    model, tokenizer = create_long_model(
        save_model_to=model_path,
        attention_window=model_args.attention_window,
        max_pos=model_args.max_pos)
Example #26
    def __init__(self,
                 model="bert",
                 postag_file='saved_objects/postag_dict_all+VBN.p',
                 syllables_file='saved_objects/cmudict-0.7b.txt',
                 extra_stress_file='saved_objects/edwins_extra_stresses.txt',
                 top_file='saved_objects/words/top_words.txt',
                 templates_file='poems/scenery_templates.txt',
                 mistakes_file='saved_objects/mistakes.txt'):

        #self.templates = [("FROM scJJS scNNS PRP VBZ NN", "0_10_10_1_01_01"),
        #                 ("THAT scJJ scNN PRP VBD MIGHT RB VB", "0_10_10_1_0_10_1"),
        #                ("WHERE ALL THE scNNS OF PRP$ JJ NNS", "0_1_0_10_1_0_10_1"),
        #               ("AND THAT JJ WHICH RB VBZ NN", "0_1_01_0_10_1_01")]
        with open(templates_file) as tf:
            self.templates = [(" ".join(line.split()[:-1]), line.split()[-1])
                              for line in tf.readlines()]
        with open(postag_file, 'rb') as f:
            self.postag_dict = pickle.load(f)
        self.pos_to_words, self.words_to_pos = helper.get_pos_dict(
            postag_file, mistakes_file=mistakes_file)

        self.special_words = helper.get_finer_pos_words()

        self.dict_meters = helper.create_syll_dict([syllables_file],
                                                   extra_stress_file)

        if model == "bert":
            self.lang_model = BertForMaskedLM.from_pretrained(
                'bert-base-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.lang_vocab = list(self.tokenizer.vocab.keys())
            self.lang_model.eval()
            self.vocab_to_num = self.tokenizer.vocab

        elif model == "roberta":
            self.lang_model = RobertaForMaskedLM.from_pretrained(
                'roberta-base')  # 'roberta-base'
            self.tokenizer = RobertaTokenizer.from_pretrained(
                'roberta-base')  # 'roberta-large'
            with open("saved_objects/roberta/vocab.json") as json_file:
                j = json.load(json_file)
            self.lang_vocab = list(j.keys())
            self.lang_model.eval()
            self.vocab_to_num = {
                self.lang_vocab[x]: x
                for x in range(len(self.lang_vocab))
            }

        else:
            self.lang_model = None

        self.poems = pd.read_csv('poems/kaggle_poem_dataset.csv')['Content']

        with open(top_file) as tf:
            self.top_common_words = [line.strip()
                                     for line in tf.readlines()][:125]

        self.stemmer = PorterStemmer()

        self.api_url = 'https://api.datamuse.com/words'

        self.gender = random.choice([["he", "him", "his", "himself"],
                                     ["she", "her", "hers", "herself"]])
Example #27
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
Example #28
def evaluate(args):
    """
    Evaluate a masked language model using CrowS-Pairs dataset.
    """

    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)

    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "scibert-bert":
        tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
        model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')
        uncased = True
    elif args.lm_model == "biobert-bert":
        tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
        model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1')
        uncased = True
    elif args.lm_model == "scibert-roberta":
        tokenizer = RobertaTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
        model = RobertaForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')
        uncased = True
    elif args.lm_model == "biobert-roberta":
        tokenizer = RobertaTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
        model = RobertaForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True

    model.eval()
    model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": uncased
          }

    # score each sentence.
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less',
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])

    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                        }, ignore_index=True)

    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:', round(antistereo_score / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()
Example #29
# %%
import torch
import string

from transformers import RobertaTokenizer, RobertaForMaskedLM
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])


def encode(tokenizer, text_sentence, add_special_tokens=True):
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # if <mask> is the last token, append a "." so that models don't predict punctuation.
    if tokenizer.mask_token == text_sentence.split()[-1]:
        text_sentence += ' .'

    input_ids = torch.tensor([
        tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)
    ])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
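

# Hedged usage sketch: one way the decode helper and the RoBERTa model loaded
# at the top of this example are typically combined for top-k <mask> prediction.
# The function name `get_roberta_predictions` is hypothetical; the sentence is
# re-encoded inline rather than relying on the encode helper above.
def get_roberta_predictions(text_sentence, top_clean=5):
    input_ids = torch.tensor([
        roberta_tokenizer.encode(
            text_sentence.replace('<mask>', roberta_tokenizer.mask_token),
            add_special_tokens=True)
    ])
    mask_idx = torch.where(input_ids == roberta_tokenizer.mask_token_id)[1].tolist()[0]
    with torch.no_grad():
        logits = roberta_model(input_ids)[0]
    # top-k token ids at the mask position, decoded with the helper above
    pred_idx = logits[0, mask_idx].topk(k=top_k).indices.tolist()
    return decode(roberta_tokenizer, pred_idx, top_clean)


# e.g. get_roberta_predictions('Paris is the <mask> of France.')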
Example #30
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

export_model_path = os.path.join(output_dir, "phobert-base-formaskedlm.onnx")
model_name_or_path = "vinai/phobert-base"
cache_dir = "./cache_models"
enable_overwrite = True

tokenizer = PhobertTokenizer.from_pretrained(model_name_or_path)
input_ids = torch.tensor(
    tokenizer.encode("Hôm nay trời <mask> quá",
                     add_special_tokens=True)).unsqueeze(0)

ort_session = onnxruntime.InferenceSession(export_model_path)

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(input_ids)}
ort_out = ort_session.run(None, ort_inputs)
print(len(ort_out[0][0][5]))

config = RobertaConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model = RobertaForMaskedLM.from_pretrained(model_name_or_path,
                                           config=config,
                                           cache_dir=cache_dir)
with torch.no_grad():
    out = model(input_ids)

print("***** Verifying correctness *****")
print('PyTorch and ONNX Runtime output are close: {}'.format(
    np.allclose(to_numpy(out[0]), ort_out[0], rtol=1e-03, atol=1e-04)))