    def __init__(self,
                 pretrained_path,
                 n_labels,
                 hidden_size,
                 dropout_p,
                 label_ignore_idx=0,
                 head_init_range=0.04,
                 device='cuda'):
        super().__init__()

        self.n_labels = n_labels

        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)

        self.label_ignore_idx = label_ignore_idx

        self.xlmr = XLMRModel.from_pretrained(pretrained_path)
        self.model = self.xlmr.model
        self.dropout = nn.Dropout(dropout_p)

        self.device = device

        # initializing classification head
        self.classification_head.weight.data.normal_(mean=0.0,
                                                     std=head_init_range)
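For reference, a minimal sketch of the forward pass this head implies (a sketch only: it assumes torch.nn.functional is imported as F and uses fairseq's features_only convention; the original repository may differ in how sub-token positions are masked):

    def forward(self, inputs_ids, labels=None):
        # fairseq's RobertaModel returns (features, extra) when features_only=True;
        # features has shape (batch, seq_len, hidden_size)
        transformer_out, _ = self.model(inputs_ids, features_only=True)
        out_1 = F.relu(self.linear_1(self.dropout(transformer_out)))
        logits = self.classification_head(self.dropout(out_1))
        if labels is not None:
            # label_ignore_idx keeps padding / ignored positions out of the loss
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
            return loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
        return logits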
Example #2
def convert_fairseq_model(args):
    if not args.save_dir:
        args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    fairseq_xlmr = fairseq_XLMRModel.from_pretrained(
        args.fairseq_model_path, checkpoint_file='model.pt')
    vocab_size = convert_vocab(args, fairseq_xlmr)

    gluon_cfg = convert_config(fairseq_xlmr.args, vocab_size,
                               XLMRModel.get_cfg().clone())
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())

    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()

    gluon_xlmr = convert_params(fairseq_xlmr, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_xlmr, gluon_xlmr, args.gpu)

    gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'),
                               deduplicate=True)
    logging.info('Converted the RoBERTa MLM model from {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model_mlm.params')))
    gluon_xlmr.backbone_model.save_parameters(
        os.path.join(args.save_dir, 'model.params'), deduplicate=True)
    logging.info('Converted the RoBERTa backbone model from {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model.params')))

    logging.info('Conversion finished!')
    logging.info('Statistics:')
    rename(args.save_dir)
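A hypothetical invocation of this converter (the script name and flag spellings are guesses inferred from the args attributes used above, not confirmed by the source):

    python convert_fairseq_xlmr.py --fairseq_model_path xlmr.base \
        --save_dir xlmr.base_gluon --gpu 0 --test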
Example #3
class TestXLMRTextEncoder(unittest.TestCase):
    download_file_maybe_extract(
        "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz",
        directory=os.environ["HOME"] + "/.cache/torch/unbabel_comet/",
        check_files=["xlmr.base/model.pt"],
    )
    xlmr = XLMRModel.from_pretrained(
        os.environ["HOME"] + "/.cache/torch/unbabel_comet/xlmr.base",
        checkpoint_file="model.pt",
    )
    original_vocab = xlmr.task.source_dictionary.__dict__["indices"]
    tokenizer = XLMRTextEncoder(xlmr.encode, original_vocab)

    def test_unk_property(self):
        self.assertEqual(self.tokenizer.unk_index, self.original_vocab["<unk>"])

    def test_pad_property(self):
        self.assertEqual(self.tokenizer.padding_index, self.original_vocab["<pad>"])

    def test_bos_property(self):
        self.assertEqual(self.tokenizer.bos_index, self.original_vocab["<s>"])

    def test_eos_property(self):
        self.assertEqual(self.tokenizer.eos_index, self.original_vocab["</s>"])

    def test_mask_property(self):
        self.assertEqual(self.tokenizer.mask_index, self.original_vocab["<mask>"])

    def test_vocab_property(self):
        self.assertEqual(self.tokenizer.vocab, self.original_vocab)

    def test_vocab_size_property(self):
        self.assertEqual(self.tokenizer.vocab_size, len(self.original_vocab))

    def test_encode(self):
        sentence = "Hello, my dog is cute"
        expected = self.xlmr.encode(sentence)
        result = self.tokenizer.encode(sentence)
        self.assertTrue(torch.equal(expected, result))
        # Make sure the bos and eos tokens were added.
        self.assertEqual(result[0], self.tokenizer.bos_index)
        self.assertEqual(result[-1], self.tokenizer.eos_index)

    def test_batch_encode(self):
        # Test batch_encode.
        batch = ["Hello, my dog is cute", "hello world!"]
        encoded_batch, lengths = self.tokenizer.batch_encode(batch)

        self.assertTrue(torch.equal(encoded_batch[0], self.tokenizer.encode(batch[0])))
        self.assertTrue(
            torch.equal(encoded_batch[1][: lengths[1]], self.tokenizer.encode(batch[1]))
        )
        self.assertEqual(lengths[0], len(self.xlmr.encode("Hello, my dog is cute")))
        self.assertEqual(lengths[1], len(self.xlmr.encode("hello world!")))

        # Check if last sentence is padded.
        self.assertEqual(encoded_batch[1][-1], self.tokenizer.padding_index)
        self.assertEqual(encoded_batch[1][-2], self.tokenizer.padding_index)
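These tests run under the standard unittest entry point, for example by adding the usual guard at module level:

    if __name__ == "__main__":
        unittest.main()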
Example #4
    def __init__(self, pretrained_path, hidden_size, dropout_p, device='cuda'):
        super().__init__()

        self.xlmr = XLMRModel.from_pretrained(pretrained_path)
        self.model = self.xlmr.model
        self.dropout = nn.Dropout(dropout_p)

        self.device = device
Example #5
    def __init__(self, pretrained_model_name_or_path):
        super().__init__()
        print("loading model from local:",
              os.path.join(os.getenv("TRAINED_MODEL_DIR"),
                           pretrained_model_name_or_path))
        self.xlmr = XLMRModel.from_pretrained(
            os.path.join(os.getenv("TRAINED_MODEL_DIR"),
                         pretrained_model_name_or_path),
            checkpoint_file='model.pt')
Example #6
    def __init__(self, textpath='/text/asr_result.txt'):
        self.textpath = os.path.dirname(os.path.realpath(__file__)) + textpath

        self.xlmr = XLMRModel.from_pretrained('xlmr.base',
                                              checkpoint_file='model.pt')
        self.xlmr = self.xlmr.to(DEVICE)
        self.xlmr.eval()
        print('xlmr base model loaded.')

        with open(self.textpath, 'rt', encoding='utf-8') as rf:
            self.data = rf.readlines()
Example #7
    def __init__(self,
                 urls: Union[str, List[str]],
                 vocab_path: str = '',
                 compare: bool = False,
                 labeled: bool = False,
                 anomalies: bool = False,
                 raw: bool = False) -> None:
        """

        Parameters
        ----------
        urls: Union[str, List[str]]
            Path or paths to pass to webdataset.dataset.ShardList. Points to Censored Planet .tar data files.
        vocab_path: str
            Path to a pickle file holding a dictionary that maps tokens produced
            by fairseq.models.roberta.model_xlmr.XLMRModel to the indices used
            when flattening data.
        compare: bool
            Should data be compared with Censored Planet blockpage signatures?
        labeled: bool
            Should only data successfully processed by the blockpage matcher be returned?
        anomalies: bool
            Should only data marked by Censored Planet as an anomaly be processed?
        raw: bool
            Should the raw row be returned without processing into vectors?
        """
        super().__init__()

        assert urls is not None, "Must supply a url as a string or list of strings"

        self.__shards = ShardList(urls)
        self.__blockpage_matcher = BlockpageMatcher()
        self.__labeled = labeled
        self.__compare = labeled or compare
        self.__anomalies = anomalies
        self.__raw = raw
        if not self.__raw:
            # Bring in the MMDB free database.
            self.__ip2geo = geoip2.database.Reader('./mmdb/country.mmdb')
            # Bring in the pretrained XLMR model.
            self.__xlmr = XLMRModel.from_pretrained('/data/xlmr.large',
                                                    checkpoint_file='model.pt')
            self.__xlmr.eval()
            self.__vocab_path = vocab_path
            try:
                with open(vocab_path, 'rb') as retrieved_dict:
                    self.__vocab = pickle.load(retrieved_dict)
            except OSError:
                self.__vocab = dict()
            self.__vocab_next = len(self.__vocab)
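The __vocab / __vocab_next pair above implies a grow-on-miss token index; a hypothetical helper (not part of the original class) illustrating that pattern:

    def __token_to_index(self, token: str) -> int:
        # Unseen tokens get the next free index and are remembered, so the
        # pickled vocabulary stays consistent across runs.
        if token not in self.__vocab:
            self.__vocab[token] = self.__vocab_next
            self.__vocab_next += 1
        return self.__vocab[token]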
Example #8
def create_pretrained(model_type, force_download=False):
    """
    Downloads and creates the pretrained assets.
    """
    cache_dir = join(PROJECT_DIR, '.cache')

    os.makedirs(cache_dir, exist_ok=True)

    model_dir = join(cache_dir, model_type)

    if not exists(model_dir) or force_download:
        download(model_type, cache_dir)

    xlmr = XLMRModel.from_pretrained(model_dir, checkpoint_file='model.pt')

    return xlmr
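A usage sketch, assuming the download() helper above fetches the matching fairseq archive into the cache directory:

    xlmr = create_pretrained('xlmr.base')
    xlmr.eval()  # disable dropout before extracting features
    tokens = xlmr.encode('Hello world!')
    features = xlmr.extract_features(tokens)  # shape: (1, seq_len, hidden_size)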
Example #9
    @classmethod
    def from_pretrained(cls, hparams: HyperOptArgumentParser, lm_head: bool = False):
        if not os.path.exists("pretrained/"):
            os.mkdir("pretrained/")

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "xlmr.base":
            download_file_maybe_extract(
                XLMR_BASE_URL,
                directory="pretrained",
                check_files=[XLMR_BASE_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large":
            download_file_maybe_extract(
                XLMR_LARGE_URL,
                directory="pretrained",
                check_files=[XLMR_LARGE_MODEL_NAME],
            )
        elif pretrained_model == "xlmr.base.v0":
            download_file_maybe_extract(
                XLMR_BASE_V0_URL,
                directory="pretrained",
                check_files=[XLMR_BASE_V0_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large.v0":
            download_file_maybe_extract(
                XLMR_LARGE_V0_URL,
                directory="pretrained",
                check_files=[XLMR_LARGE_V0_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid XLM-R model.")

        xlmr = XLMRModel.from_pretrained(
            "pretrained/" + pretrained_model, checkpoint_file="model.pt"
        )
        xlmr.eval()
        tokenizer = RoBERTaTextEncoder(
            xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"]
        )
        return XLMRoBERTa(
            xlmr=xlmr, tokenizer=tokenizer, hparams=hparams, lm_head=lm_head
        )
Example #10
    @classmethod
    def from_pretrained(cls, hparams: Namespace):
        if not os.path.exists(saving_directory):
            os.makedirs(saving_directory)

        pretrained_model = hparams.pretrained_model
        if pretrained_model == "xlmr.base":
            download_file_maybe_extract(
                XLMR_BASE_URL,
                directory=saving_directory,
                check_files=[XLMR_BASE_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large":
            download_file_maybe_extract(
                XLMR_LARGE_URL,
                directory=saving_directory,
                check_files=[XLMR_LARGE_MODEL_NAME],
            )
        elif pretrained_model == "xlmr.base.v0":
            download_file_maybe_extract(
                XLMR_BASE_V0_URL,
                directory=saving_directory,
                check_files=[XLMR_BASE_V0_MODEL_NAME],
            )

        elif pretrained_model == "xlmr.large.v0":
            download_file_maybe_extract(
                XLMR_LARGE_V0_URL,
                directory=saving_directory,
                check_files=[XLMR_LARGE_V0_MODEL_NAME],
            )
        else:
            raise Exception(f"{pretrained_model} is an invalid XLM-R model.")

        xlmr = XLMRModel.from_pretrained(saving_directory + pretrained_model,
                                         checkpoint_file="model.pt")
        # xlmr.eval()
        tokenizer = XLMRTextEncoder(
            xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"])
        return XLMREncoder(xlmr=xlmr, tokenizer=tokenizer, hparams=hparams)
Example #11
        detok_labels.append(1 if sum(labels) > 0 else 0)
        token = " ".join(atom).replace('\u2581', ' ').replace(' ', '')
        detoks.append(token)
    assert len(sent_detoks) == len(detok_labels)
    # assert " ".join(detoks) == " ".join(sent_detoks)
    return detok_labels
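
# Note: '\u2581' (▁) is sentencepiece's word-boundary marker, which is why the
# detokenization above joins pieces and strips the marker to recover surface
# tokens, e.g. ''.join(['▁Hel', 'lo']).replace('\u2581', '') == 'Hello'.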

# batch size
bsz = 100

for model in models:
    print(model)

    xlmr = XLMRModel.from_pretrained(
        model,
        checkpoint_file='checkpoint.pt',
        data_name_or_path=datapath
    )

    raw = True
    print("Loaded the model!")
    xlmr.cuda()
    xlmr.eval()
    max_positions = xlmr.model.max_positions()

    for use_ref in [0,]:
        print(f"use ref = {use_ref}")

        for prefix, test_dir in zip(test_prefix, test_dirs):
            print(prefix, test_dir)
            log_name = os.path.join(opt_dir, "use_ref_{}_{}.log".format(use_ref, prefix.lower()))
Example #12
import pdb
import csv

import torch
from torch.nn.utils.rnn import pad_sequence

##### FOR XLMR Model

from fairseq.models.roberta import XLMRModel

if torch.cuda.is_available():
    map_location = lambda storage, loc: storage.cuda()
else:
    map_location = 'cpu'

# model = torch.load('./xlmr.large/XLMR_BBC.pickle', map_location=map_location)
# model.eval()
xlmr = XLMRModel.from_pretrained('xlmr.large', checkpoint_file='model.pt')
xlmr.eval()


def get_emb(sentence):
    hi_tokens = xlmr.encode(sentence)
    f = xlmr.extract_features(hi_tokens)
    return f.squeeze(0)
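
# Quick usage check for get_emb (arbitrary example sentence; xlmr.large yields
# one 1024-dimensional vector per subword token):
#   emb = get_emb('Hello, world!')
#   print(emb.shape)  # torch.Size([seq_len, 1024])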


def extract_from_file(data_file, max_sents):
    data = {'lbls': [], 'text': []}
    label_to_int = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}

    t = []
    l = []
Example #13
    def __init__(self, model):
        print("loading model for tokenizer:",
              os.path.join(os.getenv("TRAINED_MODEL_DIR"), model))
        self.roberta = XLMRModel.from_pretrained(
            os.path.join(os.getenv("TRAINED_MODEL_DIR"), model),
            checkpoint_file='model.pt')
Example #14
from fairseq.models.roberta import XLMRModel
from pprint import pprint
import torch
import os
import sentencepiece as spm

pretrained_path = './model-bin/cased/'

# load the sentencepiece model separately, for checking purposes
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(pretrained_path, 'sentencepiece.bpe.model'))

# Load the RoBERTa model; this already includes loading the sentencepiece model
roberta = XLMRModel.from_pretrained(pretrained_path,
                                    checkpoint_file='model.pt')
roberta.eval()  # disable dropout (or leave in train mode to finetune)
print(roberta)

text_input = 'Đại học Bách Khoa Hà Nội.'

# Encode using roberta class
tokens_ids = roberta.encode(text_input)
assert tokens_ids.tolist() == [0, 451, 71, 3401, 1384, 168, 234, 5, 2]
# Tokenize using sentencepiece directly
tokens_text = sp.encode_as_pieces(text_input)
assert tokens_text == ['▁Đại', '▁học', '▁Bách', '▁Khoa', '▁Hà', '▁Nội', '.']
assert roberta.decode(tokens_ids) == text_input

print(tokens_ids)
print(tokens_text)
print(roberta.decode(tokens_ids))
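
# The hub interface can also return contextual token features
# (extract_features is part of fairseq's RobertaHubInterface):
features = roberta.extract_features(tokens_ids)
print(features.shape)  # torch.Size([1, num_tokens, hidden_size])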