def __init__(self, pretrained_path, n_labels, hidden_size, dropout_p,
             label_ignore_idx=0, head_init_range=0.04, device='cuda'):
    super().__init__()
    self.n_labels = n_labels
    self.linear_1 = nn.Linear(hidden_size, hidden_size)
    self.classification_head = nn.Linear(hidden_size, n_labels)
    self.label_ignore_idx = label_ignore_idx
    self.xlmr = XLMRModel.from_pretrained(pretrained_path)
    self.model = self.xlmr.model
    self.dropout = nn.Dropout(dropout_p)
    self.device = device
    # Initialize the classification head weights.
    self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
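# A minimal sketch of the forward pass this head implies; `forward` is a
# hypothetical companion method (the rest of the class body is not shown here),
# and it assumes `torch.nn.functional` is imported as `F` and that `inputs_ids`
# is a padded LongTensor of token ids on the model's device.
def forward(self, inputs_ids):
    # Last-layer token representations from XLM-R: (batch, seq_len, hidden_size).
    transformer_out = self.xlmr.extract_features(inputs_ids)
    out_1 = F.relu(self.linear_1(self.dropout(transformer_out)))
    # Per-token logits over the label set: (batch, seq_len, n_labels).
    logits = self.classification_head(self.dropout(out_1))
    return logits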
def convert_fairseq_model(args):
    if not args.save_dir:
        args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    fairseq_xlmr = fairseq_XLMRModel.from_pretrained(
        args.fairseq_model_path, checkpoint_file='model.pt')
    vocab_size = convert_vocab(args, fairseq_xlmr)
    gluon_cfg = convert_config(fairseq_xlmr.args, vocab_size,
                               XLMRModel.get_cfg().clone())
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())
    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
    gluon_xlmr = convert_params(fairseq_xlmr, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_xlmr, gluon_xlmr, args.gpu)
    gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'),
                               deduplicate=True)
    logging.info('Convert the RoBERTa MLM model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model_mlm.params')))
    gluon_xlmr.backbone_model.save_parameters(
        os.path.join(args.save_dir, 'model.params'), deduplicate=True)
    logging.info('Convert the RoBERTa backbone model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model.params')))
    logging.info('Conversion finished!')
    logging.info('Statistics:')
    rename(args.save_dir)
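# A hedged sketch of the command-line entry point this converter implies; the
# flag names below are assumptions inferred from the attributes used above
# (fairseq_model_path, save_dir, gpu, test), not the script's actual interface.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert a fairseq XLM-R checkpoint to Gluon.')
    parser.add_argument('--fairseq_model_path', required=True)
    parser.add_argument('--save_dir', default=None)
    parser.add_argument('--gpu', type=int, default=None)
    parser.add_argument('--test', action='store_true')
    convert_fairseq_model(parser.parse_args())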
class TestXLMRTextEncoder(unittest.TestCase):
    download_file_maybe_extract(
        "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz",
        directory=os.environ["HOME"] + "/.cache/torch/unbabel_comet/",
        check_files=["xlmr.base/model.pt"],
    )
    xlmr = XLMRModel.from_pretrained(
        os.environ["HOME"] + "/.cache/torch/unbabel_comet/xlmr.base",
        checkpoint_file="model.pt",
    )
    original_vocab = xlmr.task.source_dictionary.__dict__["indices"]
    tokenizer = XLMRTextEncoder(xlmr.encode, original_vocab)

    def test_unk_property(self):
        self.assertEqual(self.tokenizer.unk_index, self.original_vocab["<unk>"])

    def test_pad_property(self):
        self.assertEqual(self.tokenizer.padding_index, self.original_vocab["<pad>"])

    def test_bos_property(self):
        self.assertEqual(self.tokenizer.bos_index, self.original_vocab["<s>"])

    def test_eos_property(self):
        self.assertEqual(self.tokenizer.eos_index, self.original_vocab["</s>"])

    def test_mask_property(self):
        self.assertEqual(self.tokenizer.mask_index, self.original_vocab["<mask>"])

    def test_vocab_property(self):
        self.assertEqual(self.tokenizer.vocab, self.original_vocab)

    def test_vocab_size_property(self):
        self.assertEqual(self.tokenizer.vocab_size, len(self.original_vocab))

    def test_encode(self):
        sentence = "Hello, my dog is cute"
        expected = self.xlmr.encode(sentence)
        result = self.tokenizer.encode(sentence)
        self.assertTrue(torch.equal(expected, result))
        # Make sure the bos and eos tokens were added.
        self.assertEqual(result[0], self.tokenizer.bos_index)
        self.assertEqual(result[-1], self.tokenizer.eos_index)

    def test_batch_encode(self):
        batch = ["Hello, my dog is cute", "hello world!"]
        encoded_batch, lengths = self.tokenizer.batch_encode(batch)
        self.assertTrue(torch.equal(encoded_batch[0], self.tokenizer.encode(batch[0])))
        self.assertTrue(
            torch.equal(encoded_batch[1][: lengths[1]], self.tokenizer.encode(batch[1]))
        )
        self.assertEqual(lengths[0], len(self.xlmr.encode("Hello, my dog is cute")))
        self.assertEqual(lengths[1], len(self.xlmr.encode("hello world!")))
        # Check that the shorter sentence is padded.
        self.assertEqual(encoded_batch[1][-1], self.tokenizer.padding_index)
        self.assertEqual(encoded_batch[1][-2], self.tokenizer.padding_index)
def __init__(self, pretrained_path, hidden_size, dropout_p, device='cuda'):
    super().__init__()
    self.xlmr = XLMRModel.from_pretrained(pretrained_path)
    self.model = self.xlmr.model
    self.dropout = nn.Dropout(dropout_p)
    self.device = device
def __init__(self, pretrained_model_name_or_path):
    super().__init__()
    model_path = os.path.join(os.getenv("TRAINED_MODEL_DIR"),
                              pretrained_model_name_or_path)
    print("loading model from local path:", model_path)
    self.xlmr = XLMRModel.from_pretrained(model_path, checkpoint_file='model.pt')
def __init__(self, textpath='/text/asr_result.txt'):
    self.textpath = os.path.dirname(os.path.realpath(__file__)) + textpath
    self.xlmr = XLMRModel.from_pretrained('xlmr.base', checkpoint_file='model.pt')
    self.xlmr = self.xlmr.to(DEVICE)
    self.xlmr.eval()
    print('xlmr base model loaded.')
    with open(self.textpath, 'rt', encoding='utf-8') as rf:
        self.data = rf.readlines()
def __init__(self,
             urls: Union[str, List[str]],
             vocab_path: str = '',
             compare: bool = False,
             labeled: bool = False,
             anomalies: bool = False,
             raw: bool = False) -> None:
    """
    Parameters
    ----------
    urls: Union[str, List[str]]
        Path or paths to pass to webdataset.dataset.ShardList. Points to
        Censored Planet .tar data files.
    vocab_path: str
        Path to a .pyc file holding a dictionary that maps an index sequence
        to the tokens used by fairseq.models.roberta.model_xlmr.XLMRModel
        when flattening data.
    compare: bool
        Should data be compared with Censored Planet blockpage signatures?
    labeled: bool
        Should only data successfully processed by the blockpage matcher be
        returned?
    anomalies: bool
        Should only data marked by Censored Planet as an anomaly be processed?
    raw: bool
        Should the raw row be returned without processing into vectors?
    """
    super().__init__()
    assert urls is not None, "Must supply a url as a string or list of strings"
    self.__shards = ShardList(urls)
    self.__blockpage_matcher = BlockpageMatcher()
    self.__labeled = labeled
    self.__compare = labeled or compare
    self.__anomalies = anomalies
    self.__raw = raw
    if not self.__raw:
        # Bring in the MMDB free database.
        self.__ip2geo = geoip2.database.Reader('./mmdb/country.mmdb')
        # Bring in the pretrained XLMR model.
        self.__xlmr = XLMRModel.from_pretrained('/data/xlmr.large',
                                                checkpoint_file='model.pt')
        self.__xlmr.eval()
        self.__vocab_path = vocab_path
        try:
            with open(vocab_path, 'rb') as retrieved_dict:
                self.__vocab = pickle.load(retrieved_dict)
        except OSError:
            self.__vocab = dict()
        self.__vocab_next = len(self.__vocab)
def create_pretrained(model_type, force_download=False):
    """Downloads and creates the pretrained assets."""
    cache_dir = join(PROJECT_DIR, '.cache')
    os.makedirs(cache_dir, exist_ok=True)
    model_dir = join(cache_dir, model_type)
    if not exists(model_dir) or force_download:
        download(model_type, cache_dir)
    xlmr = XLMRModel.from_pretrained(model_dir, checkpoint_file='model.pt')
    return xlmr
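# A minimal usage sketch for create_pretrained, assuming 'xlmr.base' is a
# model_type that the download() helper above knows how to fetch; encode and
# extract_features are the standard fairseq hub-interface calls.
xlmr = create_pretrained('xlmr.base')
tokens = xlmr.encode('Hello world!')      # LongTensor of subword ids, with bos/eos
features = xlmr.extract_features(tokens)  # (1, seq_len, hidden_size)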
@classmethod
def from_pretrained(cls, hparams: HyperOptArgumentParser, lm_head: bool = False):
    if not os.path.exists("pretrained/"):
        os.mkdir("pretrained/")
    pretrained_model = hparams.pretrained_model
    if pretrained_model == "xlmr.base":
        download_file_maybe_extract(
            XLMR_BASE_URL,
            directory="pretrained",
            check_files=[XLMR_BASE_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.large":
        download_file_maybe_extract(
            XLMR_LARGE_URL,
            directory="pretrained",
            check_files=[XLMR_LARGE_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.base.v0":
        download_file_maybe_extract(
            XLMR_BASE_V0_URL,
            directory="pretrained",
            check_files=[XLMR_BASE_V0_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.large.v0":
        download_file_maybe_extract(
            XLMR_LARGE_V0_URL,
            directory="pretrained",
            check_files=[XLMR_LARGE_V0_MODEL_NAME],
        )
    else:
        raise Exception(f"{pretrained_model} is an invalid XLM-R model.")
    xlmr = XLMRModel.from_pretrained(
        "pretrained/" + pretrained_model, checkpoint_file="model.pt"
    )
    xlmr.eval()
    tokenizer = RoBERTaTextEncoder(
        xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"]
    )
    return XLMRoBERTa(xlmr=xlmr, tokenizer=tokenizer, hparams=hparams, lm_head=lm_head)
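# A hedged usage sketch for the factory above; it assumes XLMRoBERTa is the
# enclosing class (it is the type constructed in the return statement) and
# that, for this call, hparams only needs a `pretrained_model` attribute
# naming one of the four checkpoints handled above.
from argparse import Namespace

model = XLMRoBERTa.from_pretrained(Namespace(pretrained_model="xlmr.base"))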
@classmethod
def from_pretrained(cls, hparams: Namespace):
    if not os.path.exists(saving_directory):
        os.makedirs(saving_directory)
    pretrained_model = hparams.pretrained_model
    if pretrained_model == "xlmr.base":
        download_file_maybe_extract(
            XLMR_BASE_URL,
            directory=saving_directory,
            check_files=[XLMR_BASE_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.large":
        download_file_maybe_extract(
            XLMR_LARGE_URL,
            directory=saving_directory,
            check_files=[XLMR_LARGE_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.base.v0":
        download_file_maybe_extract(
            XLMR_BASE_V0_URL,
            directory=saving_directory,
            check_files=[XLMR_BASE_V0_MODEL_NAME],
        )
    elif pretrained_model == "xlmr.large.v0":
        download_file_maybe_extract(
            XLMR_LARGE_V0_URL,
            directory=saving_directory,
            check_files=[XLMR_LARGE_V0_MODEL_NAME],
        )
    else:
        raise Exception(f"{pretrained_model} is an invalid XLM-R model.")
    xlmr = XLMRModel.from_pretrained(saving_directory + pretrained_model,
                                     checkpoint_file="model.pt")
    # xlmr.eval()
    tokenizer = XLMRTextEncoder(
        xlmr.encode, xlmr.task.source_dictionary.__dict__["indices"])
    return XLMREncoder(xlmr=xlmr, tokenizer=tokenizer, hparams=hparams)
        # Tail of a detokenization helper (the enclosing definition and loop
        # are elided in this excerpt).
        detok_labels.append(1 if sum(labels) > 0 else 0)
        token = " ".join(atom).replace('\u2581', ' ').replace(' ', '')
        detoks.append(token)
    assert len(sent_detoks) == len(detok_labels)
    # assert " ".join(detoks) == " ".join(sent_detoks)
    return detok_labels


# Batch size.
bsz = 100

for model in models:
    print(model)
    xlmr = XLMRModel.from_pretrained(
        model, checkpoint_file='checkpoint.pt', data_name_or_path=datapath
    )
    raw = True
    print("Loaded the model!")
    xlmr.cuda()
    xlmr.eval()
    max_positions = xlmr.model.max_positions()
    for use_ref in [0]:
        print(f"use ref = {use_ref}")
        for prefix, test_dir in zip(test_prefix, test_dirs):
            print(prefix, test_dir)
            log_name = os.path.join(
                opt_dir, "use_ref_{}_{}.log".format(use_ref, prefix.lower()))
import pdb
import csv

from torch.nn.utils.rnn import pad_sequence

# For the XLMR model.
from fairseq.models.roberta import XLMRModel

if torch.cuda.is_available():
    map_location = lambda storage, loc: storage.cuda()
else:
    map_location = 'cpu'

# model = torch.load('./xlmr.large/XLMR_BBC.pickle', map_location=map_location)
# model.eval()
xlmr = XLMRModel.from_pretrained('xlmr.large', checkpoint_file='model.pt')
xlmr.eval()


def get_emb(sentence):
    hi_tokens = xlmr.encode(sentence)
    f = xlmr.extract_features(hi_tokens)
    return f.squeeze(0)


def extract_from_file(data_file, max_sents):
    data = {'lbls': [], 'text': []}
    label_to_int = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
    t = []
    l = []
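# A quick sanity check for get_emb above, left commented out because the
# snippet's extract_from_file is truncated; it assumes the standard fairseq
# hub API, where extract_features returns (1, seq_len, hidden) before the
# squeeze, so for xlmr.large the result is (seq_len, 1024).
# emb = get_emb('hello world')
# print(emb.shape)  # torch.Size([seq_len, 1024])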
def __init__(self, model):
    model_path = os.path.join(os.getenv("TRAINED_MODEL_DIR"), model)
    print("loading model for tokenizer:", model_path)
    self.roberta = XLMRModel.from_pretrained(model_path, checkpoint_file='model.pt')
from fairseq.models.roberta import XLMRModel
from pprint import pprint
import torch
import os

import sentencepiece as spm

pretrained_path = './model-bin/cased/'

# Load the sentence piece model directly, for checking purposes.
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(pretrained_path, 'sentencepiece.bpe.model'))

# Load the RoBERTa model; this already loads the sentence piece model internally.
roberta = XLMRModel.from_pretrained(pretrained_path, checkpoint_file='model.pt')
roberta.eval()  # disable dropout (or leave in train mode to finetune)
print(roberta)

text_input = 'Đại học Bách Khoa Hà Nội.'

# Encode using the roberta class.
tokens_ids = roberta.encode(text_input)
assert tokens_ids.tolist() == [0, 451, 71, 3401, 1384, 168, 234, 5, 2]

# Tokenize using sentence piece directly.
tokens_text = sp.encode_as_pieces(text_input)
assert tokens_text == ['▁Đại', '▁học', '▁Bách', '▁Khoa', '▁Hà', '▁Nội', '.']
assert roberta.decode(tokens_ids) == text_input

print(tokens_ids)
print(tokens_text)
print(roberta.decode(tokens_ids))
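# Beyond encode/decode, the same hub interface exposes contextual features;
# a small follow-on sketch using the standard fairseq API (the hidden size
# depends on the checkpoint under ./model-bin/cased/).
features = roberta.extract_features(tokens_ids)
print(features.shape)  # (1, num_tokens, hidden_size)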