def __init__(self, device):
    super(RobertaTweetEmbedding, self).__init__(device=device)
    self.config = RobertaConfig.from_pretrained(
        '../data/models/BERTweet_base_transformers/config.json')
    self.model = RobertaModel.from_pretrained(
        '../data/models/BERTweet_base_transformers/model.bin',
        config=self.config)
    self.model.eval()  # disable dropout (or leave in train mode to finetune)
    self.model.to(self.device)
    self.pad_token_id = self.config.pad_token_id
    self.embedding_dim = self.model.config.hidden_size

    # Load BPE encoder
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--bpe-codes',
        default="../data/models/BERTweet_base_transformers/bpe.codes",
        required=False,
        type=str,
        help='path to fastBPE BPE')
    args = parser.parse_args()
    self.bpe = fastBPE(args)

    # Load the dictionary
    self.vocab = Dictionary()
    self.vocab.add_from_file(
        "../data/models/BERTweet_base_transformers/dict.txt")
def __init__(self, config: Bunch) -> None:
    pl.LightningModule.__init__(self)
    self.config = config
    bpe_codes_path = os.path.join(
        config.pretrained_model_base_path,
        "BERTweet_base_transformers/bpe.codes",
    )
    bpe = fastBPE(Namespace(bpe_codes=bpe_codes_path))
    vocab = Dictionary()
    vocab.add_from_file(
        os.path.join(
            config.pretrained_model_base_path,
            "BERTweet_base_transformers/dict.txt",
        ))
    tokenizer = BertweetTokenizer(self.config.max_tokens_per_tweet, bpe, vocab)
    self.data_processor = BertweetDataProcessor(config, tokenizer)
    model_config = RobertaConfig.from_pretrained(
        os.path.join(
            config.pretrained_model_base_path,
            "BERTweet_base_transformers/config.json",
        ))
    self.model = RobertaForSequenceClassification.from_pretrained(
        os.path.join(
            config.pretrained_model_base_path,
            "BERTweet_base_transformers/model.bin",
        ),
        config=model_config,
    )
    self.loss = CrossEntropyLoss()
def mark(line, masked_line):
    # Load PhoBERT-base in fairseq
    # from fairseq.models.roberta import RobertaModel
    # phobert_mask = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt')
    # phobert_mask.eval()  # disable dropout (or leave in train mode to finetune)

    # Incorporate the BPE encoder into PhoBERT-base
    args = parser_mask.parse_args()
    phobert_mask.bpe = fastBPE(args)

    # INPUT TEXT IS WORD-SEGMENTED!
    # line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

    # Extract the last layer's features
    subwords = phobert_mask.encode(line)
    last_layer_features = phobert_mask.extract_features(subwords)
    # assert last_layer_features.size() == torch.Size([1, 9, 768])

    # Extract all layers' features (layer 0 is the embedding layer)
    all_layers = phobert_mask.extract_features(subwords, return_all_hiddens=True)
    assert len(all_layers) == 13
    assert torch.all(all_layers[-1] == last_layer_features)

    # Fill in the masked position
    # masked_line = 'Tôi là <mask> trường đại_học Công_nghệ .'
    topk_filled_outputs = phobert_mask.fill_mask(masked_line, topk=1)
    return topk_filled_outputs
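# Usage sketch for mark(), assuming `phobert_mask` and `parser_mask` were set up
# as in the commented-out loading code above. fairseq's fill_mask() returns
# (filled sentence, score, predicted token) tuples, one per top-k candidate.
line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
masked_line = 'Tôi là <mask> trường đại_học Công_nghệ .'
for filled_line, score, predicted_token in mark(line, masked_line):
    print(filled_line, score, predicted_token)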
def __init__(self, data_dir, max_length=150, remove_negative_pair=True):
    super(VNNewsDataset, self).__init__()
    self.data_dir = data_dir
    self.max_length = max_length
    self.sentence_1 = open(os.path.join(self.data_dir, 'Sentences_1.txt'),
                           mode='r', encoding='utf-8-sig').read().split('\n')
    self.sentence_2 = open(os.path.join(self.data_dir, 'Sentences_2.txt'),
                           mode='r', encoding='utf-8-sig').read().split('\n')
    self.labels = open(os.path.join(self.data_dir, 'Labels.txt'),
                       mode='r', encoding='utf-8-sig').read().split('\n')
    self.bpe = fastBPE(BPEConfig)
    self.vocab = Dictionary()
    self.vocab.add_from_file(
        os.path.join(os.getcwd(), '../pretrained',
                     'PhoBERT_base_transformers', 'dict.txt'))
    self.rdr_segmenter = VnCoreNLP(
        os.path.join('../vncorenlp', 'VnCoreNLP-1.1.1.jar'),
        annotators='wseg',
        max_heap_size='-Xmx500m')
    if remove_negative_pair is True:
        self.remove_negative_pair()
def __init__(self, vncore=True):
    """
    Hacky way to run the VnCoreNLP tokenizer with PhoBERT
    :param vncore: Set it to `False` if your sentences are already tokenized by VnCoreNLP
    """
    self.dictionary = Dictionary.load(open(DICT_PATH))
    self.annotator = None
    self.vncore = vncore
    self.bpe = fastBPE(args)
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    # Load the BPE encoder
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False, type=str,
                        help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")
    tokenizer = bpe
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def __init__(self, pretrained_path='../input/bertweet-transformer-private/',
             parser=parser):
    self.bpe = fastBPE(args=parser.parse_args(args=[]))
    self.vocab = Dictionary()
    self.vocab.add_from_file(pretrained_path + "dict.txt")
    self.cls_token_id = 0
    self.pad_token_id = 1
    self.sep_token_id = 2
    self.pad_token = '<pad>'
    self.cls_token = '<s> '
    self.sep_token = ' </s>'
def fit(self, sentences):
    if self.model is None:
        from fairseq.models.roberta import RobertaModel
        from fairseq.data.encoders.fastbpe import fastBPE
        self.model = RobertaModel.from_pretrained(
            'PhoBERT_base_fairseq', checkpoint_file='model.pt')
        self.model.eval()
        args = BPE()
        self.model.bpe = fastBPE(args)
    return self
def __init__(
    self,
    model_path: str,
) -> None:
    self.bpe = fastBPE(Args(model_path + "/bpe.codes"))
    self.vocab = Dictionary()
    self.vocab.add_from_file(f"{model_path}/dict.txt")
    self._tokenizer_lowercases = False
    self.sequence_pair_start_tokens = [Token(text="<s>", text_id=0, type_id=0)]
    self.sequence_pair_mid_tokens = [
        Token(text="</s>", text_id=2, type_id=0),
        Token(text="</s>", text_id=2, type_id=0),
    ]
    self.sequence_pair_end_tokens = [Token(text="</s>", text_id=2, type_id=0)]
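# The token lists above implement RoBERTa's sequence-pair layout,
# <s> A </s> </s> B </s>. A self-contained illustration, assuming AllenNLP
# (whose Token these fields match) is installed; the text_id values for
# "hello"/"world" are made up:
from allennlp.data.tokenizers import Token

start = [Token(text="<s>", text_id=0, type_id=0)]
mid = [Token(text="</s>", text_id=2, type_id=0), Token(text="</s>", text_id=2, type_id=0)]
end = [Token(text="</s>", text_id=2, type_id=0)]
tokens_a = [Token(text="hello", text_id=100, type_id=0)]
tokens_b = [Token(text="world", text_id=200, type_id=0)]
pair = start + tokens_a + mid + tokens_b + end
print(" ".join(t.text for t in pair))  # <s> hello </s> </s> world </s>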
def __init__(self, pretrain="auxiliary_data/PhoBERT_base_fairseq"): self.phoBERT = RobertaModel.from_pretrained(pretrain, checkpoint_file='model.pt') self.phoBERT.eval() parser = options.get_preprocessing_parser() parser.add_argument('--bpe-codes', type=str, help='path to fastBPE BPE', default=pretrain + "/bpe.codes") args, unknown = parser.parse_known_args() self.phoBERT.bpe = fastBPE( args) #Incorporate the BPE encoder into PhoBERT
def __init__(self, pretrained_path='./bertweet/'):
    self.bpe = fastBPE(
        SimpleNamespace(bpe_codes=pretrained_path + "bpe.codes"))
    self.vocab = Dictionary()
    self.vocab.add_from_file(pretrained_path + "dict.txt")
    self.cls_token_id = 0
    self.pad_token_id = 1
    self.sep_token_id = 2
    self.pad_token = '<pad>'
    self.cls_token = '<s>'
    self.sep_token = '</s>'
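# A minimal standalone sketch of the same pattern: SimpleNamespace stands in
# for parsed argparse args, since a `bpe_codes` attribute is all fastBPE reads.
# The ./bertweet/ paths and the sample tweet are assumptions.
from types import SimpleNamespace

from fairseq.data import Dictionary
from fairseq.data.encoders.fastbpe import fastBPE

bpe = fastBPE(SimpleNamespace(bpe_codes='./bertweet/bpe.codes'))
vocab = Dictionary()
vocab.add_from_file('./bertweet/dict.txt')

# BPE-split one tweet, wrap it with <s>/</s>, then map subwords to dictionary ids
subwords = '<s> ' + bpe.encode('SC has first two presumptive cases of coronavirus') + ' </s>'
input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long()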
def __init__(self, max_length=512):
    self.bpe = fastBPE(BPEConfig)
    self.vocab = Dictionary()
    self.vocab.add_from_file(
        os.path.join(os.getcwd(), 'pretrained',
                     'PhoBERT_base_transformers', 'dict.txt'))
    self.rdr_segmenter = VnCoreNLP(
        os.path.join('vncorenlp', 'VnCoreNLP-1.1.1.jar'),
        annotators='wseg',
        max_heap_size='-Xmx500m'
    )
    self.max_length = max_length
def get_w2v_sent(arr_sent):
    # from transformers import RobertaModel
    # phobert_w2v = RobertaModel.from_pretrained(
    #     "PhoBERT_base_transformers/model.bin",
    #     config=config
    # )
    args = parser_w2v.parse_args()
    bpe = fastBPE(args)

    # Load the dictionary
    # vocab = Dictionary()
    # vocab.add_from_file("PhoBERT_base_transformers/dict.txt")

    # line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
    line = ' '.join(arr_sent[0])

    # Encode the line using fastBPE & add the prefix <s> and suffix </s>
    subwords = '<s> ' + bpe.encode(line) + ' </s>'

    # Map subword tokens to corresponding indices in the dictionary
    input_ids = vocab.encode_line(subwords, append_eos=False,
                                  add_if_not_exist=False).long().tolist()

    # Convert into a torch tensor
    all_input_ids = torch.tensor([input_ids], dtype=torch.long)

    # Extract features
    with torch.no_grad():
        features = phobert_w2v(all_input_ids)

    # Represent each word by the contextualized embedding of its first subword token
    # i. Get indices of the first subword tokens of words in the input sentence
    listSWs = subwords.split()
    firstSWindices = []
    for ind in range(1, len(listSWs) - 1):
        if not listSWs[ind - 1].endswith("@@"):
            firstSWindices.append(ind)

    # ii. Extract the corresponding contextualized embeddings
    vector_sent = []
    words = line.split()
    assert len(firstSWindices) == len(words)
    vectorSize = features[0][0, 0, :].size()[0]
    for word, index in zip(words, firstSWindices):
        vector_sent.append([
            features[0][0, index, :][_ind].item()
            for _ind in range(vectorSize)
        ])
        # print(word, " --> ", [features[0][0, index, :][_ind].item() for _ind in range(vectorSize)])

    result = np.array(vector_sent)
    result_vec = np.sum(result, axis=0)
    return result_vec
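# The first-subword alignment above can be checked in isolation: fastBPE marks
# every non-final subword of a word with a trailing "@@", so a token is the
# first subword of a word exactly when the *previous* token does not end in
# "@@". The BPE split shown here (Công_ngh@@ ệ) is illustrative:
subwords = '<s> Tôi là sinh_viên trường đại_học Công_ngh@@ ệ . </s>'.split()
first_sw_indices = [
    ind for ind in range(1, len(subwords) - 1)
    if not subwords[ind - 1].endswith("@@")
]
print(first_sw_indices)  # [1, 2, 3, 4, 5, 6, 8]: one index per word, skipping <s>/</s>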
def get_input_ids_and_att_masks(
        lines: pd.core.series.Series) -> Tuple[List, List]:
    # Load BPE Tokenizer
    print('Load BPE Tokenizer')
    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="./BERTweet_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args: argparse.Namespace = parser.parse_args()
    bpe: fastBPE = fastBPE(args)

    vocab: Dictionary = Dictionary()
    vocab.add_from_file("./BERTweet_base_transformers/dict.txt")

    input_ids: List = []
    attention_masks: List = []
    for line in lines:
        # (1) Tokenize the sentence
        # (2) Add <CLS> token and <SEP> token (<s> and </s>)
        # (3) Map tokens to IDs
        # (4) Pad/Truncate the sentence to `max_length`
        # (5) Create attention masks for [PAD] tokens
        subwords: str = '<s> ' + bpe.encode(line.lower()) + ' </s>'  # (1) + (2)
        line_ids: List = vocab.encode_line(
            subwords, append_eos=False,
            add_if_not_exist=False).long().tolist()  # (3)

        if len(line_ids) < MAX_LENGTH:
            paddings: torch.tensor = torch.ones(
                (1, MAX_LENGTH - len(line_ids)), dtype=torch.long)
            # convert the line_ids to a torch tensor and pad to MAX_LENGTH
            tensor_line_ids: torch.tensor = torch.cat(
                [torch.tensor([line_ids], dtype=torch.long), paddings], dim=1)
            line_attention_masks: torch.tensor = torch.cat([
                torch.ones((1, len(line_ids)), dtype=torch.long),
                torch.zeros((1, MAX_LENGTH - len(line_ids)), dtype=torch.long)
            ], dim=1)
        else:  # len(line_ids) >= MAX_LENGTH: truncate (a no-op at exactly MAX_LENGTH)
            tensor_line_ids: torch.tensor = torch.tensor(
                [line_ids[0:MAX_LENGTH]], dtype=torch.long)
            line_attention_masks: torch.tensor = torch.ones(
                (1, MAX_LENGTH), dtype=torch.long)

        input_ids.append(tensor_line_ids)
        attention_masks.append(line_attention_masks)

    return tuple([input_ids, attention_masks])
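# Hypothetical usage of the function above, assuming MAX_LENGTH is defined at
# module level and the BERTweet_base_transformers files are present; the tweets
# are made up:
tweets = pd.Series(["SC has first two presumptive cases of coronavirus",
                    "no new cases reported today"])
input_ids, attention_masks = get_input_ids_and_att_masks(tweets)
batch_ids = torch.cat(input_ids, dim=0)          # shape: (n_tweets, MAX_LENGTH)
batch_masks = torch.cat(attention_masks, dim=0)  # 1 for real tokens, 0 for padding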
def __init__(self, device, model):
    super(HaggingFaceEmbeddings, self).__init__(device=device)
    self.model_keys = self.get_model_keys()
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer, 'bert-base-uncased'),
        'openai-gpt': (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
        'transfo-xl-wt103': (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
        'gpt2': (GPT2Model, GPT2Tokenizer, 'gpt2'),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
        'roberta-base': (RobertaModel, RobertaTokenizer, 'roberta-base'),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
        'ctrl': (CTRLModel, CTRLTokenizer, 'ctrl'),
        'distilbert-base-cased': (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
        'camembert': (CamembertModel, CamembertTokenizer, 'camembert-base'),
        'albert-base-v2': (AlbertModel, AlbertTokenizer, 'albert-base-v2'),
        'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
        'flaubert_base_cased': (FlaubertModel, FlaubertTokenizer, 'flaubert/flaubert_base_cased'),
        'bart-large': (BartModel, BartTokenizer, 'facebook/bart-large'),
        't5-small': (T5Model, T5Tokenizer, 't5-small'),
        'electra-small-discriminator': (ElectraModel, ElectraTokenizer, 'google/electra-small-discriminator'),
        # DialoGPT
        'reformer-crime-and-punishment': (ReformerModel, ReformerTokenizer, 'google/reformer-crime-and-punishment'),
        'opus-mt-en-ROMANCE': (MarianMTModel, MarianTokenizer, 'Helsinki-NLP/opus-mt-en-ROMANCE'),
        'longformer-base-4096': (LongformerModel, LongformerTokenizer, 'allenai/longformer-base-4096'),
        'retribert': (RetriBertModel, RetriBertTokenizer, 'distilbert-base-uncased'),
        'mobilebert-uncased': (MobileBertModel, MobileBertTokenizer, 'google/mobilebert-uncased'),
    }
    if model not in self.model_keys:
        raise ValueError('{} is not in keys'.format(model))
    self.model_name = MODELS[model][2]
    self.tokenizer = MODELS[model][1].from_pretrained(self.model_name)
    self.model = MODELS[model][0].from_pretrained(self.model_name)
    self.model.eval()  # disable dropout (or leave in train mode to finetune)
    self.model.to(self.device)
    self.pad_token_id = self.tokenizer.pad_token_id
    self.embedding_dim = self.model.config.hidden_size

    # Load the BPE encoder for BERTweet
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="../data/models/BERTweet_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args = parser.parse_args()
    self.bpe = fastBPE(args)
    self.max_seq_length = 256
def __init__(self):
    # Load the model in fairseq
    MODEL_DIR = '/usr/local/software/pretrained-models/PhoBERT_base_fairseq'
    __checkpoint_file = join(MODEL_DIR, 'model.pt')
    self.__phoBERT = RobertaModel.from_pretrained(
        MODEL_DIR, checkpoint_file=__checkpoint_file)
    self.__phoBERT.eval()  # disable dropout (or leave in train mode to finetune)

    # Initialize Byte-Pair Encoding for PhoBERT
    class BPE():
        bpe_codes = '/usr/local/software/pretrained-models/PhoBERT_base_fairseq/bpe.codes'

    args = BPE()
    self.__phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT
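# The throwaway BPE class above is just a namespace carrying a `bpe_codes`
# attribute, which is all fastBPE reads; argparse.Namespace does the same job
# in one line (a minimal equivalent sketch, same path as above):
from argparse import Namespace

bpe = fastBPE(Namespace(
    bpe_codes='/usr/local/software/pretrained-models/PhoBERT_base_fairseq/bpe.codes'))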
def _init_model(pretrain_model):
    bpe_path = os.path.join(pretrain_model, "bpe.codes")
    BERTweet = RobertaModel.from_pretrained(pretrain_model,
                                            checkpoint_file='model.pt')
    BERTweet.eval()  # disable dropout (or leave in train mode to finetune)

    # Incorporate the BPE encoder into BERTweet-base
    parser = options.get_preprocessing_parser()
    parser.add_argument('--bpe-codes', type=str,
                        help='path to fastBPE BPE', default=bpe_path)
    args = parser.parse_args()
    BERTweet.bpe = fastBPE(args)  # Incorporate the BPE encoder into BERTweet
    return BERTweet
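# Usage sketch for _init_model; the checkpoint directory and the sample tweet
# are assumptions:
BERTweet = _init_model('./BERTweet_base_fairseq')
tokens = BERTweet.encode('SC has first two presumptive cases of coronavirus')
last_layer = BERTweet.extract_features(tokens)  # shape (1, n_subwords, hidden_size)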
def load_phobert_model():
    device = torch.device("cpu")
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default=paths.bpe_codes_path,
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args = parser.parse_args()
    bpe = fastBPE(args)
    vn_tokenizer = VnCoreNLP(paths.vncore_jar_path,
                             annotators="wseg",
                             max_heap_size='-Xmx500m')

    # config model
    config = RobertaConfig.from_pretrained(paths.config_path,
                                           output_hidden_states=True,
                                           num_labels=3)
    model_bert = RobertaForAIViVN.from_pretrained(paths.pretrained_path,
                                                  config=config)
    # model_bert.cuda()

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file(paths.dict_path)

    '''
    if torch.cuda.device_count():
        print(f"Testing using {torch.cuda.device_count()} gpus")
        model_bert = nn.DataParallel(model_bert)
        tsfm = model_bert.module.roberta
    else:
        tsfm = model_bert.roberta
    '''
    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta
    model_bert.load_state_dict(
        torch.load(paths.phobert_path, map_location=device))
    return bpe, vn_tokenizer, model_bert, vocab
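# The state dict at paths.phobert_path is loaded into the DataParallel-wrapped
# model above, so its keys evidently carry the "module." prefix. A sketch of
# loading the same weights into an *unwrapped* model by stripping that prefix:
state = torch.load(paths.phobert_path, map_location=device)
state = {k[len("module."):] if k.startswith("module.") else k: v
         for k, v in state.items()}
model_bert.load_state_dict(state)  # model_bert not wrapped in nn.DataParallel here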
def __init__(self,
             pretrain_path=None,
             n_class=config.MODEL['N_CLASS'],
             device='cpu'):
    super(ClassifierModel, self).__init__()
    self.device = device
    from fairseq.models.roberta import RobertaModel
    self.bert_model = RobertaModel.from_pretrained(
        config.PATH['PHO_BERT'], checkpoint_file='model.pt')
    self.bert_model.bpe = fastBPE(BPE())
    self.bert_model.register_classification_head('new_task',
                                                 num_classes=n_class)
    self.bert_model.to(device=device)
    if pretrain_path is not None:
        self.load_model(pretrain_path)
    self.bert_model.eval()
def load_bpe_and_vocab(args):
    if args.model.model_class == 'roberta':
        args.model.bpe_codes = os.path.join(utils.PROJ_DIR, args.model.bpe_codes)
        print(f"Loading BPE from pretrained checkpoint at {args.model.bpe_codes}")
        bpe = fastBPE(args.model)

        args.model.vocab = os.path.join(utils.PROJ_DIR, args.model.vocab)
        print(f"Loading vocab from pretrained checkpoint at {args.model.vocab}")
        vocab = Dictionary()
        vocab.add_from_file(args.model.vocab)
        print()
    else:
        bpe, vocab = None, None
    return bpe, vocab
def loadModel(self):
    parser = argparse.ArgumentParser(
        description='Load PhoBERT and the VnCoreNLP segmenter.')
    parser.add_argument('--bpe-codes', type=str,
                        help='path to fastBPE BPE', default=self.BPE_PATH)
    args = parser.parse_args("")
    phoBERT = RobertaModel.from_pretrained(self.MODEL_PATH,
                                           checkpoint_file='model.pt')
    phoBERT.eval()
    phoBERT.bpe = fastBPE(args)
    rdrsegmenter = VnCoreNLP(self.VNCORENLP_PATH,
                             annotators="wseg",
                             max_heap_size='-Xmx500m')
    return phoBERT, rdrsegmenter
def get_bert_embedding(lines: List[str]) -> List[np.ndarray]:
    # Load model
    config = RobertaConfig.from_pretrained(
        "./BERTweet_base_transformers/config.json")
    BERTweet = RobertaModel.from_pretrained(
        "./BERTweet_base_transformers/model.bin", config=config)

    # Load BPE encoder
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="./BERTweet_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args = parser.parse_args()
    bpe = fastBPE(args)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("./BERTweet_base_transformers/dict.txt")

    result: List[np.ndarray] = []
    for i in range(len(lines)):
        line: str = lines[i]
        # Encode the line using fastBPE & add the prefix <s> and suffix </s>
        subwords = '<s> ' + bpe.encode(line) + ' </s>'
        # Map subword tokens to corresponding indices in the dictionary
        input_ids = vocab.encode_line(subwords, append_eos=False,
                                      add_if_not_exist=False).long().tolist()
        # Convert into a torch tensor
        all_input_ids = torch.tensor([input_ids], dtype=torch.long)
        with torch.no_grad():
            features = BERTweet(all_input_ids)
        # Keep the <s> token's embedding as the sentence representation
        result.append(features[0][:, 0, :].numpy()[0])
    return result
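# Usage sketch: the function returns one hidden_size-dim <s> vector per input
# line, so a quick cosine similarity between two tweets looks like this
# (assuming numpy is imported as np, as elsewhere here; the tweets are made up):
embs = get_bert_embedding(["the market is up today",
                           "stocks rallied this morning"])
a, b = embs[0], embs[1]
cos_sim = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(cos_sim)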
def __init__(self,
             bpe_path: str,
             vncorenlp_path: str,
             do_lower_case: bool = False):
    bpe_codes_path = os.path.join(bpe_path, BPECODE_FILE)
    vocab_file_path = os.path.join(bpe_path, VOCAB_FILE)

    if not os.path.isfile(bpe_codes_path):
        raise EnvironmentError(f"{BPECODE_FILE} not found in {bpe_path}")
    if not os.path.isfile(vocab_file_path):
        raise EnvironmentError(f"{VOCAB_FILE} not found in {bpe_path}")

    self.do_lower_case = do_lower_case
    BPEConfig = namedtuple('BPEConfig', 'vncorenlp bpe_codes vocab')
    self.pho_config = BPEConfig(vncorenlp=vncorenlp_path,
                                bpe_codes=bpe_codes_path,
                                vocab=vocab_file_path)
    self.rdrsegmenter = VnCoreNLP(self.pho_config.vncorenlp,
                                  annotators="wseg",
                                  max_heap_size='-Xmx1g')
    self.bpe = fastBPE(self.pho_config)
    self.vocab = Dictionary()
    self.vocab.add_from_file(self.pho_config.vocab)
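# How the pieces initialized above chain at encode time (a sketch, not the
# class's actual method; `tok` stands for an instance of the class above):
# VnCoreNLP word-segments the raw sentence, fastBPE splits words into subwords,
# and the fairseq Dictionary maps subwords to ids.
sentences = tok.rdrsegmenter.tokenize("Tôi là sinh viên trường đại học Công nghệ .")
line = " ".join(sentences[0])  # word-segmented, e.g. "Tôi là sinh_viên ..."
subwords = '<s> ' + tok.bpe.encode(line) + ' </s>'
ids = tok.vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long()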
def __init__(self, model_path):
    # Load the pretrained PhoBERT model
    print("Loading Classification...")
    self.config = RobertaConfig.from_pretrained(
        model_path + 'PhoBERT/config.json',
        from_tf=False,
        num_labels=5,
        output_hidden_states=False,
    )
    self.phoBERT_cls = RobertaForSequenceClassification.from_pretrained(
        model_path + 'PhoBERT/model.bin', config=self.config)
    device = "cuda:0"
    self.phoBERT_cls = self.phoBERT_cls.to(device)
    self.phoBERT_cls.eval()
    print("Loading pre-trained model...")
    self.phoBERT_cls.load_state_dict(
        torch.load(
            model_path +
            'roberta_state_dict_9bfb8319-01b2-4301-aa5a-756d390a98e1.pth'))
    print("Finished loading PhoBERT Classification model.")

    # Load the BPE codes and vocabulary dictionary
    print("Loading BPE and vocab dict ...")

    class BPE():
        bpe_codes = model_path + 'PhoBERT/bpe.codes'

    args = BPE()
    self.bpe = fastBPE(args)
    self.vocab = Dictionary()
    self.vocab.add_from_file(model_path + "PhoBERT/dict.txt")
    print("Finished loading BPE and vocab dict.")

    # Load the text recognizer
    config = Cfg.load_config_from_name('vgg_transformer')
    config['weights'] = 'weights/transformerocr.pth'
    config['cnn']['pretrained'] = False
    config['device'] = 'cuda:0'
    config['predictor']['beamsearch'] = False
    self.text_recognizer = Predictor(config)
                    default='./PhoBERT_large_transformers/model.bin')
parser.add_argument('--max_sequence_length', type=int, default=256)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--accumulation_steps', type=int, default=5)
parser.add_argument('--epochs', type=int, default=5)
parser.add_argument('--fold', type=int, default=0)
parser.add_argument('--seed', type=int, default=69)
parser.add_argument('--lr', type=float, default=3e-5)
parser.add_argument('--ckpt_path', type=str, default='./models_zalo')
parser.add_argument('--bpe-codes',
                    default="./PhoBERT_large_transformers/bpe.codes",
                    type=str,
                    help='path to fastBPE BPE')
args = parser.parse_args()

bpe = fastBPE(args)
rdrsegmenter = VnCoreNLP(args.rdrsegmenter_path,
                         annotators="wseg",
                         max_heap_size='-Xmx500m')
seed_everything(69)

# Load model
config = RobertaConfig.from_pretrained(args.config_path,
                                       output_hidden_states=True,
                                       num_labels=6)
model_bert = RobertaForAIViVN.from_pretrained(args.pretrained_path,
                                              config=config)
model_bert.cuda()
from fairseq.models.roberta import RobertaModel

phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

parser = options.get_preprocessing_parser()
parser.add_argument('--bpe-codes', type=str,
                    help='path to fastBPE BPE',
                    default="PhoBERT_base_fairseq/bpe.codes")
args = parser.parse_args()
phobert.bpe = fastBPE(args)


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor,
                       other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
        other_tokens (List[str]): other tokens of shape `(T_words)`

    Returns:
        List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0
from joblib import Parallel, delayed
import warnings

warnings.filterwarnings("ignore")

LEARNING_RATE = 6e-5
MAX_LEN = 126
TRAIN_BATCH_SIZE = 35
VALID_BATCH_SIZE = 32
EPOCHS = 4

INPUT_PATH = "/content/drive/My Drive/Tweet Sentiment Extraction/input/"
OUTPUT_PATH = ""
TRAINING_FILE = f""
ROBERTA_PATH = f""

key = argparse.Namespace(bpe_codes=f"BERTweet_base_transformers/bpe.codes")
bpe = fastBPE(key)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("BERTweet_base_transformers/dict.txt")

# TOKENIZER = transformers.RobertaTokenizer(
#     vocab_file=f'{ROBERTA_PATH}vocab.json',
#     merges_file=f'{ROBERTA_PATH}merges.txt',
#     lowercase=True,
#     add_prefix_space=True
# )


class AverageMeter:
    """
    Computes and stores the average and current value
def __init__(self, bpe_path, vocab_path):
    self._bpe = fastBPE(Config(bpe_codes=bpe_path))
    self._vocab = self._get_vocab(vocab_path)
import torch
from fairseq.models.roberta import RobertaModel
from fairseq.data.encoders.fastbpe import fastBPE
from CONFIG import *

phoBERT = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phoBERT.eval()  # disable dropout (or leave in train mode to finetune)


class BPE():
    bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'


args = BPE()
phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT


def embedding_document(document):
    doc = ViTokenizer.tokenize(document)
    tokens = phoBERT.encode(doc)
    if len(tokens) > 256:
        chunks = math.ceil(len(tokens) / 256)
        emb = []
        sum_tokens = len(tokens)
        chunks = min(chunks, 3)
        for i in range(chunks):
            sum_tokens = sum_tokens - 256
            if sum_tokens > 0:
                emb.append(
                    phoBERT.extract_features(tokens[i * 256:(i + 1) *
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["vlsp_2018_single", "vlsp_2018_NLI_M",
                                 "vlsp_2018_QA_M", "vlsp_2018_NLI_B",
                                 "vlsp_2018_QA_B"],
                        help="The name of the task to train.")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument('--bpe-codes',
                        default=None,
                        required=True,
                        type=str,
                        help='path to fastBPE BPE')
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_save_model",
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoint.")
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # prepare dataloaders
    processors = {
        "vlsp_2018_single": VLSP_2018_single_Processor,
        "vlsp_2018_NLI_M": VLSP_2018_NLI_M_Processor,
        "vlsp_2018_QA_M": VLSP_2018_QA_M_Processor,
        "vlsp_2018_NLI_B": VLSP_2018_NLI_B_Processor,
        "vlsp_2018_QA_B": VLSP_2018_QA_B_Processor,
    }

    processor = processors[args.task_name]()
    label_list = processor.get_labels()

    bert_config = RobertaConfig.from_pretrained(args.bert_config_file)
    bert_config.num_labels = len(label_list)

    label2id = {}
    id2label = {}
    for (i, label) in enumerate(label_list):
        label2id[label] = i
        id2label[str(i)] = label
    bert_config.label2id = label2id
    bert_config.id2label = id2label

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bpe = fastBPE(args)
    vocab = Dictionary()
    vocab.add_from_file(args.vocab_file)

    # training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, bpe, vocab)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # dev set
    dev_examples = processor.get_dev_examples(args.data_dir)
    dev_features = convert_examples_to_features(
        dev_examples, label_list, args.max_seq_length, bpe, vocab)

    all_dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

    dev_data = TensorDataset(all_dev_input_ids, all_dev_input_mask, all_dev_segment_ids, all_dev_label_ids)
    dev_dataloader = DataLoader(dev_data, batch_size=args.eval_batch_size, shuffle=False)

    # test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, bpe, vocab)

        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False)

    # model and optimizer
    model = RobertaForSequenceClassification(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write("epoch\tglobal_step\tloss\tdev_loss\tdev_accuracy\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # RoBERTa does not use token_type_ids
            loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        if args.do_save_model:
            if n_gpu > 1:
                torch.save(model.module.state_dict(),
                           os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))
            else:
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))

        # dev eval
        model.eval()
        dev_loss, dev_accuracy = 0, 0
        nb_dev_steps, nb_dev_examples = 0, 0
        with open(os.path.join(args.output_dir, "dev_ep_" + str(epoch) + ".txt"), "w") as f_dev:
            for input_ids, input_mask, segment_ids, label_ids in dev_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_dev_test_loss, logits = model(input_ids=input_ids,
                                                      attention_mask=input_mask,
                                                      labels=label_ids)

                logits = F.softmax(logits, dim=-1)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                outputs = np.argmax(logits, axis=1)
                for output_i in range(len(outputs)):
                    f_dev.write(str(outputs[output_i]))
                    for ou in logits[output_i]:
                        f_dev.write(" " + str(ou))
                    f_dev.write("\n")
                tmp_dev_accuracy = np.sum(outputs == label_ids)

                dev_loss += tmp_dev_test_loss.mean().item()
                dev_accuracy += tmp_dev_accuracy

                nb_dev_examples += input_ids.size(0)
                nb_dev_steps += 1

        dev_loss = dev_loss / nb_dev_steps
        dev_accuracy = dev_accuracy / nb_dev_examples

        # eval_test
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"), "w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids=input_ids,
                                                      attention_mask=input_mask,
                                                      labels=label_ids)

                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps,
                      'dev_loss': dev_loss,
                      'dev_accuracy': dev_accuracy,
                      'test_loss': test_loss,
                      'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")