def __init__(self, max_seq_len: int, model_name: str, model_path: Optional[str] = '', device: Optional[str] = 'cpu'):
    """
    Parameters
    ----------
    max_seq_len : int
    model_name : str
        The name of the model, should be one of 'word2vec', 'xlm-roberta-base', 'xlm-roberta-large',
        'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2'
    model_path : str, optional
        The path to the w2v file / finetuned Transformer model path. Required for w2v.
    device : str, optional
        Torch device string, e.g. 'cpu' or 'cuda'.
    """
    super(TextEmbedder, self).__init__()
    assert model_name in ['word2vec', 'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base',
                          'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2']
    self.max_seq_len = max_seq_len
    self.model_name = model_name
    self.device = torch.device(device)
    if model_path == '':
        model_path = model_name  # TODO check if the
    print('TextEmbedder: Loading model {} ({})'.format(model_name, model_path))
    if model_name == 'word2vec':
        assert os.path.isfile(model_path)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', model_max_length=self.max_seq_len + 1)
        self._load_weibo_w2v(model_path)
        self.embed_dim = 300
    elif model_name in ['vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news',
                        'jordan-m-young/buzz-article-gpt-2']:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
        self.model = AutoModel.from_pretrained(model_path, return_dict=True).to(self.device)  # T5 for news doesn't have 'add_pooling_layer' option
        self.embed_dim = 768
    else:
        assert model_path in ['xlm-roberta-base', 'xlm-roberta-large'] or os.path.isdir(model_path)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
        self.model = XLMRobertaModel.from_pretrained(model_path, return_dict=True, add_pooling_layer=False).to(self.device)
        self.embed_dim = 768
    print('TextEmbedder: Finished loading model {}'.format(model_name))
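# A minimal usage sketch for the TextEmbedder constructor above; the argument values are
# illustrative. With an empty model_path, the hub name 'xlm-roberta-base' is reused as the
# checkpoint path, so the else branch loads XLM-R base and embed_dim ends up as 768.
embedder = TextEmbedder(max_seq_len=128, model_name='xlm-roberta-base')
print(embedder.embed_dim)  # 768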
def __init__(self, vocabs, opt, predict_inverse=False):
    super(XLMRPredictor, self).__init__(vocabs=vocabs, opt=opt)
    self.xlmr = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    # Drop the last child module (the pooler) and keep the embeddings and encoder
    self.model = nn.Sequential(*list(self.xlmr.children())[:-1])
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool = False):
    super().__init__()
    self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
    # Check that the requested layer indices are valid for this encoder
    encoder_layer_number = len(self.encoder.encoder.layer)
    if isinstance(layers, list):
        self.layers = [int(l) for l in layers]
    elif isinstance(layers, str):
        self.layers = list(map(int, layers.split(',')))
    else:
        raise TypeError("`layers` only supports str or list[int]")
    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                f"a RoBERTa model with {encoder_layer_number} layers."
        else:
            assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                f"a RoBERTa model with {encoder_layer_number} layers."
    self._cls_index = self.tokenzier.encoder['<s>']
    self._sep_index = self.tokenzier.encoder['</s>']
    # Needed when generating word pieces
    self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
    self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
    self.pooled_cls = pooled_cls
def get_embeddings_contextualized(self, dataset: CLIRDataset) -> torch.Tensor:
    """Compute average contextualized embeddings for multi-piece words."""
    # Encode the input sentences with XLM-RoBERTa
    model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)
    all_input_ids = [input_ids.input_ids for input_ids in dataset.data]
    all_token_len = [token_len.token_len for token_len in dataset.data]
    all_input_ids = torch.tensor(all_input_ids)
    all_outputs = model(all_input_ids)
    embeddings = all_outputs[2][0]  # shape (batch_size, max_seq_len, emb_dim)
    doc_size, _, emb_size = embeddings.shape
    # Average all pieces for multi-piece words
    idxs, masks, token_num, token_len = token_lens_to_idxs(all_token_len)
    # Reshape idxs from (batch_size, seq_len) to (batch_size, seq_len, embed_size)
    idxs = all_input_ids.new(idxs).unsqueeze(-1).expand(doc_size, -1, emb_size) + 1
    masks = embeddings.new(masks).unsqueeze(-1)
    outputs = torch.gather(embeddings, 1, idxs) * masks
    outputs = outputs.view(doc_size, token_num, token_len, emb_size)
    outputs = outputs.mean(dim=2)
    return outputs
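# A minimal, self-contained sketch of the gather-and-average step used above. The tensors
# and the piece-index/mask layout are made up for illustration, and this is not the
# project's token_lens_to_idxs helper (which also shifts indices by one to skip the <s> piece).
import torch

batch_size, seq_len, emb_size = 1, 6, 4
embeddings = torch.arange(batch_size * seq_len * emb_size, dtype=torch.float).view(batch_size, seq_len, emb_size)

# Two words: the first made of pieces 0 and 1, the second of piece 2 only (padded with 0)
token_num, token_len = 2, 2
idxs = torch.tensor([[0, 1, 2, 0]])            # (batch_size, token_num * token_len)
masks = torch.tensor([[0.5, 0.5, 1.0, 0.0]])   # averaging weights, zero for the padding slot

gathered = torch.gather(embeddings, 1, idxs.unsqueeze(-1).expand(batch_size, -1, emb_size))
averaged = (gathered * masks.unsqueeze(-1)).view(batch_size, token_num, token_len, emb_size).sum(dim=2)
print(averaged.shape)  # torch.Size([1, 2, 4]) -- one averaged vector per word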
def __init__(self, config):
    super(Model, self).__init__()
    self.bert = XLMRobertaModel.from_pretrained(config.pretrained_path)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, 1)
    self.sigmoid = nn.Sigmoid()
def __init__(self, choose_model):
    """Initialize Text Encoder.

    Args:
        choose_model (str): Only XLM-R possible for now.
    """
    self.model_max_length = 128
    if choose_model.lower() == "XLM-R".lower():
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base', model_max_length=self.model_max_length)
        self.model = XLMRobertaModel.from_pretrained(
            'xlm-roberta-base', output_hidden_states=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device, non_blocking=True)
        self.PAD_TOKEN = "<pad>"
        self.BOS_TOKEN = "<s>"
        self.EOS_TOKEN = "</s>"
        self.UNK_TOKEN = "<unk>"
        self.add_special_token = True
        self.pad_to_max_length = True
        self.target_embedding_matrix = []
        self.proj_embedding_source_target = []
        self.src_word2ind = {}
        self.trg_word2ind = {}
        self.src_ind2word = {}
        self.trg_ind2word = {}
        self.norm_trg_embedding_matrix = []
    else:
        raise ValueError("No correct model was chosen; only 'XLM-R' is supported.")
def __init__(self, num_classes):
    super(XLMRoBERTaNER, self).__init__()
    self.embedding_dim = 768
    self.num_classes = num_classes
    self.RoBERTa = XLMRobertaModel.from_pretrained("xlm-roberta-base")
    self.linear = nn.Linear(self.embedding_dim, self.num_classes)
def __init__(self, config, args):
    super(XPairModel, self).__init__(config)
    self.args = args
    self.num_labels = args.num_pair_labels
    self.roberta = XLMRobertaModel(config=config)
    self.classifier = SequenceClassifier(config.hidden_size, self.num_labels)
def __init__(self):
    super(ToxicSimpleNNModel, self).__init__()
    self.backbone = XLMRobertaModel.from_pretrained(BACKBONE_PATH)
    self.dropout = nn.Dropout(0.25)
    self.linear = nn.Linear(
        in_features=self.backbone.pooler.dense.out_features * 2,
        out_features=2,
    )
def __init__(self, config, model_name, only_embedding=True, output_hidden_states=True):
    super(XLMTokenEmbedder, self).__init__(config)
    self.config = config
    self.only_embedding = only_embedding
    self.model = XLMRobertaModel.from_pretrained(model_name, output_hidden_states=output_hidden_states)
    if self.only_embedding:
        self.model = self.model.get_input_embeddings()
        self.model.weight.requires_grad = False
def __init__(self, config):
    super(Bert_CRF, self).__init__(config)
    self.num_labels = config.num_labels
    self.bert = XLMRobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)
    self.init_weights()
    self.crf = CRF(self.num_labels, batch_first=True)
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.roberta = XLMRobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
def __init__(self, hparams):
    super().__init__()
    self.encoder = XLMRobertaModel.from_pretrained(hparams.pretrained_path)  # load pretrained model
    self.encoder_size = hparams.bert_output  # encoder output size - 768 for base architecture
    self.num_output = hparams.num_output  # number of labels - 6
    self.fc = nn.Linear(self.encoder_size, self.num_output)  # fully connected layer
    self.out = nn.Sigmoid()
def __init__(self, config, args):
    super(XNLUModel, self).__init__(config)
    self.args = args
    self.num_intent_labels = len(get_intent_labels(args))
    self.num_slot_labels = len(get_slot_labels(args))
    self.roberta = XLMRobertaModel(config=config)
    self.intent_classifier = SequenceClassifier(config.hidden_size, self.num_intent_labels)
    self.slot_classifier = TokenClassifier(config.hidden_size, self.num_slot_labels)
def __init__(self, model_name, num_labels):
    """
    Args:
        model_name: model name, e.g. 'roberta-base'
        num_labels: number of classes to classify
    """
    super().__init__()
    self.w = nn.Linear(768, 1, bias=False)
    self.bert = XLMRobertaModel.from_pretrained(model_name)
    self.prediction_layer = nn.Linear(768, num_labels)
    self.init_weights()
def make_pretrained_transformer_and_tokenizer(transformer_name: str):
    if 'distilgpt2' in transformer_name:
        print("DistilGPT2!")
        model = GPT2Model.from_pretrained('distilgpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
    else:
        print(f"Loading {transformer_name}!")
        model = XLMRobertaModel.from_pretrained(transformer_name)
        tokenizer = XLMRobertaTokenizer.from_pretrained(transformer_name)
    return model, tokenizer
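# A minimal usage sketch for the factory above, assuming the `transformers` package is
# installed; the 'xlm-roberta-base' weights are downloaded on first use.
model, tokenizer = make_pretrained_transformer_and_tokenizer('xlm-roberta-base')
input_ids = torch.tensor([tokenizer.encode("Hello world", add_special_tokens=True)])
last_hidden = model(input_ids)[0]
print(last_hidden.shape)  # (1, num_word_pieces, 768)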
def __init__(self, config, model_name=None, smoothing=0.0):
    super(XLMRobertaForSequenceClassification, self).__init__(config)
    self.num_labels = 42
    self.l2_reg_lambda = config.l2_reg_lambda
    if model_name is not None:
        self.xlmroberta = XLMRobertaModel.from_pretrained(model_name, config=config)
    else:
        self.xlmroberta = XLMRobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    classifier_size = config.hidden_size * 3
    self.classifier = nn.Linear(classifier_size, self.config.num_labels)
    self.latent_size = config.hidden_size
    self.latent_type = nn.Parameter(torch.FloatTensor(3, config.hidden_size), requires_grad=True)
    self.smoothing = smoothing != 0.0
def __init__(self, use_aux=True):
    super(ToxicSimpleNNModel, self).__init__()
    self.backbone = XLMRobertaModel.from_pretrained(BACKBONE_PATH)
    self.dropout = nn.Dropout(0.3)
    aux_len = 0
    if use_aux:
        aux_len = 5
    self.linear = nn.Linear(
        in_features=self.backbone.pooler.dense.out_features * 2,
        out_features=2 + aux_len,
    )
def __init__(self, config, args):
    super(RBERT, self).__init__(config)
    self.roberta = XLMRobertaModel(config=config)
    self.num_labels = config.num_labels
    self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, 0.1)
    self.e1_fc_layer = FCLayer(config.hidden_size, config.hidden_size, 0.1)
    self.e2_fc_layer = FCLayer(config.hidden_size, config.hidden_size, 0.1)
    self.label_classifier = FCLayer(config.hidden_size * 3, self.num_labels, 0.1, use_activation=False)
def __init__(self, config, num_hidden_layers, type="bert"):
    super().__init__()
    if type == "bert":
        self.bert = BertModel.from_pretrained(
            "bert-base-multilingual-cased", num_hidden_layers=num_hidden_layers)
    elif type == "xlmr":
        config.num_hidden_layers = 3
        # self.bert = XLMRobertaModel.from_pretrained("xlm-roberta-base", num_hidden_layers=num_hidden_layers)
        self.bert = XLMRobertaModel(config)
    else:
        raise KeyError(f"{type} is not a valid type!")
def __init__(self):
    super(XLMRobertaLargeTC, self).__init__()
    config = XLMRobertaConfig.from_pretrained('xlm-roberta-large', output_hidden_states=True)
    self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large', config=config)
    self.fc = nn.Linear(config.hidden_size, 1)
    self.dropout = nn.Dropout(p=0.2)
    # initialize weight
    nn.init.normal_(self.fc.weight, std=0.02)
    nn.init.normal_(self.fc.bias, 0)
def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool = False):
    super().__init__()
    self.tokenzier = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)
    self._cls_index = self.tokenzier.encoder['<s>']
    self._sep_index = self.tokenzier.encoder['</s>']
    self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
    self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
    self.pooled_cls = pooled_cls
def build_model():
    src_encoder = XLMRobertaModel.from_pretrained('xlm-roberta-large')
    en2fr = TransformerModel.from_pretrained(
        '/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/',
        checkpoint_file='model.pt',
        bpe='subword_nmt',
        bpe_codes='/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/bpecodes'
    )
    tgt_encoder = [
        model for name, model in en2fr.named_modules()
        if name == 'models.0.encoder'
    ][0]
    return src_encoder, tgt_encoder
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels_list
    self.roberta = XLMRobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.pooler = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size),
        nn.Tanh())
    self.classifiers = nn.ModuleList([
        nn.Linear(config.hidden_size, num_label)
        for num_label in self.num_labels
    ])
    self.init_weights()
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None,
             model_args: Dict = {}, tokenizer_args: Dict = {}):
    super(XLMRoBERTa, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case
    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case
    self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
    if max_seq_length > self.tokenizer.max_len_single_sentence:
        logging.warning("XLM-RoBERTa only allows a max_seq_length of {}".format(self.tokenizer.max_len_single_sentence))
        max_seq_length = self.tokenizer.max_len_single_sentence
    self.max_seq_length = max_seq_length
def xlmr_model():
    config = XLMRobertaConfig(
        vocab_size=251000,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act='gelu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=256,
        type_vocab_size=2,
        is_decoder=False,
        initializer_range=0.02,
    )
    return XLMRobertaModel(config=config)
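# A small sanity-check sketch for the randomly initialised test model above; the batch
# shape and the random input ids are illustrative, not taken from the original test suite.
model = xlmr_model()
input_ids = torch.randint(0, model.config.vocab_size, (2, 10))  # (batch_size, seq_len)
last_hidden = model(input_ids)[0]
print(last_hidden.shape)  # torch.Size([2, 10, 32]) -- hidden_size from the config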
def test_xlm_roberta_large(self):
    model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True)
    # The dog is cute and lives in the garden house
    input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
    expected_output_shape = torch.Size((1, 12, 1024))  # batch_size, sequence_length, embedding_vector_dim
    expected_output_values_last_dim = torch.tensor(
        [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004,
          -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]]
    )
    # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
    # xlmr.eval()
    # expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
    output = model(input_ids)["last_hidden_state"].detach()
    self.assertEqual(output.shape, expected_output_shape)
    # compare the actual values for a slice of the last dim
    self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
def __init__(self, config, num_intents=12, num_slots=31, return_dict=True):
    # BUG: num_intents is not passed through when loading the model
    super(JointClassifier, self).__init__(config)
    self.num_labels = config.num_labels
    self.num_intent_labels = num_intents
    self.num_slot_labels = num_slots
    self.roberta = XLMRobertaModel(config, add_pooling_layer=True)
    self.intent_clf = IntentClassifier(768, self.num_intent_labels)
    self.slot_clf = SlotClassifier(768, self.num_slot_labels)
    self.return_dict = return_dict
    # Initialize weights; uses the inherited RoBERTa init
    self.init_weights()
def words_embedding(topic_abstract):
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    # raw_dataset = load_raw_dataset(DATA_ROOT_PATH)
    # id_texts = load_tweet_ids_with_text(raw_dataset)
    # ids = [id_texts[idx][0] for idx in range(len(id_texts))]
    # texts = [id_texts[idx][1] for idx in range(len(id_texts))]
    topic_abstract_text = ''
    for word in topic_abstract:
        topic_abstract_text += word + ' '
    result = []
    input_ids = torch.tensor(
        [tokenizer.encode(topic_abstract_text, add_special_tokens=True)])[:, :512]
    with torch.no_grad():
        outputs = model(input_ids)[1].tolist()
    return np.array(outputs)
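# A usage sketch for words_embedding above; the topic words are made up for illustration
# and the call downloads 'xlm-roberta-base' on first use.
topic_vector = words_embedding(['climate', 'change', 'policy'])
print(topic_vector.shape)  # (1, 768) -- the model's pooled sentence representation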
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True):
    super(XLMRoBERTa, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case
    self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path)
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)
    if max_seq_length > self.tokenizer.max_len_single_sentence:
        logging.warning("XLM-RoBERTa only allows a max_seq_length of {}".format(
            self.tokenizer.max_len_single_sentence))
        max_seq_length = self.tokenizer.max_len_single_sentence
    self.max_seq_length = max_seq_length
    self.cls_token_id = self.tokenizer.cls_token_id
    self.eos_token_id = self.tokenizer.eos_token_id