def init_data(self, use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')
    torch.set_grad_enabled(False)

    cfg = AlbertConfig()
    self.torch_embedding = AlbertEmbeddings(cfg)
    self.torch_embedding.eval()
    if use_cuda:
        self.torch_embedding.to(test_device)

    self.turbo_embedding = turbo_transformers.AlbertEmbeddings.from_torch(
        self.torch_embedding)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_length),
                              dtype=torch.long,
                              device=test_device)
    position_ids = torch.arange(seq_length,
                                dtype=torch.long,
                                device=input_ids.device)
    position_ids = position_ids.repeat(batch_size, 1)
    token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)
    return input_ids, position_ids, token_type_ids
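# A minimal sketch of how the tensors returned by init_data could be used to
# compare the PyTorch and turbo_transformers embeddings. The method name
# check_torch_and_turbo, the unittest.TestCase context, and the assumption that
# the turbo module accepts the same (input_ids, position_ids, token_type_ids)
# arguments and returns a torch.Tensor are illustrative, not taken from the
# snippet above.
def check_torch_and_turbo(self, use_cuda: bool):
    input_ids, position_ids, token_type_ids = self.init_data(use_cuda)
    torch_result = self.torch_embedding(input_ids=input_ids,
                                        position_ids=position_ids,
                                        token_type_ids=token_type_ids)
    turbo_result = self.turbo_embedding(input_ids, position_ids, token_type_ids)
    # A small tolerance absorbs float32 rounding differences between the two implementations.
    self.assertTrue(torch.max(torch.abs(torch_result - turbo_result)) < 1e-5)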
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a language model either by supplying

    * the name of a remote model on s3 ("albert-base" ...)
    * or a local path of a model trained via transformers ("some_dir/huggingface_model")
    * or a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: name or path of a model
    :param language: (Optional) Name of language the model was trained for (e.g. "german").
                     If not supplied, FARM will try to infer it from the model name.
    :return: Language Model
    """
    albert = cls()
    if "farm_lm_name" in kwargs:
        albert.name = kwargs["farm_lm_name"]
    else:
        albert.name = pretrained_model_name_or_path
    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        config = AlbertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        albert.model = AlbertModel.from_pretrained(farm_lm_model, config=config, **kwargs)
        albert.language = albert.model.config.language
    else:
        # Huggingface transformers style
        albert.model = AlbertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
        albert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
    return albert
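# Hedged usage sketch for the load() classmethod above. The enclosing class is
# assumed to be FARM's Albert language-model wrapper (import path below is an
# assumption); the model name and local directory are placeholders, not values
# taken from the snippet.
from farm.modeling.language_model import Albert

# Remote transformers model resolved by name.
albert = Albert.load("albert-base-v2", language="english")
# A FARM-format checkpoint would instead be a local directory containing
# language_model_config.json and language_model.bin:
# albert = Albert.load("some_dir/farm_model")
print(albert.name, albert.language)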
def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig()
    self.torch_layer = AlbertLayer(self.cfg)
    if torch.cuda.is_available():
        self.torch_layer.to(self.test_device)
    self.torch_layer.eval()

    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.rand(size=(batch_size, seq_length, self.hidden_size),
                                   dtype=torch.float32,
                                   device=self.test_device)

    self.attention_mask = torch.ones((batch_size, seq_length),
                                     dtype=torch.float32,
                                     device=self.test_device)
    self.attention_mask = self.attention_mask[:, None, None, :]
    self.attention_mask = (1.0 - self.attention_mask) * -10000.0

    self.turbo_layer = turbo_transformers.AlbertLayer.from_torch(self.torch_layer)
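# A minimal correctness check built on the state prepared above. It assumes the
# fixture is a unittest.TestCase, that AlbertLayer returns a tuple whose first
# element is the hidden states (the usual transformers convention), and that the
# turbo layer is callable with the same (hidden_states, attention_mask) pair and
# returns a torch.Tensor; none of this is taken from the snippet itself.
def check_torch_and_turbo(self, use_cuda: bool):
    self.init_data(use_cuda)
    torch_result = self.torch_layer(self.input_tensor, self.attention_mask)[0]
    turbo_result = self.turbo_layer(self.input_tensor, self.attention_mask)
    # element-wise agreement within float32 tolerance
    self.assertTrue(
        torch.allclose(torch_result, turbo_result, atol=1e-3, rtol=1e-3))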
def init_data(self, use_cuda):
    test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    cfg = AlbertConfig(attention_probs_dropout_prob=0.0,
                       hidden_dropout_prob=0.0)

    torch_attention = AlbertAttention(cfg)
    torch_attention.eval()
    if use_cuda:
        torch_attention.to(test_device)

    # Get FT Attention
    turbo_attention = turbo_transformers.AlbertAttention.from_torch(torch_attention)

    hidden_size = cfg.hidden_size
    input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                              dtype=torch.float32,
                              device=test_device)
    attention_mask = torch.ones((batch_size, seq_length),
                                dtype=torch.float32,
                                device=test_device)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0

    return torch_attention, turbo_attention, input_tensor, attention_mask
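# Hedged sketch showing how the tuple returned by init_data could drive a
# comparison between the two attention implementations. Indexing the torch
# output with [0] follows the transformers tuple convention, and the turbo
# module is assumed to take the same arguments and return a torch.Tensor; both
# points are assumptions, not facts from the snippet.
def compare_attention(self, use_cuda: bool):
    torch_attention, turbo_attention, input_tensor, attention_mask = \
        self.init_data(use_cuda)
    torch_out = torch_attention(input_tensor, attention_mask)[0]
    turbo_out = turbo_attention(input_tensor, attention_mask)
    max_diff = torch.max(torch.abs(torch_out - turbo_out)).item()
    print(f"max |torch - turbo| for AlbertAttention: {max_diff:.2e}")
    return max_diff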
def __init__(self, cfg):
    super(DSB_ALBERTModel, self).__init__()
    self.cfg = cfg
    cate_col_size = len(cfg.cate_cols)
    cont_col_size = len(cfg.cont_cols)
    self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0)

    def get_cont_emb():
        return nn.Sequential(nn.Linear(cont_col_size, cfg.hidden_size),
                             nn.LayerNorm(cfg.hidden_size),
                             nn.ReLU(),
                             nn.Linear(cfg.hidden_size, cfg.hidden_size))

    self.cont_emb = get_cont_emb()

    self.config = AlbertConfig(
        3,  # vocab_size, not used
        embedding_size=cfg.emb_size * cate_col_size + cfg.hidden_size,
        hidden_size=cfg.emb_size * cate_col_size + cfg.hidden_size,
        num_hidden_layers=cfg.nlayers,
        # num_hidden_groups=1,
        num_attention_heads=cfg.nheads,
        intermediate_size=cfg.hidden_size,
        hidden_dropout_prob=cfg.dropout,
        attention_probs_dropout_prob=cfg.dropout,
        max_position_embeddings=cfg.seq_len,
        type_vocab_size=1,
        # initializer_range=0.02,
        # layer_norm_eps=1e-12,
    )
    self.encoder = AlbertModel(self.config)

    def get_reg():
        return nn.Sequential(
            nn.Linear(cfg.emb_size * cate_col_size + cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.target_size),
        )

    self.reg_layer = get_reg()
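# The __init__ above only wires up the sub-modules; the forward pass is not
# shown. Below is a hedged sketch of how the pieces could fit together, assuming
# cate_x of shape (batch, seq_len, n_categorical), cont_x of shape
# (batch, seq_len, n_continuous), and a (batch, seq_len) attention mask. The
# concatenation order, the use of inputs_embeds, and reading the prediction off
# the last time step are illustrative assumptions, not the author's forward().
def forward(self, cate_x, cont_x, mask):
    batch_size, seq_len, _ = cate_x.size()
    # (batch, seq_len, n_cate, emb_size) -> (batch, seq_len, n_cate * emb_size)
    cate_emb = self.cate_emb(cate_x).view(batch_size, seq_len, -1)
    # (batch, seq_len, hidden_size)
    cont_emb = self.cont_emb(cont_x)
    # Concatenated width matches self.config.hidden_size by construction.
    seq_emb = torch.cat([cate_emb, cont_emb], dim=2)
    encoded = self.encoder(inputs_embeds=seq_emb, attention_mask=mask)[0]
    # Regress the target from the representation of the last position.
    return self.reg_layer(encoded[:, -1])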
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a pretrained model by supplying

    * the name of a remote model on s3 ("distilbert-base-german-cased" ...)
    * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
    * OR a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
    :type pretrained_model_name_or_path: str
    """
    distilbert = cls()
    if "farm_lm_name" in kwargs:
        distilbert.name = kwargs["farm_lm_name"]
    else:
        distilbert.name = pretrained_model_name_or_path
    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        config = DistilBertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        distilbert.model = DistilBertModel.from_pretrained(farm_lm_model,
                                                           config=config,
                                                           **kwargs)
        distilbert.language = distilbert.model.config.language
    else:
        # Pytorch-transformers style
        distilbert.model = DistilBertModel.from_pretrained(
            str(pretrained_model_name_or_path), **kwargs)
        distilbert.language = cls._get_or_infer_language_from_name(
            language, pretrained_model_name_or_path)
    config = distilbert.model.config

    # DistilBERT does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler.
    # The pooler takes the first hidden representation and feeds it to a dense layer of (hidden_dim x hidden_dim).
    # We don't want a dropout at the end of the pooler, since we already apply dropout in the adaptive model
    # before everything is fed to the prediction head.
    config.summary_last_dropout = 0
    config.summary_type = 'first'
    config.summary_activation = 'tanh'
    distilbert.pooler = SequenceSummary(config)
    distilbert.pooler.apply(distilbert.model._init_weights)
    return distilbert
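# Hedged usage sketch for the DistilBERT loader above. It assumes the enclosing
# class is FARM's DistilBert wrapper (assumed import path below) and that the
# pooler, a SequenceSummary with summary_type='first', is applied to the last
# hidden state to obtain a pooled vector; the model name and example text are
# placeholders, not values from the snippet.
from transformers import DistilBertTokenizer
from farm.modeling.language_model import DistilBert

distilbert = DistilBert.load("distilbert-base-german-cased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-german-cased")

batch = tokenizer(["A short example sentence."], return_tensors="pt")
hidden_states = distilbert.model(**batch)[0]      # (batch, seq_len, hidden_dim)
pooled_output = distilbert.pooler(hidden_states)  # (batch, hidden_dim), summary of the first token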
def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig(hidden_size=768,
                            num_attention_heads=12,
                            intermediate_size=3072)
    self.torch_model = AlbertModel(self.cfg)
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)
    self.torch_model.eval()
    self.hidden_size = self.cfg.hidden_size

    self.turbo_model = turbo_transformers.AlbertModel.from_torch(self.torch_model)
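# Hedged benchmark sketch built on the fixture above, which prepares the two
# models but no inputs. The random input_ids, the iteration count, and the
# assumption that the turbo model is callable like the torch one are all
# illustrative; batch_size and seq_length are expected to be module-level
# constants as in the other fixtures, and `import time` is assumed.
def benchmark(self, use_cuda: bool, num_iter: int = 50):
    self.init_data(use_cuda)
    input_ids = torch.randint(low=0,
                              high=self.cfg.vocab_size - 1,
                              size=(batch_size, seq_length),
                              dtype=torch.long,
                              device=self.test_device)
    results = {}
    for name, model in (("torch", self.torch_model), ("turbo", self.turbo_model)):
        if use_cuda:
            torch.cuda.synchronize()
        start = time.time()
        for _ in range(num_iter):
            model(input_ids)
        if use_cuda:
            torch.cuda.synchronize()
        results[name] = (time.time() - start) / num_iter  # average seconds per forward pass
    return results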
def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = AlbertConfig()
    self.torch_model = AlbertModel(self.cfg)
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)
    self.torch_model.eval()
    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.randint(low=0,
                                      high=self.cfg.vocab_size - 1,
                                      size=(batch_size, seq_length),
                                      device=self.test_device)

    self.turbo_model = turbo_transformers.AlbertModel.from_torch(self.torch_model)
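# A hedged end-to-end check using the fixture above. Indexing the outputs with
# [0] (the sequence output) follows the transformers tuple convention, and the
# turbo model is assumed to return a comparable structure; the method name and
# tolerance are illustrative only.
def check_torch_and_turbo(self, use_cuda: bool):
    self.init_data(use_cuda)
    torch_result = self.torch_model(self.input_tensor)
    turbo_result = self.turbo_model(self.input_tensor)
    # Compare the full sequence outputs of the two implementations.
    self.assertTrue(
        torch.allclose(torch_result[0], turbo_result[0], atol=1e-3, rtol=1e-3))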
from utils.evaluate import evaluate
from torch.optim import lr_scheduler
from torch import nn

if __name__ == "__main__":
    train_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/train.txt"
    dev_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/dev.txt"
    test_data_path = "/home/longred/BertForSentenceSimilarity/dataset/LCQMC/test.txt"
    vocab_path = "/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/vocab.txt"

    train_data_loader = LCQMCDataLoader(
        train_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)
    dev_data_loader = LCQMCDataLoader(
        dev_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)
    test_data_loader = LCQMCDataLoader(
        test_data_path, vocab_path, batch_size=1024, is_pair=True, length=80)

    config = AlbertConfig.from_pretrained(
        "/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/config.json")
    config.num_labels = 1
    # config.hidden_size = 128
    config.dropout = 0.5

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    net = AlbertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path="/home/longred/BertForSentenceSimilarity/prev_trained_model/albert_tiny_zh/pytorch_model.bin",
        config=config).to(device)

    # %%
    learning_rate = 5e-4
    no_decay = ["bias", "LayerNorm.weight"]
    bert_param_optimizer = list(net.bert.named_parameters())
    linear_param_optimizer = list(net.classifier.named_parameters())
    optimizer_grouped_parameters = [