def __init__(self, config):
    super().__init__(config)
    self.prev_pred_embeddings = PrevPredEmbeddings(config)
    self.encoder = BertEncoder(config)
    # self.apply(self.init_weights)  # old versions of pytorch_transformers
    self.init_weights()
def __init__(self, config):
    super(BertModelDialog, self).__init__(config)
    self.embeddings = BertEmbeddingsDialog(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.init_weights()
def __init__(self, config):
    super(SelfAttn, self).__init__()
    self.config = config
    self.hsize = 64
    self.atom_emb = nn.Embedding(5, 64)
    self.type_emb = nn.Embedding(15, 64)
    self.pos_emb = nn.Linear(3, 256, bias=False)
    self.dist_emb = nn.Linear(1, 64, bias=False)
    self.mu_emb = nn.Linear(1, 32, bias=False)  # dipole_moment
    self.attn = BertEncoder(config)

    def get_reg_layer(output_size):
        return nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size),
            nn.LeakyReLU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, output_size),
        )

    self.reg_layers4 = nn.ModuleList([get_reg_layer(4) for _ in range(9)])
    self.reg_layers1 = nn.ModuleList([get_reg_layer(1) for _ in range(9)])
    # not currently used.
    self.reg_aux = None
def __init__(self, config, depth=None):
    super(CustomBertModel, self).__init__(config)
    self.depth = depth
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.cls = BertPreTrainingHeads(config)
    self.apply(self.init_weights)
def __init__(self, config):
    super(PatientLevelBert, self).__init__()
    self.config = config
    self.embeddings = PatientLevelEmbedding(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.apply(self.init_weights)
def __init__(self, config):
    super(BertModel, self).__init__(config)
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.apply(self.init_weights)
def __init__(self, config, gat_config):
    super().__init__(config)
    self.prev_pred_embeddings = PrevPredEmbeddings(config)
    # self.ggcn = QCGATLayers(config.hidden_size, gat_config.num_gat_layers)
    # self.ggcn = QVGATLayers(config.hidden_size, gat_config.num_gat_layers)
    # self.encoder = BertEncoder(config)
    # self.apply(self.init_weights)  # old versions of pytorch_transformers
    self.init_weights()
def __init__(self, args, adapter_config):
    super(Adapter, self).__init__()
    self.adapter_config = adapter_config
    self.args = args
    self.down_project = nn.Linear(
        self.adapter_config.project_hidden_size,
        self.adapter_config.adapter_size,
    )
    self.encoder = BertEncoder(self.adapter_config)
    self.up_project = nn.Linear(self.adapter_config.adapter_size, adapter_config.project_hidden_size)
    self.init_weights()
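# --- Hedged sketch (not from the source): one plausible forward pass for the
# --- Adapter above, kept in the same down-project / BertEncoder / up-project
# --- layout with a residual connection. The mask handling and the existence of
# --- `adapter_config.num_hidden_layers` are assumptions for illustration.
def forward(self, hidden_states, attention_mask=None):
    down_projected = self.down_project(hidden_states)

    if attention_mask is None:
        attention_mask = torch.ones(hidden_states.size(0), hidden_states.size(1),
                                    device=hidden_states.device)
    # BertEncoder takes an additive ("extended") mask:
    # 0.0 where attention is allowed, -10000.0 where it is masked.
    extended_attention_mask = attention_mask[:, None, None, :].to(dtype=down_projected.dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    head_mask = [None] * self.adapter_config.num_hidden_layers

    encoder_outputs = self.encoder(down_projected, extended_attention_mask, head_mask)
    up_projected = self.up_project(encoder_outputs[0])

    # residual connection around the adapter block
    return hidden_states + up_projected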
def __init__(self, config):
    super().__init__(config)
    self.prev_pred_embeddings = PrevPredEmbeddings(config)
    # self.ggcn = GatedGraphConvNet(768)     # 40.47 -- 40.76
    # self.ggcn = MultiHeadGraphAttNet(768)  # 39.86
    # self.ggcn = BaseGraphAttNet(768)       # 39.57
    self.ggcn = QuestionConditionedGAT(768, 0.15)  # 40.99
    self.encoder = BertEncoder(config)
    # self.apply(self.init_weights)  # old versions of pytorch_transformers
    self.init_weights()
def __init__(self, cfg):
    super(LSTMATTNModel, self).__init__()
    self.cfg = cfg
    cate_col_size = len(cfg.cate_cols)
    cont_col_size = len(cfg.cont_cols)
    self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0)
    self.cate_proj = nn.Sequential(
        nn.Linear(cfg.emb_size * cate_col_size, cfg.hidden_size // 2),
        nn.LayerNorm(cfg.hidden_size // 2),
    )
    self.cont_emb = nn.Sequential(
        nn.Linear(cont_col_size, cfg.hidden_size // 2),
        nn.LayerNorm(cfg.hidden_size // 2),
    )
    self.encoder = nn.LSTM(cfg.hidden_size, cfg.hidden_size, 1,
                           dropout=cfg.dropout, batch_first=True)
    self.config = BertConfig(
        3,  # not used
        hidden_size=cfg.hidden_size,
        num_hidden_layers=1,
        num_attention_heads=cfg.nheads,
        intermediate_size=cfg.hidden_size,
        hidden_dropout_prob=cfg.dropout,
        attention_probs_dropout_prob=cfg.dropout,
    )
    self.attn = BertEncoder(self.config)

    def get_reg():
        return nn.Sequential(
            nn.Linear(cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Dropout(cfg.dropout),
            nn.ReLU(),
            nn.Linear(cfg.hidden_size, cfg.target_size),
        )

    self.reg_layer = get_reg()
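# --- Hedged sketch (not from the source): how LSTMATTNModel's forward might wire
# --- the pieces together: embed features, run the LSTM, then refine with the
# --- single-layer BertEncoder. Input names (`cate_x`, `cont_x`, `mask`) and
# --- predicting from the last time step are illustrative assumptions.
def forward(self, cate_x, cont_x, mask):
    batch_size, seq_len = cate_x.size(0), cate_x.size(1)

    # embed categorical columns, project both feature groups to hidden_size/2, then concatenate
    cate_emb = self.cate_proj(self.cate_emb(cate_x).view(batch_size, seq_len, -1))
    cont_emb = self.cont_emb(cont_x)
    seq_emb = torch.cat([cate_emb, cont_emb], dim=2)

    output, _ = self.encoder(seq_emb)

    # BertEncoder expects an additive attention mask and a per-layer head mask
    extended_mask = (1.0 - mask[:, None, None, :].to(dtype=output.dtype)) * -10000.0
    head_mask = [None] * self.config.num_hidden_layers
    encoded = self.attn(output, extended_mask, head_mask)[0]

    # predict from the final position in the sequence
    return self.reg_layer(encoded[:, -1, :])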
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        # from pytorch_transformers import RobertaModel
        from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.roberta_model.startswith('roberta'):
            self.roberta = RobertaCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(args.roberta_model)
            self.tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
        else:
            self.roberta = BertCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE / 'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(args.roberta_model)
            self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)
        self.config.output_attentions = True
        self.roberta.pooler.dense.weight.requires_grad = False
        self.roberta.pooler.dense.bias.requires_grad = False

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        # self.embed_tokens = embed_tokens
        # self.embed_scale = math.sqrt(embed_dim)
        self.args = args

        # if args.sentence_transformer_arch == 'fairseq':
        #     self.padding_idx = embed_tokens.padding_idx
        #     self.sent_embed_positions = PositionalEmbedding(
        #         1024, embed_dim, self.padding_idx,
        #         left_pad=False,
        #         learned=args.encoder_learned_pos,
        #     )
        #     self.doc_layers = nn.ModuleList([])
        #     self.doc_layers.extend([
        #         TransformerEncoderLayer(args)
        #         for i in range(args.encoder_layers)
        #     ])

        if args.sentence_transformer_arch == 'bert':
            # from pytorch_transformers import RobertaConfig, RobertaTokenizer
            # self.config = RobertaConfig.from_pretrained(args.roberta_model)
            # self.config.output_attentions = True
            # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            embed_dim = self.config.hidden_size
            print('*** padding idx before ***', embed_tokens.padding_idx)
            self.padding_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
            print('*** padding idx after ***', self.padding_idx)
            # let's assume each document has at most 128-self.padding_idx-1 sentences
            # in case of roberta, it is 126
            self.sent_position_embeddings = nn.Embedding(128, embed_dim)
            if args.encoder_layers:
                self.config.num_hidden_layers = args.encoder_layers
            if args.dropout:
                self.config.hidden_dropout_prob = args.dropout
            if args.attention_dropout:
                self.config.attention_probs_dropout_prob = args.attention_dropout
            if args.attn_type == 'attn_score':
                self.sent_encoder = AttnScoreBertEncoder(self.config)
            elif args.attn_type == 'attn_prob':
                self.sent_encoder = BertEncoder(self.config)
            else:
                raise Exception('--attn-type doesn\'t support {} yet !'.format(args.attn_type))
            self.sent_encoder.apply(self._init_weights)
            print('*** sentence encoder config ***')
            print(self.config)
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    args.sentence_transformer_arch))

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, src_tokens, segment_ids, doc_pad_mask, doc_pos_tok, cls_pos, attention_mask=None):
        # if self.args.sentence_transformer_arch == 'fairseq':
        #     bsz, seqlen = src_tokens.size()
        #     # compute padding mask
        #     attention_mask = src_tokens.ne(self.padding_idx)
        #     # enc_hids, _ = self.bert(src_tokens, segment_ids, attention_mask, output_all_encoded_layers=False)
        #     all_hids = self.roberta(src_tokens, segment_ids, attention_mask)
        #     # print('all_hids', all_hids.size())
        #     enc_hids = all_hids[0]
        #     doc_pos = self.sent_embed_positions(doc_pos_tok)
        #     sent_repr = get_sent_end_repr(enc_hids, cls_pos)
        #     sent_repr = sent_repr + doc_pos
        #     # n_sent x bsz x C
        #     sent_repr = sent_repr.transpose(0, 1)
        #     for doc_layer in self.doc_layers:
        #         sent_repr = doc_layer(sent_repr, doc_pad_mask)
        #     return {
        #         'encoder_out': sent_repr,  # n_sent x bsz x C
        #         'encoder_padding_mask': doc_pad_mask,  # bsz x n_sent
        #     }
        if self.args.sentence_transformer_arch == 'bert':
            bsz, seqlen = src_tokens.size()
            doclen = cls_pos.size(1)
            position_ids = torch.arange(1 + self.padding_idx, doclen + 1 + self.padding_idx,
                                        dtype=torch.long, device=cls_pos.device)
            position_ids = position_ids.unsqueeze(0).expand_as(cls_pos)
            doc_pos = self.sent_position_embeddings(position_ids)
            # compute padding mask
            if attention_mask is None:
                attention_mask = src_tokens.ne(self.padding_idx)
            # seq_len = src_tokens.shape[1]
            # while seq_len >= self.roberta.embeddings.position_embeddings.weight.shape[0] - self.roberta.embeddings.padding_idx:
            #     old_num_pos = self.roberta.embeddings.position_embeddings.weight.shape[0]
            #     print('| WARNING: longer than {}, expand the position embedding to {}'.format(old_num_pos, old_num_pos+512))
            #     num_pos = old_num_pos + 512
            #     embed_dim = self.roberta.embeddings.position_embeddings.weight.shape[1]
            #     new_embeddings = torch.nn.Embedding(num_pos, embed_dim)
            #     new_embeddings.to(self.roberta.embeddings.position_embeddings.weight.device)
            #     new_embeddings.to(self.roberta.embeddings.position_embeddings.weight.dtype)
            #     new_embeddings.weight.data[:old_num_pos, :] = self.roberta.embeddings.position_embeddings.weight.data[:old_num_pos, :]
            #     new_embeddings.weight.data[old_num_pos:, :] = self.roberta.embeddings.position_embeddings.weight.data[-512:, :]
            #     self.roberta.embeddings.position_embeddings = new_embeddings
            # self.roberta.embeddings.position_embeddings.weight.fill_(0)
            all_hids = self.roberta(src_tokens, segment_ids, attention_mask)
            enc_hids = all_hids[0]
            sent_repr = get_sent_end_repr(enc_hids, cls_pos)
            sent_repr = sent_repr + doc_pos

            head_mask = [None] * self.config.num_hidden_layers
            extended_doc_mask = doc_pad_mask.unsqueeze(1).unsqueeze(2)
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            extended_doc_mask = extended_doc_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
            extended_doc_mask = extended_doc_mask * -10000.0

            all_hids_doc = self.sent_encoder(sent_repr, extended_doc_mask, head_mask)
            sent_repr_given_doc = all_hids_doc[0]
            attn_weights = all_hids_doc[1]

            return {
                'encoder_out': sent_repr_given_doc,  # bsz x n_sent x C
                'attn_weights': attn_weights,
                'encoder_doc_mask': doc_pad_mask,  # bsz x n_sent
            }
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    self.args.sentence_transformer_arch))

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict['encoder_out'] is not None:
            encoder_out_dict['encoder_out'] = \
                encoder_out_dict['encoder_out'].index_select(1, new_order)
        if encoder_out_dict['encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        # return self.embed_positions.max_positions()
        return 10240

    def upgrade_state_dict(self, state_dict):
        '''
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            if 'encoder.embed_positions.weights' in state_dict:
                del state_dict['encoder.embed_positions.weights']
            if 'encoder.embed_positions._float_tensor' not in state_dict:
                state_dict['encoder.embed_positions._float_tensor'] = torch.FloatTensor()
        '''
        return state_dict
def __init__(self, config):
    super().__init__(config)
    self.encoder = BertEncoder(config)
    # self.apply(self.init_weights)  # old versions of pytorch_transformers
    self.init_weights()
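# --- Hedged, self-contained sketch (not from any of the snippets above): driving a
# --- standalone BertEncoder directly, assuming the pytorch_transformers-era API
# --- (BertConfig with a positional vocab size; BertEncoder called as
# --- encoder(hidden_states, attention_mask, head_mask)). Sizes are arbitrary.
import torch
from pytorch_transformers.modeling_bert import BertConfig, BertEncoder

config = BertConfig(
    30522,  # vocab size (unused when calling BertEncoder directly)
    hidden_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=512,
)
encoder = BertEncoder(config)

batch, seq_len = 2, 16
hidden_states = torch.randn(batch, seq_len, config.hidden_size)
mask = torch.ones(batch, seq_len)

# additive mask: 0.0 for tokens to attend to, -10000.0 for masked/padded tokens
extended_mask = (1.0 - mask[:, None, None, :]) * -10000.0
head_mask = [None] * config.num_hidden_layers

outputs = encoder(hidden_states, extended_mask, head_mask)
print(outputs[0].shape)  # torch.Size([2, 16, 256])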