    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = BertConfig(
            vocab_size_or_config_json_file=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range)

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
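For reference, the ids_tensor helper used above is not shown here; a minimal stand-in (an assumption, not the original test utility) that only draws random token ids could look like this:

import torch

def ids_tensor(shape, vocab_size, rng=None, name=None):
    # Hypothetical stand-in: a random LongTensor of the given shape with
    # values drawn uniformly from [0, vocab_size).
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)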
Example #2
    def __init__(self, args, device, checkpoint):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args, args.temp_dir, args.finetune_bert)

        self.ext_layer = ExtTransformerEncoder(self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
                                               args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.ext_hidden_size,
                                     num_hidden_layers=args.ext_layers, num_attention_heads=args.ext_heads, intermediate_size=args.ext_ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

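        # When max_pos exceeds BERT's 512 positions, grow the position-embedding table:
        # keep the 512 learned rows and repeat the last row for the extra positions.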
        if(args.max_pos>512):
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings


        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)
Example #3
    def __init__(self):
        super().__init__()
        config = BertConfig()
        config.output_hidden_states = True
        self.bert = BertModel.from_pretrained('bert-base-uncased',
                                              config=config)
        self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
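A hedged usage sketch for the encoder above (assuming the pytorch_transformers API, where output_hidden_states=True adds a tuple of per-layer states to the model outputs):

import torch
from pytorch_transformers import BertConfig, BertModel, BertTokenizer

config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', config=config)
model.eval()

input_ids = torch.tensor([tokenizer.encode("BERT returns one hidden state per layer.")])
with torch.no_grad():
    sequence_output, pooled_output, hidden_states = model(input_ids)
# hidden_states is a tuple: the embedding output plus one tensor per encoder layer.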
Example #4
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 random_init=False,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0

        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings)
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must " +
                "be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config

        if random_init:
            self.apply(
                lambda module: transformer_weights_init(module, xavier=False))
Example #5
    def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
            self.bert.model.load_state_dict(
                dict([(n[11:], p) for n, p in bert_from_extractive.items() if n.startswith('bert.model')]), strict=True)

        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.enc_hidden_size,
                                     num_hidden_layers=args.enc_layers, num_attention_heads=8,
                                     intermediate_size=args.enc_ff_size,
                                     hidden_dropout_prob=args.enc_dropout,
                                     attention_probs_dropout_prob=args.enc_dropout)
            self.bert.model = BertModel(bert_config)

        if(args.max_pos>512):
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
        if (self.args.share_emb):
            tgt_embeddings = self.bert.model.embeddings.word_embeddings

        self.decoder = TransformerDecoder(
            self.args.dec_layers,
            self.args.dec_hidden_size, heads=self.args.dec_heads,
            d_ff=self.args.dec_ff_size, dropout=self.args.dec_dropout, embeddings=tgt_embeddings)

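        # Tie the generator's output projection to the decoder's input embeddings (weight sharing).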
        self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight


        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if(args.use_bert_emb):
                tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
Example #6
    def __init__(self,
                 num_labels=2,
                 model_type='bert-base-uncased',
                 token_layer='token-cls',
                 output_logits=True):
        super(BertForWSD, self).__init__()

        self.config = BertConfig()
        self.token_layer = token_layer
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(model_type)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.output_logits = output_logits

        # Define which token selection layer to use
        if token_layer == 'token-cls':
            self.tokenselectlayer = TokenClsLayer()
        elif token_layer in ['sent-cls', 'sent-cls-ws']:
            self.tokenselectlayer = SentClsLayer()
        else:
            raise ValueError(
                "Unidentified parameter for token selection layer")

        self.classifier = nn.Linear(768, num_labels)
        if not output_logits:
            self.softmax = nn.Softmax(dim=1)  # to be checked!!!

        nn.init.xavier_normal_(self.classifier.weight)
Example #7
    def __init__(self, args, device, checkpoint):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        self.ext_layer = ExtTransformerEncoder(
            self.bert.model.config.hidden_size, args.ext_ff_size,
            args.ext_heads, args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size,
                                     hidden_size=args.hidden_size,
                                     num_hidden_layers=6,
                                     num_attention_heads=8,
                                     intermediate_size=args.ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)
Example #8
def load_model():
    model_dir = '../../model/model/'
    config = BertConfig.from_pretrained('../../model/bert-cased/',
                                        num_labels=3, output_attentions=True)
    model = BertAttn(config,
                     option='feed',
                     dropout=0.1,
                     gpu=False,
                     seed=0,
                     do_lower_case=False)
    class_weights = [0.6058, 0.1161, 0.2781]
    model.set_focal_loss(alpha=class_weights, gamma=-1)
    model.load_model(True, model_dir)
    return model
Example #9
    def init_embedding(self, np_file, embedding_type="glove"):
        if embedding_type == "bert" and SUPPORT_BERT_EMBEDDING:
            bert = Bert(False, '/temp/', False)
            bert.model.config.vocab_size = self.ntoken
            bert_config = BertConfig(bert.model.config.vocab_size,
                                     hidden_size=self.emb_dim,
                                     num_hidden_layers=2,
                                     num_attention_heads=8,
                                     intermediate_size=512,
                                     hidden_dropout_prob=self.dropout,
                                     attention_probs_dropout_prob=self.dropout)
            bert.model = BertModel(bert_config)
            weight_init = copy.deepcopy(bert.model.embeddings.word_embeddings.weight)
            assert weight_init.shape == (self.ntoken, self.emb_dim)
            self.emb.weight.data[:self.ntoken] = weight_init
        else:
            weight_init = torch.from_numpy(np.load(np_file))
            assert weight_init.shape == (self.ntoken, self.emb_dim)
            self.emb.weight.data[:self.ntoken] = weight_init
Example #10
    def init(self):
        bert_config = BertConfig(self.args.output_config_file)
        if os.path.exists(self.args.output_model_file):
            if self.args.model_name == 'BertCNNPlus':
                bert_config.filter_num = self.args.filter_num
                bert_config.filter_sizes = [int(val) for val in self.args.filter_sizes.split()]
            elif self.args.model_name == 'BertRCNN':
                bert_config.rnn_hidden_size = self.args.rnn_hidden_size
                bert_config.num_layers = self.args.num_layers
                bert_config.bidirectional = self.args.bidirectional
                bert_config.dropout = self.args.dropout
            else:
                pass

            self.model = Net(config=bert_config)
            self.model.load_state_dict(torch.load(self.args.output_model_file))
            self.model.to(DEVICE)

        self.tokenizer = BertTokenizer.from_pretrained(self.args.bert_model_dir,
                                                       do_lower_case=self.args.do_lower_case)
Example #11
    def __init__(self):
        super().__init__()
        config = BertConfig()
        config.output_hidden_states = True
        self.bert = BertModel.from_pretrained('bert-base-uncased',
                                              config=config)
        self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.ninput = 768
        self.nlayers = 1
        self.bidirectional = True
        self.num_directions = 2 if self.bidirectional else 1
        self.nhidden = 256 // self.num_directions
        self.drop_prob = 0.5
        self.rnn = nn.LSTM(self.ninput,
                           self.nhidden,
                           self.nlayers,
                           batch_first=True,
                           dropout=self.drop_prob,
                           bidirectional=self.bidirectional)
Example #12
    def __init__(self, path_to_csv="tractatus_with_splits.csv", pretrained_name="bert-base-cased", n_layers=4, start_split="train"):
        """

        :type n_layers: int
        """
        self.n_layers = n_layers
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
        our_config = BertConfig(vocab_size_or_config_json_file=28996, output_hidden_states=True)
        self.model = BertModel.from_pretrained(pretrained_name, config=our_config)
        self.model.eval()

        self.all_df = pd.read_csv(path_to_csv)

        self.test_df = self.all_df[self.all_df.split == "test"]
        self.train_df = self.all_df[self.all_df.split == "train"]
        self.val_df = self.all_df[self.all_df.split == "validation"]

        self._lookup_dict = {"train": self.train_df,
                             "val": self.val_df,
                             "test": self.test_df}

        self.set_split(start_split)
Example #13
File: plot.py  Project: zheyuye/TwitterSA
    ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))

    ax.set_xticklabels([''] + list(df.columns))
    ax.set_yticklabels([''] + list(df.index))
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)
    #     tick.label.set_fontsize(14)
    # for tick in ax.get_yticklabels():
    #     tick.label.set_fontsize(14)
    ax.tick_params(axis='both', which='major', labelsize=15)
    plt.savefig(f"./figures/{step}_{sentence}.png")
    # plt.show()


model_dir = './B96_lr1e-06_s1.0_0903_1905/'
# PRETRAINED_WEIGHTS = "bert-base-cased"
config = BertConfig.from_pretrained('bert-base-cased', num_labels=3, output_attentions=True)
model = BertAttn(config,
                 option='emoji',
                 dropout=0.1,
                 gpu=False,
                 seed=0,
                 do_lower_case=False)
# model.set_focal_loss(alpha=class_weights,gamma=-1)
model.load_model(True, model_dir)
# model.bert.save_pretrained('./bert-cased/')

class_weights, train, dev, test = get_data(option='emoji',
                                           dataset_size=1,
                                           unbalanced=False)
Example #14
def main():    
    parser = argparse.ArgumentParser("")
    parser.add_argument("--model", type=str, default='')    
    parser.add_argument("--resume", action='store_true')
    parser.add_argument("--eval", action='store_true')
    parser.add_argument("--batch_size", type=int, default=CFG.batch_size)
    parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs)    
    parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps)
    parser.add_argument("--nlayers", type=int, default=CFG.num_hidden_layers)
    parser.add_argument("--nahs", type=int, default=CFG.num_attention_heads)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--lr", type=float, default=CFG.learning_rate)
    parser.add_argument("--dropout", type=float, default=CFG.dropout)
    parser.add_argument("--types", nargs='+', type=str, 
                        default=['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'], 
                        help='3JHC,2JHC,1JHC,3JHH,2JHH,3JHN,2JHN,1JHN')
    parser.add_argument("--train_file", default="train_mute_cp")
    parser.add_argument("--test_file", default="test_mute_cp")
    parser.add_argument("--pseudo_path", default="")
    parser.add_argument("--pseudo", action='store_true')
    parser.add_argument("--gen_pseudo", action='store_true')
    parser.add_argument("--use_all", action='store_true')
    parser.add_argument("--structure_file", default="structures_mu")
    parser.add_argument("--contribution_file", default="scalar_coupling_contributions")        
    args = parser.parse_args()
    print(args) 
    
    CFG.batch_size=args.batch_size
    CFG.num_train_epochs=args.nepochs
    CFG.warmup_steps=args.wsteps
    CFG.num_hidden_layers=args.nlayers
    CFG.num_attention_heads=args.nahs
    CFG.learning_rate=args.lr
    CFG.dropout=args.dropout
    CFG.seed =  args.seed
    print(CFG.__dict__)
    
    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    
    #if not args.eval:    
    if True:
        train_df = load_csv(args.train_file)
        
        structures_df = load_csv(args.structure_file)  
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')        
        
        contributions_df = load_csv(args.contribution_file)
        train_df = train_df.merge(contributions_df, how='left')   
        train_df = normalize_cols(train_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])        
        train_df = add_extra_features(train_df, structures_df)
        train_df = train_df.fillna(1e08)
        n_mols = train_df['molecule_name'].nunique()
        train_df, valid_df = train_test_split(train_df, 5000 )
        
        # only molecules with the args.types
        print(train_df['molecule_name'].nunique())
        mol_names_with_at = train_df[train_df['type'].isin(args.types)]['molecule_name'].unique()
        train_df = train_df[train_df['molecule_name'].isin(mol_names_with_at)].reset_index(drop=True)
        print(train_df['molecule_name'].nunique())
        
        # Print the 5 rows of valid_df to verify whether the valid_df is the same as the previous experiment.
        print(valid_df.head(5))
        
        if args.pseudo:        
            test_df = load_csv(args.test_file)
            logger.info(f'loading dataset - {args.pseudo_path} ...')
            test_pseudo_df = pd.read_csv(args.pseudo_path)
            #mol_names_jhn = train_df[test_df['type'].isin(['1JHN', '2JHN', '3JHN'])]['molecule_name'].unique()
            #test_df = test_df[test_df['molecule_name'].isin(mol_names_jhn)].reset_index(drop=True)        
            test_df = add_extra_features(test_df, structures_df)
            test_df = test_df.set_index('id')
            test_pseudo_df = test_pseudo_df.set_index('id')
            test_df[['scalar_coupling_constant',  'fc', 'sd', 'pso', 'dso']] = test_pseudo_df[['scalar_coupling_constant',  'fc', 'sd', 'pso', 'dso']]
            test_df = test_df.reset_index()            
            #test_df = normalize_target(test_df)
            test_df = normalize_cols(test_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            #test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0
            test_df['weight'] = 1.0
            n_mols = test_df['molecule_name'].nunique()            
            train_df = train_df.append(test_df).reset_index(drop=True)
        else:
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0
        
        if args.use_all:
            train_df = train_df.append(valid_df) 
        
        print(f' n_train:{len(train_df)}, n_valid:{len(valid_df)}')
    
    config = BertConfig(            
            3, # not used
            hidden_size=CFG.hidden_size,
            num_hidden_layers=CFG.num_hidden_layers,
            num_attention_heads=CFG.num_attention_heads,
            intermediate_size=CFG.intermediate_size,
            hidden_dropout_prob=CFG.dropout,
            attention_probs_dropout_prob=CFG.dropout,
        )    
    model = cust_model.SelfAttn(config)
    if args.model != "":
        print("=> loading checkpoint '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        CFG.start_epoch = checkpoint['epoch']        
        model.load_state_dict(checkpoint['state_dict'])        
        print("=> loaded checkpoint '{}' (epoch {})"
              .format(args.model, checkpoint['epoch']))
    model.cuda()
    
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('parameters: ', count_parameters(model))
    
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    
    # to produce the submission.csv
    if args.eval:
        test_df = load_csv(args.test_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')        
        test_df = add_extra_features(test_df, structures_df)
        test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08) 
        test_df['scalar_coupling_constant'] = 0
        test_df['weight'] = 1.0
        test_db = db.MolDB(test_df, CFG.max_seq_length)
        test_loader = DataLoader(
            test_db, batch_size=CFG.batch_size, shuffle=False,
            num_workers=CFG.num_workers)
        res_df = validate(test_loader, model, args.types)        
        res_df = unnormalize_cols(res_df, cols=['fc', 'sd', 'pso', 'dso'])
        res_df = unnormalize_target(res_df, 'prediction1')
        if args.gen_pseudo:
            res_df['scalar_coupling_constant'] = res_df['prediction1']
            res_df = res_df[res_df['id']>-1].sort_values('id')
            res_df[['id', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']].to_csv(f'pseudo_{CFG.seed}.csv', index=False)
            return
        res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
        res_df['prediction']= res_df[['prediction1','prediction4']].mean(1)        
        res_df['scalar_coupling_constant'] = res_df['prediction']
        res_df = res_df[res_df['id']>-1].sort_values('id')
        os.makedirs('output', exist_ok=True)
        res_df[['id', 'scalar_coupling_constant']].to_csv(f'output/submission_{CFG.seed}.csv', index=False)        
        return
    
    train_db = db.MolDB(train_df, CFG.max_seq_length)    
    print('preloading dataset ...')
    train_db = db.MolDB_FromDB(train_db, 10)    
    valid_db = db.MolDB(valid_df, CFG.max_seq_length)    
    num_train_optimization_steps = int(
        len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (CFG.num_train_epochs-CFG.start_epoch)
    print('num_train_optimization_steps', num_train_optimization_steps)      

    train_loader = DataLoader(
        train_db, batch_size=CFG.batch_size, shuffle=True,
        num_workers=CFG.num_workers, pin_memory=True)
    val_loader = DataLoader(
        valid_db, batch_size=CFG.batch_size, shuffle=False,
        num_workers=CFG.num_workers)
    
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
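    # Apply weight decay to all parameters except biases and LayerNorm weights.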
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    
    optimizer = AdamW(optimizer_grouped_parameters,
                           lr=CFG.learning_rate,
                           weight_decay=CFG.weight_decay,                           
                           )
    scheduler = WarmupLinearSchedule(optimizer, CFG.warmup_steps,
                                        t_total=num_train_optimization_steps
                                     )
    
    def get_lr():
        return scheduler.get_lr()[0]
    
    if args.model != "":
        if args.resume:
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
        #for param_group in optimizer.param_groups:
        #    param_group['lr'] = CFG.learning_rate
        mae_log_df = checkpoint['mae_log']
        del checkpoint
    else:
        mae_log_df = pd.DataFrame(columns=(['EPOCH']+['LR']+args.types + ['OVERALL']) )     
    os.makedirs('log', exist_ok=True)
    
    
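    # Initial validation pass before any training, to record the starting MAE.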
    res_df = validate(val_loader, model, args.types)        
    res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
    res_df = unnormalize_target(res_df, 'prediction1')            
    res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
    res_df['prediction']= res_df[['prediction1','prediction4']].mean(1)
    res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
    overall_mae, maes = metric(res_df, args.types)
    print(overall_mae, maes)    
    
    
    curr_lr = get_lr()
    print(f'initial learning rate:{curr_lr}')
    for epoch in range(CFG.start_epoch, CFG.num_train_epochs):
        # train for one epoch
                
        #print(adjust_learning_rate(optimizer, epoch))    
        train(train_loader, model, optimizer, epoch, args.types, scheduler)
       
        if epoch % CFG.test_freq == 0:
            res_df = validate(val_loader, model, args.types)        
            res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            res_df = unnormalize_target(res_df, 'prediction1')            
            res_df['prediction4']= res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
            res_df['prediction']= res_df[['prediction1','prediction4']].mean(1)
            res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
            overall_mae, maes = metric(res_df, args.types)
            
            # write log file
            mae_row = dict([(typ, [mae]) for typ, mae in maes.items() if typ in args.types])
            mae_row.update({'EPOCH':(epoch),'OVERALL':overall_mae, 'LR':curr_lr})
            mae_log_df = mae_log_df.append(pd.DataFrame(mae_row), sort=False)
            print(mae_log_df.tail(20))        
            mae_log_df.to_csv(f'log/{"_".join(args.types)}.csv', index=False)
            
            #scheduler.step(overall_mae)
            curr_lr = get_lr()
            print(f'set the learning_rate: {curr_lr}')
            
            # evaluate on validation set
            batch_size = CFG.batch_size            
            pseudo_path = '' if not args.pseudo else '_' + args.pseudo_path 
            curr_model_name = (f'b{batch_size}_l{config.num_hidden_layers}_'
                               f'mh{config.num_attention_heads}_h{config.hidden_size}_'
                               f'd{CFG.dropout}_'
                               f'ep{epoch}_{"_".join(args.types)}_s{CFG.seed}{pseudo_path}.pt')
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the cust_model it-self    
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'transformer',
                'state_dict': model_to_save.state_dict(),
                'mae_log': mae_log_df,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                },
                FINETUNED_MODEL_PATH, curr_model_name
            )                                                
                                         
    print('done')
Example #15
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path,
                                          pytorch_dump_folder_path,
                                          classification_head):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    config = BertConfig(
        vocab_size_or_config_json_file=50265,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(
        config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[
            i]

        ### self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.in_proj_weight.shape == torch.Size(
            (3 * config.hidden_size, config.hidden_size)))
        # we use three distinct linear layers so we split the source layer here.
        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2 * config.hidden_size, :]
        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2 * config.hidden_size]
        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2 * config.hidden_size:, :]
        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2 * config.hidden_size:]

        ### self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert (self_output.dense.weight.shape ==
                roberta_layer.self_attn.out_proj.weight.shape)
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        ### intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert (
            intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape)
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        ### output
        bert_output: BertOutput = layer.output
        assert (
            bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape)
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        #### end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads[
            'mnli'].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads[
            'mnli'].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads[
            'mnli'].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads[
            'mnli'].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(
        0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads['mnli'](
            roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?",
          "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
Example #16
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True


# configuration for the model
from pytorch_transformers import BertConfig

config = BertConfig(vocab_size_or_config_json_file=32000,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)

num_labels = 2

# creating the model
model = BertForSequenceClassification(num_labels)

# # # Testing model # # #
# Converting inputs to PyTorch tensors
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(text_tokenized)])

logits = model(tokens_tensor)
# viewing the logits
print(logits)
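A plausible way to use the freeze/unfreeze helpers defined above (a sketch, assuming the custom classifier exposes them as shown):

# Phase 1: keep the BERT encoder frozen and train only the classification head.
model.freeze_bert_encoder()
# ... run a few warm-up epochs here ...

# Phase 2: unfreeze the encoder and fine-tune the whole model end to end.
model.unfreeze_bert_encoder()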
Example #17
def main():
    parser = argparse.ArgumentParser(description="Training")

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    # parser.add_argument("--data_file", type=str, required=True,
    #                     help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    # parser.add_argument("--token_counts", type=str, required=True,
    #                     help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite output_dir if it already exists.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--output_dir", default=None, type=str, required=True,
    #                     help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--vocab_size", default=30522, type=int,
                        help="The vocabulary size.")
    parser.add_argument("--max_position_embeddings", default=512, type=int,
                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float,
                        help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")

    # parser.add_argument("--from_pretrained_weights", default=None, type=str,
    #                     help="Load student initialization checkpoint.")
    # parser.add_argument("--from_pretrained_config", default=None, type=str,
    #                     help="Load student initialization architecture config.")
    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
                        help="The teacher model.")

    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=1.0, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
    # parser.add_argument("--alpha_mlm", default=0.5, type=float,
    #                     help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    # parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
    #                     help="Proportion of tokens for which we need to make a prediction.")
    # parser.add_argument("--word_mask", default=0.8, type=float,
    #                     help="Proportion of tokens to mask out.")
    # parser.add_argument("--word_keep", default=0.1, type=float,
    #                     help="Proportion of tokens to keep.")
    # parser.add_argument("--word_rand", default=0.1, type=float,
    #                     help="Proportion of tokens to randomly replace.")
    # parser.add_argument("--mlm_smoothing", default=0.7, type=float,
    #                     help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    # parser.add_argument("--restrict_ce_to_mask", action='store_true',
    #                     help="If true, compute the distilation loss only the [MLM] prediction distribution.")

    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    # parser.add_argument("--tokens_per_batch", type=int, default=-1,
    #                     help="If specified, modify the batches so that they have approximately this number of tokens.")
    # parser.add_argument("--shuffle", action='store_false',
    #                     help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")

    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
                        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_prop", default=0.05, type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range", default=0.02, type=float,
                        help="Random initialization range.")

    # parser.add_argument('--fp16', action='store_true',
    #                     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    # parser.add_argument('--fp16_opt_level', type=str, default='O1',
    #                     help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    #                          "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56,
                        help="Random seed")

    parser.add_argument("--log_interval", type=int, default=10,
                        help="Tensorboard logging interval.")
    parser.add_argument('--log_examples', action='store_false',
                        help="Show input examples on the command line during evaluation. Enabled by default.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--checkpoint_interval", type=int, default=100,
                        help="Checkpoint interval.")
    parser.add_argument("--no_cuda", type=bool, default=False,
                        help="Avoid using CUDA when available")
    parser.add_argument("--toy_mode", action='store_true', help="Toy mode for development.")
    parser.add_argument("--rich_eval", action='store_true', help="Rich evaluation (more metrics + mistake reporting).")

    args = parser.parse_args()
    args.no_cuda = False
    print("NO CUDA", args.no_cuda)

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.output_dir):
            if not args.force:
                raise ValueError(f'Serialization dir {args.output_dir} already exists, but you have not specified whether to overwrite it. '
                                   'Use `--force` if you want to overwrite it.')
            else:
                shutil.rmtree(args.output_dir)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info(f'Experiment will be dumped and logged in {args.output_dir}')


        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.output_dir, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        # git_log(args.output_dir)
    print(args.local_rank)
    print(args.no_cuda)
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.local_rank = -1
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        device = torch.device("cuda", args.local_rank)
    args.device = device

    logger.info("DEVICE: {}".format(args.device))
    logger.info("N_GPU: {}".format(args.n_gpu))
    # exit(0)
    print(args.local_rank)

    ### TOKENIZER ###
    # if args.teacher_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(args.teacher_name, do_lower_case=args.do_lower_case)
    # elif args.teacher_type == 'roberta':
    #     tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.model_name_or_path = args.teacher_name
    args.max_seq_length = args.max_position_embeddings
    args.model_type = "bert"
    args.output_mode = output_modes[args.task_name]
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)

    ## DATA LOADER ##
    # logger.info(f'Loading data from {args.data_file}')
    # with open(args.data_file, 'rb') as fp:
    #     data = pickle.load(fp)


    # assert os.path.isfile(args.token_counts)
    # logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
    # with open(args.token_counts, 'rb') as fp:
    #     counts = pickle.load(fp)
    #     assert len(counts) == args.vocab_size
    # token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
    # for idx in special_tok_ids.values():
    #     token_probs[idx] = 0.  # do not predict special tokens
    # token_probs = torch.from_numpy(token_probs)


    # train_dataloader = Dataset(params=args, data=data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.batch_size)
    logger.info(f'Data loader created.')


    ## STUDENT ##
    # if args.from_pretrained_weights is not None:
    #     assert os.path.isfile(args.from_pretrained_weights)
    #     assert os.path.isfile(args.from_pretrained_config)
    #     logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
    #     logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
    #     stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
    #     stu_architecture_config.output_hidden_states = True
    #     student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
    #                                                     config=stu_architecture_config)
    # else:
    #     args.vocab_size_or_config_json_file = args.vocab_size
    #     stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
    #     student = DistilBertForMaskedLM(stu_architecture_config)
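    # Student: a freshly initialized BERT whose depth and width come from the command-line arguments.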
    student_config = BertConfig(
        vocab_size_or_config_json_file=args.vocab_size,
        hidden_size=args.dim,
        num_hidden_layers=args.n_layers,
        num_attention_heads=args.n_heads,
        intermediate_size=args.hidden_dim,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.attention_dropout,
        max_position_embeddings=args.max_position_embeddings,
        hidden_act=args.activation,
        initializer_range=0.02)
    student = BertForSequenceClassification(student_config)
    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info(f'Student loaded.')


    ## TEACHER ##
    teacher = BertForSequenceClassification.from_pretrained(args.teacher_name) # take outputs[1] for the logits
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          # token_probs=token_probs,
                          student=student,
                          teacher=teacher,
                          tokenizer=tokenizer)
    distiller.train()
    logger.info("Let's go get some drinks.")
Example #18
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import WarmupLinearSchedule
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm import tqdm, trange

from BertModules import BertClassifier
from Constants import *
from DataModules import SequenceDataset
from Utils import seed_everything

seed_everything()

# Load BERT default config object and make necessary changes as per requirement
config = BertConfig(hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072,
                    num_labels=2)

# Create our custom BERTClassifier model object
model = BertClassifier(config)
model.to(DEVICE)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load Train dataset and split it into Train and Validation dataset
train_dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)

validation_split = 0.2
dataset_size = len(train_dataset)
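The snippet stops after computing dataset_size; a hedged continuation (the batch size and shuffling are assumptions) would split the indices and feed the SubsetRandomSampler imported above:

import numpy as np

indices = list(range(dataset_size))
np.random.shuffle(indices)
split = int(np.floor(validation_split * dataset_size))
train_indices, val_indices = indices[split:], indices[:split]

train_loader = DataLoader(train_dataset, batch_size=32,
                          sampler=SubsetRandomSampler(train_indices))
valid_loader = DataLoader(train_dataset, batch_size=32,
                          sampler=SubsetRandomSampler(val_indices))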
Example #19
    def __init__(self,
                 args,
                 device,
                 checkpoint=None,
                 bert_from_extractive=None):
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir,
                         args.finetune_bert)  # false, ../temp, true
        # The input takes at most 512 tokens (excluding [CLS] and [SEP]), with at most two sentences merged
        # into one sequence; tokens beyond that limit get no embedding. The pooler encodes the [CLS] position.
        if bert_from_extractive is not None:
            self.bert.model.load_state_dict(
                dict([(n[11:], p) for n, p in bert_from_extractive.items()
                      if n.startswith('bert.model')]),
                strict=True)

        if (args.encoder == 'baseline'):  #default:bert
            bert_config = BertConfig(
                self.bert.model.config.vocab_size,
                hidden_size=args.enc_hidden_size,
                num_hidden_layers=args.enc_layers,
                num_attention_heads=8,
                intermediate_size=args.enc_ff_size,
                hidden_dropout_prob=args.enc_dropout,
                attention_probs_dropout_prob=args.enc_dropout)
            self.bert.model = BertModel(bert_config)

        if (args.max_pos > 512):  # max_pos never exceeds 512 here, so this branch is unused
            my_pos_embeddings = nn.Embedding(
                args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size  # vocab_size from bert.model's config: 21128
        tgt_embeddings = nn.Embedding(
            self.vocab_size, self.bert.model.config.hidden_size,
            padding_idx=0)  # hidden_size: 768 (as above); embedding table for the target summary
        if (self.args.share_emb):  #False
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)

        # The BERT model serves as the feature extractor (the encoder here); the Transformer below is the decoder.
        self.decoder = TransformerDecoder(  # multi-head attention
            self.args.dec_layers,
            self.args.dec_hidden_size,
            heads=self.args.dec_heads,
            d_ff=self.args.dec_ff_size,
            dropout=self.args.dec_dropout,
            embeddings=tgt_embeddings)

        self.generator = get_generator(self.vocab_size,
                                       self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight  #21168

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:  # initialize the parameters for training from scratch
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if (args.use_bert_emb):
                tgt_embeddings = nn.Embedding(
                    self.vocab_size,
                    self.bert.model.config.hidden_size,
                    padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(
                    self.bert.model.embeddings.word_embeddings.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight

        self.to(device)
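
# A standalone sketch (not part of the snippet above) of the position-embedding
# trick used when max_pos > 512: copy BERT's 512 pretrained rows and fill the new
# rows by repeating the last pretrained position vector. The sizes 512 -> 1024 and
# hidden_size 768 are illustrative assumptions.
import torch
import torch.nn as nn

old_pos = nn.Embedding(512, 768)   # stands in for bert.model.embeddings.position_embeddings
new_pos = nn.Embedding(1024, 768)
new_pos.weight.data[:512] = old_pos.weight.data
new_pos.weight.data[512:] = old_pos.weight.data[-1][None, :].repeat(1024 - 512, 1)
assert torch.equal(new_pos.weight.data[700], old_pos.weight.data[-1])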
示例#20
0
            with torch.no_grad():
                top_vec, _ = self.model(x, segs, attention_mask=mask)
        return top_vec
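
# A hypothetical usage sketch of the frozen-feature-extraction pattern in the
# fragment above, assuming the Hugging Face BertModel API; the model name, dummy
# shapes, and tensors are assumptions for illustration only.
import torch
from transformers import BertModel

feature_extractor = BertModel.from_pretrained('bert-base-uncased')
feature_extractor.eval()
x = torch.randint(0, feature_extractor.config.vocab_size, (2, 16))   # (batch, seq_len) token ids
segs = torch.zeros_like(x)                                           # token_type_ids / segment ids
mask = torch.ones_like(x)                                            # attention mask
with torch.no_grad():
    top_vec = feature_extractor(x, token_type_ids=segs, attention_mask=mask)[0]
print(top_vec.shape)   # e.g. torch.Size([2, 16, 768])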


from tokenization import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
bert = Bert(False, './temp', True)
hidden_size = 512
num_hidden_layers = 6
num_attention_heads = 8
intermediate_size = 512
hidden_dropout_prob = 0.1
attention_probs_dropout_prob = 0.1
bert_config = BertConfig(
    bert.model.config.vocab_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    hidden_dropout_prob=hidden_dropout_prob,
    attention_probs_dropout_prob=attention_probs_dropout_prob)
bert.model = BertModel(bert_config)
# Copy BERT's word-embedding weights (e.g. to initialize target-side embeddings).
word_emb_weights = copy.deepcopy(bert.model.embeddings.word_embeddings.weight)


class Bert(nn.Module):
    def __init__(self, large, temp_dir, finetune=False):
        super(Bert, self).__init__()
示例#21
0
def main(args):

    class_weights, train, dev, test = get_data(option=args.option,
                                               dataset_size=args.dataset_size,
                                               unbalanced=args.unbalanced)

    option = args.option
    using_GPU = torch.cuda.is_available() and args.using_GPU

    config = BertConfig(num_labels=len(TASK_LABELS[option]),
                        output_attentions=True)
    # config = BertConfig()

    if args.model_type == 'BertOrigin':
        from pretrained.BertOrigin import BertOrigin
        modelcreator = BertOrigin
    elif args.model_type == 'BertCNN':
        from pretrained.BertCNN import BertCNN
        modelcreator = BertCNN
    elif args.model_type == 'BertAttn':
        from pretrained.BertAttn import BertAttn
        modelcreator = BertAttn

    if args.do_train:
        # create and train model
        # BertConfig.from_pretrained is a classmethod returning a new config, so
        # assign its result instead of discarding it.
        config = BertConfig.from_pretrained(PRETRAINED_WEIGHTS,
                                            num_labels=len(TASK_LABELS[option]),
                                            output_attentions=True)
        # print('before load', config)
        model = modelcreator(config,
                             option=option,
                             dropout=args.dropout,
                             gpu=using_GPU,
                             seed=args.seed,
                             do_lower_case=args.do_lower_case)
        # freeze the parameters of BERT
        if args.frozen:
            for param in model.bert.parameters():
                param.requires_grad = False
            print_params(model)

        # optimizer and Warmup Schedule
        model_params = list(model.named_parameters())
        # print_params(model)
        # set the weight decay of LayerNorm and bias parameters to zero
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [param for name, param in model_params
                           if not any(nd in name for nd in no_decay)],
                'weight_decay': 0.01,
            },
            {
                'params': [param for name, param in model_params
                           if any(nd in name for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        if args.optimizer == 'Adam':
            optimizer = torch.optim.Adam(optimizer_grouped_parameters,
                                         lr=args.learning_rate)
        elif args.optimizer == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              correct_bias=args.correct_bias)
        elif args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(optimizer_grouped_parameters,
                                        lr=args.learning_rate,
                                        momentum=0.5)

        scheduler = None
        if args.warmup_proportion != 0:
            num_total_steps = int(len(train) / args.batch_size) * args.epochs
            # AdamW implements the weight-decay fix; with correct_bias=False it
            # skips Adam's bias correction, as in the original BERT training.
            if args.warmup_schedules == 'linear':
                scheduler = WarmupLinearSchedule(optimizer,
                                                 warmup_steps=num_total_steps *
                                                 args.warmup_proportion,
                                                 t_total=num_total_steps,
                                                 last_epoch=-1)
            elif args.warmup_schedules == 'constant':
                scheduler = WarmupConstantSchedule(
                    optimizer,
                    warmup_steps=num_total_steps * args.warmup_proportion,
                    t_total=num_total_steps,
                    last_epoch=-1)
            elif args.warmup_schedules == 'cosine':
                scheduler = WarmupCosineSchedule(optimizer,
                                                 warmup_steps=num_total_steps *
                                                 args.warmup_proportion,
                                                 t_total=num_total_steps,
                                                 cycles=0.5,
                                                 last_epoch=-1)

        # data loaders (with optional subsampling of the training set)
        train_dataloader = dataloader(train,
                                      MAX_SEQ_LENGTH,
                                      model.tokenizer,
                                      args.batch_size,
                                      is_sample=args.sample)
        dev_dataloader = dataloader(dev, MAX_SEQ_LENGTH, model.tokenizer,
                                    args.batch_size)

        # configure the focal loss and optionally reload a previously saved model
        model.set_focal_loss(alpha=class_weights, gamma=args.gamma)
        model.load_model(args.model_load, args.model_dir)

        model_saved_path = do_train(model,
                                    train_dataloader,
                                    dev_dataloader,
                                    args.epochs,
                                    optimizer,
                                    scheduler,
                                    args.dataset_size,
                                    args.early_stop,
                                    args.print_step,
                                    args.gradient_accumulation_steps,
                                    args.batch_size,
                                    args.learning_rate,
                                    model_path=PATH_CONFIG)

        test_predictions(model, test, model_saved_path[:-1] + ".csv",
                         args.batch_size)
    elif args.model_dir:
        config = BertConfig.from_pretrained(PRETRAINED_WEIGHTS,
                                            num_labels=len(TASK_LABELS[option]),
                                            output_attentions=True)
        model = modelcreator(config,
                             option=option,
                             dropout=args.dropout,
                             gpu=using_GPU,
                             seed=args.seed,
                             do_lower_case=args.do_lower_case)

        model.set_focal_loss(alpha=class_weights, gamma=args.gamma)
        model.load_model(args.model_load, args.model_dir)
        # model_dir = "./results/B64_lr1e-05_s0.01_0819_2023/"
        test_predictions(model,
                         test,
                         args.model_dir[:-1] + ".csv",
                         batch_size=args.batch_size)
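
# A minimal standalone sketch of the weight-decay grouping used in main() above:
# bias and LayerNorm parameters get weight_decay=0.0, everything else 0.01. The toy
# module and torch.optim.AdamW (instead of the pytorch-transformers AdamW used
# above) are assumptions for illustration.
import torch.nn as nn
import torch.optim as optim

class ToyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)

toy = ToyModule()
no_decay_keys = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped_params = [
    {'params': [p for n, p in toy.named_parameters()
                if not any(nd in n for nd in no_decay_keys)], 'weight_decay': 0.01},
    {'params': [p for n, p in toy.named_parameters()
                if any(nd in n for nd in no_decay_keys)], 'weight_decay': 0.0},
]
toy_optimizer = optim.AdamW(grouped_params, lr=2e-5)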
示例#22
0
    def __init__(self, args, device, checkpoint, lamb=0.8):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.lamb = lamb
        # if args.
        # bert
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

        # Extraction layer.
        self.ext_layer = ExtTransformerEncoder(self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
                                               args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.ext_hidden_size,
                                     num_hidden_layers=args.ext_layers, num_attention_heads=args.ext_heads, intermediate_size=args.ext_ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if(args.max_pos>512):
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings

        # Initialize the parameters for the informativeness / relevance / novelty scores.
        hidden_size = self.bert.model.config.hidden_size
        self.W_cont = nn.Parameter(torch.Tensor(1, hidden_size))
        self.W_sim = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.Sim_layer = nn.Linear(hidden_size, hidden_size)
        self.W_rel = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.Rel_layer = nn.Linear(hidden_size, hidden_size)
        self.W_novel = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_matrix = nn.Parameter(torch.Tensor(1, 1))

        self.q_transform = nn.Linear(100, 1)
        self.bq = nn.Parameter(torch.Tensor(1, 1))
        self.brel = nn.Parameter(torch.Tensor(1, 1))
        self.bsim = nn.Parameter(torch.Tensor(1, 1))
        self.bcont = nn.Parameter(torch.Tensor(1, 1))

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
            print("checkpoint loaded! ")
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                for p in self.Rel_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                for p in self.Sim_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
            nn.init.xavier_uniform_(self.bq)
            nn.init.xavier_uniform_(self.W_cont)
            nn.init.xavier_uniform_(self.W_sim)
            nn.init.xavier_uniform_(self.W_rel)
            nn.init.xavier_uniform_(self.W_novel)
            nn.init.xavier_uniform_(self.b_matrix)
            nn.init.xavier_uniform_(self.bcont)
            nn.init.xavier_uniform_(self.brel)
            nn.init.xavier_uniform_(self.bsim)
        self.to(device)
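
# The forward pass that consumes W_cont / W_sim / W_rel / W_novel is not shown in
# this snippet. Purely as an illustration (shapes and usage are assumptions), a
# bilinear score h_i^T W h_j between two sentence vectors can be computed like this:
import torch
import torch.nn as nn

hidden = 768
W_sim_demo = nn.Parameter(torch.Tensor(hidden, hidden))
nn.init.xavier_uniform_(W_sim_demo)
h_i = torch.randn(hidden)   # candidate sentence representation
h_j = torch.randn(hidden)   # e.g. document or partial-summary representation
sim_score = h_i @ W_sim_demo @ h_j   # scalar bilinear similarity
print(float(sim_score))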
示例#23
0
LABEL.build_vocab(train_data)

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=DEVICE)

model = BertForSequenceClassification(
    BertConfig(vocab_size=MAX_VOCAB_SIZE,
               max_position_embeddings=512,
               intermediate_size=1024,
               hidden_size=512,
               num_attention_heads=8,
               num_hidden_layers=6,
               type_vocab_size=5,
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               num_labels=2))
model.to(DEVICE)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.bert.embeddings.word_embeddings.weight.data[PAD_IDX] = torch.zeros(512)
model.bert.embeddings.word_embeddings.weight.data[UNK_IDX] = torch.zeros(512)

print(
    f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}'
)
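
# A hedged usage sketch continuing the snippet above: a dummy forward pass through
# the randomly initialized classifier. The batch shape is an assumption; indexing
# the output with [0] yields the (batch, num_labels) logits.
import torch

dummy_ids = torch.randint(0, MAX_VOCAB_SIZE, (4, 32)).to(DEVICE)   # (batch, seq_len)
dummy_mask = torch.ones_like(dummy_ids)
with torch.no_grad():
    logits = model(dummy_ids, attention_mask=dummy_mask)[0]
print(logits.shape)   # torch.Size([4, 2])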
示例#24
0
    def __init__(self, config):
        super(AbsSummarizer, self).__init__()
        self.config = config
        self.encoder = Bert(large=False,
                            temp_dir='../temp',
                            finetune=config.finetune_bert)

        if (config.encoder == 'Transformer'):
            '''
              "attention_probs_dropout_prob": 0.1,
                "finetuning_task": null,
                "hidden_act": "gelu",
                "hidden_dropout_prob": 0.1,
                "hidden_size": 768,
                "initializer_range": 0.02,
                "intermediate_size": 3072,
                "layer_norm_eps": 1e-12,
                "max_position_embeddings": 512,
                "num_attention_heads": 12,
                "num_hidden_layers": 12,
                "num_labels": 2,
                "output_attentions": false,
                "output_hidden_states": false,
                "pruned_heads": {},
                "torchscript": false,
                "type_vocab_size": 2,
                "vocab_size": 30522
            '''
            bert_config = BertConfig(
                self.encoder.model.config.vocab_size,
                hidden_size=config.enc_hidden_size,
                num_hidden_layers=config.enc_layers,
                num_attention_heads=config.enc_heads,
                intermediate_size=config.enc_ff_size,
                hidden_dropout_prob=config.enc_dropout,
                attention_probs_dropout_prob=config.enc_dropout)
            self.encoder.model = BertModel(bert_config)

        if (config.max_pos > 512):
            my_pos_embeddings = nn.Embedding(config.max_pos,
                                             self.encoder.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = \
                self.encoder.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = \
                self.encoder.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(config.max_pos - 512, 1)
            self.encoder.model.embeddings.position_embeddings = my_pos_embeddings
            print(my_pos_embeddings.weight.data.shape)
        self.vocab_size = self.encoder.model.config.vocab_size
        tgt_embeddings = nn.Embedding(self.vocab_size,
                                      self.encoder.model.config.hidden_size,
                                      padding_idx=0)
        if (config.share_emb):
            tgt_embeddings.weight = copy.deepcopy(
                self.encoder.model.embeddings.word_embeddings.weight)

        self.decoder = TransformerDecoder(config.dec_layers,
                                          config.dec_hidden_size,
                                          heads=config.dec_heads,
                                          d_ff=config.dec_ff_size,
                                          dropout=config.dec_dropout,
                                          embeddings=tgt_embeddings)

        for module in self.decoder.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

        if (config.use_bert_emb):
            tgt_embeddings = nn.Embedding(
                self.vocab_size,
                self.encoder.model.config.hidden_size,
                padding_idx=0)
            tgt_embeddings.weight = copy.deepcopy(
                self.encoder.model.embeddings.word_embeddings.weight)
            self.decoder.embeddings = tgt_embeddings

        self.word_prob = WordProbLayer(config,
                                       self.vocab_size,
                                       config.dec_hidden_size,
                                       self.decoder.embeddings,
                                       copy=config.copy)
        for p in self.word_prob.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                p.data.zero_()

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output
        )

        mean_pooled_output = torch.mean(sequence_output, dim=1)
        mean_pooled_output = self.dropout(mean_pooled_output)
        logits = self.classifier(mean_pooled_output)

        outputs = (prediction_scores, seq_relationship_score, logits)
        return outputs


config = BertConfig.from_json_file(str(PATH_TO_CKPT_CONFIG / "config.json"))
model = BertPretrain(config, len(TARGETS))

# Prepare extended bert embedding
orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

state_dict = orig_bert.state_dict()
del state_dict["cls.predictions.decoder.weight"], state_dict["cls.predictions.bias"]

orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]

extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab) :]
new_tokens_as_orig_indices = [[i] for i in range(len(orig_tokenizer.vocab))] + [
    orig_tokenizer.encode(t, add_special_tokens=False) for t in extra_tokens
]
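
# A hedged sketch of one common way to use this index mapping (an assumption; the
# original continuation is not shown): initialize each new token's embedding as the
# mean of the original-piece embeddings it maps to, then put the result back into
# the state dict.
import torch

new_embedding = torch.zeros(len(new_tokens_as_orig_indices), orig_embedding.size(1))
for new_idx, orig_indices in enumerate(new_tokens_as_orig_indices):
    if orig_indices:   # guard against tokens that encode to nothing
        new_embedding[new_idx] = orig_embedding[orig_indices].mean(dim=0)
state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding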