def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = BertConfig(
        vocab_size_or_config_json_file=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range)

    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def __init__(self, args, device, checkpoint):
    super(ExtSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = Bert(args, args.temp_dir, args.finetune_bert)
    self.ext_layer = ExtTransformerEncoder(
        self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
        args.ext_dropout, args.ext_layers)

    if args.encoder == 'baseline':
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.ext_hidden_size,
            num_hidden_layers=args.ext_layers,
            num_attention_heads=args.ext_heads,
            intermediate_size=args.ext_ff_size)
        self.bert.model = BertModel(bert_config)
        self.ext_layer = Classifier(self.bert.model.config.hidden_size)

    if args.max_pos > 512:
        my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        if args.param_init != 0.0:
            for p in self.ext_layer.parameters():
                p.data.uniform_(-args.param_init, args.param_init)
        if args.param_init_glorot:
            for p in self.ext_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    self.to(device)
def __init__(self):
    super().__init__()
    config = BertConfig()
    config.output_hidden_states = True
    self.bert = BertModel.from_pretrained('bert-base-uncased', config=config)
    self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def __init__(self, *,
             pretrained_model_name=None,
             config_filename=None,
             vocab_size=None,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             max_position_embeddings=512,
             random_init=False,
             **kwargs):
    TrainableNM.__init__(self, **kwargs)

    # Check that exactly one of pretrained_model_name, config_filename, and
    # vocab_size was passed in.
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1

    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            "or config_filename should be passed into the "
            "BERT constructor.")

    if vocab_size is not None:
        config = BertConfig(
            vocab_size_or_config_json_file=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings)
        model = BertModel(config)
    elif pretrained_model_name is not None:
        model = BertModel.from_pretrained(pretrained_model_name)
    elif config_filename is not None:
        config = BertConfig.from_json_file(config_filename)
        model = BertModel(config)
    else:
        raise ValueError(
            "Either pretrained_model_name or vocab_size must "
            "be passed into the BERT constructor")

    model.to(self._device)
    self.add_module("bert", model)
    self.config = model.config

    if random_init:
        self.apply(
            lambda module: transformer_weights_init(module, xavier=False))
def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
    super(AbsSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

    if bert_from_extractive is not None:
        self.bert.model.load_state_dict(
            dict([(n[11:], p) for n, p in bert_from_extractive.items()
                  if n.startswith('bert.model')]),
            strict=True)

    if args.encoder == 'baseline':
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.enc_hidden_size,
            num_hidden_layers=args.enc_layers,
            num_attention_heads=8,
            intermediate_size=args.enc_ff_size,
            hidden_dropout_prob=args.enc_dropout,
            attention_probs_dropout_prob=args.enc_dropout)
        self.bert.model = BertModel(bert_config)

    if args.max_pos > 512:
        my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings

    self.vocab_size = self.bert.model.config.vocab_size
    tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
    if self.args.share_emb:
        tgt_embeddings = self.bert.model.embeddings.word_embeddings

    self.decoder = TransformerDecoder(
        self.args.dec_layers,
        self.args.dec_hidden_size, heads=self.args.dec_heads,
        d_ff=self.args.dec_ff_size, dropout=self.args.dec_dropout,
        embeddings=tgt_embeddings)

    self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
    self.generator[0].weight = self.decoder.embeddings.weight

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        for module in self.decoder.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        for p in self.generator.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                p.data.zero_()
        if args.use_bert_emb:
            tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
            tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
            self.decoder.embeddings = tgt_embeddings
            self.generator[0].weight = self.decoder.embeddings.weight

    self.to(device)
def __init__(self, num_labels=2, model_type='bert-base-uncased',
             token_layer='token-cls', output_logits=True):
    super(BertForWSD, self).__init__()
    self.config = BertConfig()
    self.token_layer = token_layer
    self.num_labels = num_labels  # was hard-coded to 2, ignoring the constructor argument
    self.bert = BertModel.from_pretrained(model_type)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.output_logits = output_logits

    # Define which token selection layer to use
    if token_layer == 'token-cls':
        self.tokenselectlayer = TokenClsLayer()
    elif token_layer in ['sent-cls', 'sent-cls-ws']:
        self.tokenselectlayer = SentClsLayer()
    else:
        raise ValueError("Unidentified parameter for token selection layer")

    self.classifier = nn.Linear(768, num_labels)
    if not output_logits:
        self.softmax = nn.Softmax(dim=1)  # to be checked!!!
    nn.init.xavier_normal_(self.classifier.weight)
def __init__(self, args, device, checkpoint):
    super(ExtSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)
    self.ext_layer = ExtTransformerEncoder(
        self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
        args.ext_dropout, args.ext_layers)

    if args.encoder == 'baseline':
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.hidden_size,
            num_hidden_layers=6,
            num_attention_heads=8,
            intermediate_size=args.ff_size)
        self.bert.model = BertModel(bert_config)
        self.ext_layer = Classifier(self.bert.model.config.hidden_size)

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        if args.param_init != 0.0:
            for p in self.ext_layer.parameters():
                p.data.uniform_(-args.param_init, args.param_init)
        if args.param_init_glorot:
            for p in self.ext_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    self.to(device)
def load_model():
    model_dir = '../../model/model/'
    # BertConfig.from_pretrained is a classmethod that returns a new config object,
    # so its result must be captured instead of being discarded.
    config = BertConfig.from_pretrained('../../model/bert-cased/',
                                        num_labels=3, output_attentions=True)
    model = BertAttn(config, option='feed', dropout=0.1, gpu=False,
                     seed=0, do_lower_case=False)
    class_weights = [0.6058, 0.1161, 0.2781]
    model.set_focal_loss(alpha=class_weights, gamma=-1)
    model.load_model(True, model_dir)
    return model
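# A general note on the pattern above (a minimal sketch, assuming the pytorch_transformers API):
# BertConfig.from_pretrained is a classmethod returning a *new* config, so overrides such as
# num_labels are best passed to it directly. 'bert-base-cased' below stands in for the local
# '../../model/bert-cased/' directory.
from pytorch_transformers import BertConfig

config = BertConfig.from_pretrained('bert-base-cased', num_labels=3, output_attentions=True)
assert config.num_labels == 3 and config.output_attentions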
def init_embedding(self, np_file, embedding_type="glove"):
    if embedding_type == "bert" and SUPPORT_BERT_EMBEDDING:
        bert = Bert(False, '/temp/', False)
        bert.model.config.vocab_size = self.ntoken
        bert_config = BertConfig(
            bert.model.config.vocab_size,
            hidden_size=self.emb_dim,
            num_hidden_layers=2,
            num_attention_heads=8,
            intermediate_size=512,
            hidden_dropout_prob=self.dropout,
            attention_probs_dropout_prob=self.dropout)
        bert.model = BertModel(bert_config)
        weight_init = copy.deepcopy(bert.model.embeddings.word_embeddings.weight)
        assert weight_init.shape == (self.ntoken, self.emb_dim)
        self.emb.weight.data[:self.ntoken] = weight_init
    else:
        weight_init = torch.from_numpy(np.load(np_file))
        assert weight_init.shape == (self.ntoken, self.emb_dim)
        self.emb.weight.data[:self.ntoken] = weight_init
def init(self):
    bert_config = BertConfig(self.args.output_config_file)
    if os.path.exists(self.args.output_model_file):
        if self.args.model_name == 'BertCNNPlus':
            bert_config.filter_num = self.args.filter_num
            bert_config.filter_sizes = [int(val) for val in self.args.filter_sizes.split()]
        elif self.args.model_name == 'BertRCNN':
            bert_config.rnn_hidden_size = self.args.rnn_hidden_size
            bert_config.num_layers = self.args.num_layers
            bert_config.bidirectional = self.args.bidirectional
            bert_config.dropout = self.args.dropout
        else:
            pass
        self.model = Net(config=bert_config)
        self.model.load_state_dict(torch.load(self.args.output_model_file))
        self.model.to(DEVICE)
        # from_pretrained is a classmethod; constructing BertTokenizer(bert_vocab_file)
        # first was redundant and its result was discarded.
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.bert_model_dir, do_lower_case=self.args.do_lower_case)
def __init__(self):
    super().__init__()
    config = BertConfig()
    config.output_hidden_states = True
    self.bert = BertModel.from_pretrained('bert-base-uncased', config=config)
    self.bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    self.ninput = 768
    self.nlayers = 1
    self.bidirectional = True
    self.num_directions = 2 if self.bidirectional else 1
    self.nhidden = 256 // self.num_directions
    self.drop_prob = 0.5
    self.rnn = nn.LSTM(self.ninput, self.nhidden, self.nlayers,
                       batch_first=True, dropout=self.drop_prob,
                       bidirectional=self.bidirectional)
def __init__(self, path_to_csv="tractatus_with_splits.csv",
             pretrained_name="bert-base-cased", n_layers=4, start_split="train"):
    """
    :type n_layers: int
    """
    self.n_layers = n_layers
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    our_config = BertConfig(vocab_size_or_config_json_file=28996,
                            output_hidden_states=True)
    self.model = BertModel.from_pretrained(pretrained_name, config=our_config)
    self.model.eval()
    self.all_df = pd.read_csv(path_to_csv)
    self.test_df = self.all_df[self.all_df.split == "test"]
    self.train_df = self.all_df[self.all_df.split == "train"]
    self.val_df = self.all_df[self.all_df.split == "validation"]
    self._lookup_dict = {"train": self.train_df,
                         "val": self.val_df,
                         "test": self.test_df}
    self.set_split(start_split)
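# For context, a minimal sketch of how output_hidden_states=True is typically consumed in
# pytorch_transformers (not this class's actual downstream code; the last-four-layer averaging
# below is an assumption, not taken from the original):
import torch
from pytorch_transformers import BertConfig, BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
config = BertConfig(vocab_size_or_config_json_file=28996, output_hidden_states=True)
model = BertModel.from_pretrained("bert-base-cased", config=config)
model.eval()

with torch.no_grad():
    input_ids = torch.tensor([tokenizer.encode("The world is everything that is the case.")])
    sequence_output, pooled_output, hidden_states = model(input_ids)
    # hidden_states is a tuple of (num_hidden_layers + 1) tensors of shape [batch, seq_len, hidden].
    sentence_vec = torch.stack(hidden_states[-4:]).mean(dim=0).mean(dim=1)  # average the last 4 layers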
    ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
    ax.set_xticklabels([''] + list(df.columns))
    ax.set_yticklabels([''] + list(df.index))
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)
        # tick.label.set_fontsize(14)
    # for tick in ax.get_yticklabels():
    #     tick.label.set_fontsize(14)
    ax.tick_params(axis='both', which='major', labelsize=15)
    plt.savefig(f"./figures/{step}_{sentence}.png")
    # plt.show()


model_dir = './B96_lr1e-06_s1.0_0903_1905/'
# PRETRAINED_WEIGHTS = "bert-base-cased"
# from_pretrained returns a new config; assign it instead of discarding the result.
config = BertConfig.from_pretrained('bert-base-cased', num_labels=3, output_attentions=True)
model = BertAttn(config, option='emoji', dropout=0.1, gpu=False, seed=0, do_lower_case=False)
# model.set_focal_loss(alpha=class_weights, gamma=-1)
model.load_model(True, model_dir)
# model.bert.save_pretrained('./bert-cased/')
class_weights, train, dev, test = get_data(option='emoji', dataset_size=1, unbalanced=False)
def main():
    parser = argparse.ArgumentParser("")
    parser.add_argument("--model", type=str, default='')
    parser.add_argument("--resume", action='store_true')
    parser.add_argument("--eval", action='store_true')
    parser.add_argument("--batch_size", type=int, default=CFG.batch_size)
    parser.add_argument("--nepochs", type=int, default=CFG.num_train_epochs)
    parser.add_argument("--wsteps", type=int, default=CFG.warmup_steps)
    parser.add_argument("--nlayers", type=int, default=CFG.num_hidden_layers)
    parser.add_argument("--nahs", type=int, default=CFG.num_attention_heads)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--lr", type=float, default=CFG.learning_rate)
    parser.add_argument("--dropout", type=float, default=CFG.dropout)
    parser.add_argument("--types", nargs='+', type=str,
                        default=['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'],
                        help='3JHC,2JHC,1JHC,3JHH,2JHH,3JHN,2JHN,1JHN')
    parser.add_argument("--train_file", default="train_mute_cp")
    parser.add_argument("--test_file", default="test_mute_cp")
    parser.add_argument("--pseudo_path", default="")
    parser.add_argument("--pseudo", action='store_true')
    parser.add_argument("--gen_pseudo", action='store_true')
    parser.add_argument("--use_all", action='store_true')
    parser.add_argument("--structure_file", default="structures_mu")
    parser.add_argument("--contribution_file", default="scalar_coupling_contributions")
    args = parser.parse_args()
    print(args)

    CFG.batch_size = args.batch_size
    CFG.num_train_epochs = args.nepochs
    CFG.warmup_steps = args.wsteps
    CFG.num_hidden_layers = args.nlayers
    CFG.num_attention_heads = args.nahs
    CFG.learning_rate = args.lr
    CFG.dropout = args.dropout
    CFG.seed = args.seed
    print(CFG.__dict__)

    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)

    # if not args.eval:
    if True:
        train_df = load_csv(args.train_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        contributions_df = load_csv(args.contribution_file)
        train_df = train_df.merge(contributions_df, how='left')
        train_df = normalize_cols(train_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
        train_df = add_extra_features(train_df, structures_df)
        train_df = train_df.fillna(1e08)

        n_mols = train_df['molecule_name'].nunique()
        train_df, valid_df = train_test_split(train_df, 5000)

        # keep only molecules containing at least one of the requested coupling types
        print(train_df['molecule_name'].nunique())
        mol_names_with_at = train_df[train_df['type'].isin(args.types)]['molecule_name'].unique()
        train_df = train_df[train_df['molecule_name'].isin(mol_names_with_at)].reset_index(drop=True)
        print(train_df['molecule_name'].nunique())

        # Print the first 5 rows of valid_df to verify it matches the previous experiment.
        print(valid_df.head(5))

        if args.pseudo:
            test_df = load_csv(args.test_file)
            logger.info(f'loading dataset - {args.pseudo_path} ...')
            test_pseudo_df = pd.read_csv(args.pseudo_path)
            # mol_names_jhn = train_df[test_df['type'].isin(['1JHN', '2JHN', '3JHN'])]['molecule_name'].unique()
            # test_df = test_df[test_df['molecule_name'].isin(mol_names_jhn)].reset_index(drop=True)
            test_df = add_extra_features(test_df, structures_df)
            test_df = test_df.set_index('id')
            test_pseudo_df = test_pseudo_df.set_index('id')
            test_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']] = \
                test_pseudo_df[['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']]
            test_df = test_df.reset_index()
            # test_df = normalize_target(test_df)
            test_df = normalize_cols(test_df, ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            # test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0
            test_df['weight'] = 1.0
            n_mols = test_df['molecule_name'].nunique()
            train_df = train_df.append(test_df).reset_index(drop=True)
        else:
            train_df['weight'] = 1.0
            valid_df['weight'] = 1.0

        if args.use_all:
            train_df = train_df.append(valid_df)

        print(f' n_train:{len(train_df)}, n_valid:{len(valid_df)}')

    config = BertConfig(
        3,  # vocab_size_or_config_json_file - not used
        hidden_size=CFG.hidden_size,
        num_hidden_layers=CFG.num_hidden_layers,
        num_attention_heads=CFG.num_attention_heads,
        intermediate_size=CFG.intermediate_size,
        hidden_dropout_prob=CFG.dropout,
        attention_probs_dropout_prob=CFG.dropout,
    )
    model = cust_model.SelfAttn(config)
    if args.model != "":
        print("=> loading checkpoint '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        CFG.start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.model, checkpoint['epoch']))
    model.cuda()

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('parameters: ', count_parameters(model))

    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # to produce the submission.csv
    if args.eval:
        test_df = load_csv(args.test_file)
        structures_df = load_csv(args.structure_file)
        structures_df[['x', 'y', 'z']] -= structures_df.groupby('molecule_name')[['x', 'y', 'z']].transform('mean')
        test_df = add_extra_features(test_df, structures_df)
        test_df = test_df.assign(fc=1e08, sd=1e08, pso=1e08, dso=1e08)
        test_df['scalar_coupling_constant'] = 0
        test_df['weight'] = 1.0
        test_db = db.MolDB(test_df, CFG.max_seq_length)
        test_loader = DataLoader(
            test_db, batch_size=CFG.batch_size, shuffle=False,
            num_workers=CFG.num_workers)
        res_df = validate(test_loader, model, args.types)
        res_df = unnormalize_cols(res_df, cols=['fc', 'sd', 'pso', 'dso'])
        res_df = unnormalize_target(res_df, 'prediction1')
        if args.gen_pseudo:
            res_df['scalar_coupling_constant'] = res_df['prediction1']
            res_df = res_df[res_df['id'] > -1].sort_values('id')
            res_df[['id', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']].to_csv(
                f'pseudo_{CFG.seed}.csv', index=False)
            return
        res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
        res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
        res_df['scalar_coupling_constant'] = res_df['prediction']
        res_df = res_df[res_df['id'] > -1].sort_values('id')
        os.makedirs('output', exist_ok=True)
        res_df[['id', 'scalar_coupling_constant']].to_csv(
            f'output/submission_{CFG.seed}.csv', index=False)
        return

    train_db = db.MolDB(train_df, CFG.max_seq_length)
    print('preloading dataset ...')
    train_db = db.MolDB_FromDB(train_db, 10)
    valid_db = db.MolDB(valid_df, CFG.max_seq_length)

    num_train_optimization_steps = int(
        len(train_db) / CFG.batch_size / CFG.gradient_accumulation_steps) * (CFG.num_train_epochs - CFG.start_epoch)
    print('num_train_optimization_steps', num_train_optimization_steps)

    train_loader = DataLoader(
        train_db, batch_size=CFG.batch_size, shuffle=True,
        num_workers=CFG.num_workers, pin_memory=True)
    val_loader = DataLoader(
        valid_db, batch_size=CFG.batch_size, shuffle=False,
        num_workers=CFG.num_workers)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=CFG.learning_rate,
                      weight_decay=CFG.weight_decay)
    scheduler = WarmupLinearSchedule(optimizer, CFG.warmup_steps,
                                     t_total=num_train_optimization_steps)

    def get_lr():
        return scheduler.get_lr()[0]

    if args.model != "":
        if args.resume:
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            # for param_group in optimizer.param_groups:
            #     param_group['lr'] = CFG.learning_rate
        mae_log_df = checkpoint['mae_log']
        del checkpoint
    else:
        mae_log_df = pd.DataFrame(columns=(['EPOCH'] + ['LR'] + args.types + ['OVERALL']))

    os.makedirs('log', exist_ok=True)

    res_df = validate(val_loader, model, args.types)
    res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
    res_df = unnormalize_target(res_df, 'prediction1')
    res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
    res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
    res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
    overall_mae, maes = metric(res_df, args.types)
    print(overall_mae, maes)

    curr_lr = get_lr()
    print(f'initial learning rate:{curr_lr}')

    for epoch in range(CFG.start_epoch, CFG.num_train_epochs):
        # train for one epoch
        # print(adjust_learning_rate(optimizer, epoch))
        train(train_loader, model, optimizer, epoch, args.types, scheduler)

        if epoch % CFG.test_freq == 0:
            res_df = validate(val_loader, model, args.types)
            res_df = unnormalize_cols(res_df, cols=['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'])
            res_df = unnormalize_target(res_df, 'prediction1')
            res_df['prediction4'] = res_df[['fc', 'sd', 'pso', 'dso']].sum(1)
            res_df['prediction'] = res_df[['prediction1', 'prediction4']].mean(1)
            res_df.to_csv(f'log/valid_df_{"_".join(args.types)}.csv', index=False)
            overall_mae, maes = metric(res_df, args.types)

            # write log file
            mae_row = dict([(typ, [mae]) for typ, mae in maes.items() if typ in args.types])
            mae_row.update({'EPOCH': epoch, 'OVERALL': overall_mae, 'LR': curr_lr})
            mae_log_df = mae_log_df.append(pd.DataFrame(mae_row), sort=False)
            print(mae_log_df.tail(20))
            mae_log_df.to_csv(f'log/{"_".join(args.types)}.csv', index=False)

            # scheduler.step(overall_mae)
            curr_lr = get_lr()
            print(f'set the learning_rate: {curr_lr}')

            # evaluate on validation set
            batch_size = CFG.batch_size
            pseudo_path = '' if not args.pseudo else '_' + args.pseudo_path
            curr_model_name = (f'b{batch_size}_l{config.num_hidden_layers}_'
                               f'mh{config.num_attention_heads}_h{config.hidden_size}_'
                               f'd{CFG.dropout}_'
                               f'ep{epoch}_{"_".join(args.types)}_s{CFG.seed}{pseudo_path}.pt')
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the cust_model itself
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'transformer',
                'state_dict': model_to_save.state_dict(),
                'mae_log': mae_log_df,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, FINETUNED_MODEL_PATH, curr_model_name)

    print('done')
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
    """
    Copy/paste/tweak RoBERTa's weights into our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    config = BertConfig(
        vocab_size_or_config_json_file=50265,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        ### self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (roberta_layer.self_attn.in_proj_weight.shape ==
                torch.Size((3 * config.hidden_size, config.hidden_size)))
        # we use three distinct linear layers so we split the source layer here.
        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2 * config.hidden_size, :]
        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2 * config.hidden_size]
        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2 * config.hidden_size:, :]
        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2 * config.hidden_size:]

        ### self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        ### intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        ### output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        #### end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
def freeze_bert_encoder(self):
    for param in self.bert.parameters():
        param.requires_grad = False


def unfreeze_bert_encoder(self):
    for param in self.bert.parameters():
        param.requires_grad = True


# configuration for the model
from pytorch_transformers import BertConfig

config = BertConfig(vocab_size_or_config_json_file=32000,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)
num_labels = 2

# creating the model (this BertForSequenceClassification appears to be a custom class
# that takes the number of labels rather than a config object)
model = BertForSequenceClassification(num_labels)

# # # Testing the model # # #
# Converting inputs to PyTorch tensors
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(text_tokenized)])
logits = model(tokens_tensor)

# viewing the logits
print(logits)
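# For comparison, a minimal sketch of the library's own BertForSequenceClassification driven by
# the config created above (assuming the pytorch_transformers API; the randomly initialized
# weights here are an assumption, not the tutorial's setup):
from pytorch_transformers import BertForSequenceClassification as LibBertForSequenceClassification

config.num_labels = 2                                  # the library class reads num_labels from the config
lib_model = LibBertForSequenceClassification(config)   # randomly initialized, no pretrained weights
outputs = lib_model(tokens_tensor)                     # returns a tuple; the logits are outputs[0]
print(outputs[0].shape)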
def main():
    parser = argparse.ArgumentParser(description="Training")

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    # parser.add_argument("--data_file", type=str, required=True,
    #                     help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    # parser.add_argument("--token_counts", type=str, required=True,
    #                     help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite output_dir if it already exists.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train, selected from the list: " + ", ".join(processors.keys()))
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--output_dir", default=None, type=str, required=True,
    #                     help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--vocab_size", default=30522, type=int,
                        help="The vocabulary size.")
    parser.add_argument("--max_position_embeddings", default=512, type=int,
                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads.")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float,
                        help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention.")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")
    # parser.add_argument("--from_pretrained_weights", default=None, type=str,
    #                     help="Load student initialization checkpoint.")
    # parser.add_argument("--from_pretrained_config", default=None, type=str,
    #                     help="Load student initialization architecture config.")
    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
                        help="The teacher model.")

    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=1.0, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
    # parser.add_argument("--alpha_mlm", default=0.5, type=float,
    #                     help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    # parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
    #                     help="Proportion of tokens for which we need to make a prediction.")
    # parser.add_argument("--word_mask", default=0.8, type=float,
    #                     help="Proportion of tokens to mask out.")
    # parser.add_argument("--word_keep", default=0.1, type=float,
    #                     help="Proportion of tokens to keep.")
    # parser.add_argument("--word_rand", default=0.1, type=float,
    #                     help="Proportion of tokens to randomly replace.")
    # parser.add_argument("--mlm_smoothing", default=0.7, type=float,
    #                     help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    # parser.add_argument("--restrict_ce_to_mask", action='store_true',
    #                     help="If true, compute the distillation loss only on the [MLM] prediction distribution.")

    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of passes on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    # parser.add_argument("--tokens_per_batch", type=int, default=-1,
    #                     help="If specified, modify the batches so that they have approximately this number of tokens.")
    # parser.add_argument("--shuffle", action='store_false',
    #                     help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")

    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
                        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_prop", default=0.05, type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range", default=0.02, type=float,
                        help="Random initialization range.")
    # parser.add_argument('--fp16', action='store_true',
    #                     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    # parser.add_argument('--fp16_opt_level', type=str, default='O1',
    #                     help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    #                          "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56,
                        help="Random seed")

    parser.add_argument("--log_interval", type=int, default=10,
                        help="Tensorboard logging interval.")
    parser.add_argument('--log_examples', action='store_false',
                        help="Show input examples on the command line during evaluation. Enabled by default.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--checkpoint_interval", type=int, default=100,
                        help="Checkpoint interval.")
    parser.add_argument("--no_cuda", type=bool, default=False,
                        help="Avoid using CUDA when available")
    parser.add_argument("--toy_mode", action='store_true',
                        help="Toy mode for development.")
    parser.add_argument("--rich_eval", action='store_true',
                        help="Rich evaluation (more metrics + mistake reporting).")
    args = parser.parse_args()
    args.no_cuda = False
    print("NO CUDA", args.no_cuda)

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.output_dir):
            if not args.force:
                raise ValueError(f'Serialization dir {args.output_dir} already exists, but you have not specified '
                                 'whether to overwrite it. Use `--force` if you want to overwrite it.')
            else:
                shutil.rmtree(args.output_dir)
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info(f'Experiment will be dumped and logged in {args.output_dir}')

        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.output_dir, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        # git_log(args.output_dir)

    print(args.local_rank)
    print(args.no_cuda)
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.local_rank = -1
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        device = torch.device("cuda", args.local_rank)
    args.device = device
    logger.info("DEVICE: {}".format(args.device))
    logger.info("N_GPU: {}".format(args.n_gpu))
    # exit(0)
    print(args.local_rank)

    ### TOKENIZER ###
    # if args.teacher_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(args.teacher_name, do_lower_case=args.do_lower_case)
    # elif args.teacher_type == 'roberta':
    #     tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.model_name_or_path = args.teacher_name
    args.max_seq_length = args.max_position_embeddings
    args.model_type = "bert"
    args.output_mode = output_modes[args.task_name]
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)

    ## DATA LOADER ##
    # logger.info(f'Loading data from {args.data_file}')
    # with open(args.data_file, 'rb') as fp:
    #     data = pickle.load(fp)
    # assert os.path.isfile(args.token_counts)
    # logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
    # with open(args.token_counts, 'rb') as fp:
    #     counts = pickle.load(fp)
    # assert len(counts) == args.vocab_size
    # token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
    # for idx in special_tok_ids.values():
    #     token_probs[idx] = 0.  # do not predict special tokens
    # token_probs = torch.from_numpy(token_probs)
    # train_dataloader = Dataset(params=args, data=data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.batch_size)
    logger.info(f'Data loader created.')

    ## STUDENT ##
    # if args.from_pretrained_weights is not None:
    #     assert os.path.isfile(args.from_pretrained_weights)
    #     assert os.path.isfile(args.from_pretrained_config)
    #     logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
    #     logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
    #     stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
    #     stu_architecture_config.output_hidden_states = True
    #     student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
    #                                                     config=stu_architecture_config)
    # else:
    #     args.vocab_size_or_config_json_file = args.vocab_size
    #     stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
    #     student = DistilBertForMaskedLM(stu_architecture_config)
    student_config = BertConfig(
        vocab_size_or_config_json_file=args.vocab_size,
        hidden_size=args.dim,
        num_hidden_layers=args.n_layers,
        num_attention_heads=args.n_heads,
        intermediate_size=args.hidden_dim,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.attention_dropout,
        max_position_embeddings=args.max_position_embeddings,
        hidden_act=args.activation,
        initializer_range=0.02)
    student = BertForSequenceClassification(student_config)
    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info(f'Student loaded.')

    ## TEACHER ##
    teacher = BertForSequenceClassification.from_pretrained(args.teacher_name)  # take outputs[1] for the logits
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          # token_probs=token_probs,
                          student=student,
                          teacher=teacher,
                          tokenizer=tokenizer)
    distiller.train()
    logger.info("Let's go get some drinks.")
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import WarmupLinearSchedule
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm import tqdm, trange

from BertModules import BertClassifier
from Constants import *
from DataModules import SequenceDataset
from Utils import seed_everything

seed_everything()

# Load the default BERT config object and make the necessary changes as per requirement
config = BertConfig(hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072,
                    num_labels=2)

# Create our custom BertClassifier model object
model = BertClassifier(config)
model.to(DEVICE)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the train dataset and split it into train and validation sets
train_dataset = SequenceDataset(TRAIN_FILE_PATH, tokenizer)
validation_split = 0.2
dataset_size = len(train_dataset)
def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
    super(AbsSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)  # False, ../temp, True
    # The input is at most 512 tokens (minus [CLS] and [SEP]), with at most two segments merged into
    # one sequence. Tokens and sentences beyond that have no corresponding embedding; the pooler
    # encodes the [CLS] position.

    if bert_from_extractive is not None:
        self.bert.model.load_state_dict(dict([
            (n[11:], p) for n, p in bert_from_extractive.items()
            if n.startswith('bert.model')
        ]), strict=True)

    if args.encoder == 'baseline':  # default: bert
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.enc_hidden_size,
            num_hidden_layers=args.enc_layers,
            num_attention_heads=8,
            intermediate_size=args.enc_ff_size,
            hidden_dropout_prob=args.enc_dropout,
            attention_probs_dropout_prob=args.enc_dropout)
        self.bert.model = BertModel(bert_config)

    if args.max_pos > 512:  # max_pos never exceeds 512 here, so this branch is unused
        my_pos_embeddings = nn.Embedding(
            args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings

    self.vocab_size = self.bert.model.config.vocab_size  # vocab_size from bert.model's config: 21128
    tgt_embeddings = nn.Embedding(
        self.vocab_size, self.bert.model.config.hidden_size,
        padding_idx=0)  # hidden_size as above: 768; embeds the target summary
    if self.args.share_emb:  # False
        tgt_embeddings.weight = copy.deepcopy(
            self.bert.model.embeddings.word_embeddings.weight)

    # BertModel acts as the feature extractor (the encoder here); a Transformer serves as the decoder.
    self.decoder = TransformerDecoder(  # multi-head attention
        self.args.dec_layers,
        self.args.dec_hidden_size, heads=self.args.dec_heads,
        d_ff=self.args.dec_ff_size, dropout=self.args.dec_dropout,
        embeddings=tgt_embeddings)

    self.generator = get_generator(self.vocab_size, self.args.dec_hidden_size, device)
    self.generator[0].weight = self.decoder.embeddings.weight  # 21128

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:  # initialize parameters when training from scratch
        for module in self.decoder.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        for p in self.generator.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                p.data.zero_()
        if args.use_bert_emb:
            tgt_embeddings = nn.Embedding(
                self.vocab_size, self.bert.model.config.hidden_size,
                padding_idx=0)
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)
            self.decoder.embeddings = tgt_embeddings
            self.generator[0].weight = self.decoder.embeddings.weight

    self.to(device)
        with torch.no_grad():
            top_vec, _ = self.model(x, segs, attention_mask=mask)
        return top_vec


from tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
bert = Bert(False, './temp', True)

hidden_size = 512
num_hidden_layers = 6
num_attention_heads = 8
intermediate_size = 512
hidden_dropout_prob = 0.1
attention_probs_dropout_prob = 0.1

bert_config = BertConfig(
    bert.model.config.vocab_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    hidden_dropout_prob=hidden_dropout_prob,
    attention_probs_dropout_prob=attention_probs_dropout_prob)
bert.model = BertModel(bert_config)

# Leftover fragment: `self` is undefined at module level and the result was discarded.
# copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)


class BertEmbeddings(nn.Module):
    def __init__(self, large, temp_dir, finetune=False):
        super(BertEmbeddings, self).__init__()
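# For context, a minimal sketch of the kind of Bert wrapper the fragment above appears to come from
# (the class body is an assumption modeled on the PreSumm-style wrapper, not the original file):
import torch
import torch.nn as nn
from pytorch_transformers import BertModel


class BertWrapper(nn.Module):
    """Thin wrapper that returns BERT's top-layer token vectors."""

    def __init__(self, large, temp_dir, finetune=False):
        super(BertWrapper, self).__init__()
        name = 'bert-large-uncased' if large else 'bert-base-uncased'
        self.model = BertModel.from_pretrained(name, cache_dir=temp_dir)
        self.finetune = finetune

    def forward(self, x, segs, mask):
        if self.finetune:
            top_vec, _ = self.model(x, segs, attention_mask=mask)
        else:
            self.eval()
            with torch.no_grad():
                top_vec, _ = self.model(x, segs, attention_mask=mask)
        return top_vec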
def main(args):
    class_weights, train, dev, test = get_data(option=args.option,
                                               dataset_size=args.dataset_size,
                                               unbalanced=args.unbalanced)
    option = args.option
    using_GPU = torch.cuda.is_available() and args.using_GPU

    config = BertConfig(num_labels=len(TASK_LABELS[option]), output_attentions=True)
    # config = BertConfig()

    if args.model_type == 'BertOrigin':
        from pretrained.BertOrigin import BertOrigin
        modelcreator = BertOrigin
    elif args.model_type == 'BertCNN':
        from pretrained.BertCNN import BertCNN
        modelcreator = BertCNN
    elif args.model_type == 'BertAttn':
        from pretrained.BertAttn import BertAttn
        modelcreator = BertAttn

    if args.do_train:
        # create and train model
        # from_pretrained is a classmethod returning a new config, so assign its result back.
        config = BertConfig.from_pretrained(PRETRAINED_WEIGHTS,
                                            num_labels=len(TASK_LABELS[option]),
                                            output_attentions=True)
        # print('before load', config)
        model = modelcreator(config, option=option, dropout=args.dropout,
                             gpu=using_GPU, seed=args.seed,
                             do_lower_case=args.do_lower_case)

        # freeze the parameters of BERT
        if args.frozen:
            for param in model.bert.parameters():
                param.requires_grad = False
        print_params(model)

        # optimizer and warmup schedule
        model_params = list(model.named_parameters())
        # print_params(model)
        # set the weight decay of LayerNorm and bias to zero
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [param for name, param in model_params
                        if not any(nd in name for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [param for name, param in model_params
                        if any(nd in name for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        if args.optimizer == 'Adam':
            optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate)
        elif args.optimizer == 'AdamW':
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                              correct_bias=args.correct_bias)
        elif args.optimizer == 'SGD':
            optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=args.learning_rate,
                                        momentum=0.5)

        scheduler = None
        if args.warmup_proportion != 0:
            num_total_steps = int(len(train) / args.batch_size) * args.epochs
            # 1. implements AdamW without compensation for the bias
            # 2. implements the weight decay fix
            if args.warmup_schedules == 'linear':
                scheduler = WarmupLinearSchedule(optimizer,
                                                 warmup_steps=num_total_steps * args.warmup_proportion,
                                                 t_total=num_total_steps, last_epoch=-1)
            elif args.warmup_schedules == 'constant':
                scheduler = WarmupConstantSchedule(optimizer,
                                                   warmup_steps=num_total_steps * args.warmup_proportion,
                                                   t_total=num_total_steps, last_epoch=-1)
            elif args.warmup_schedules == 'cosine':
                scheduler = WarmupCosineSchedule(optimizer,
                                                 warmup_steps=num_total_steps * args.warmup_proportion,
                                                 t_total=num_total_steps, cycles=0.5, last_epoch=-1)

        # data sampling
        train_dataloader = dataloader(train, MAX_SEQ_LENGTH, model.tokenizer,
                                      args.batch_size, is_sample=args.sample)
        dev_dataloader = dataloader(dev, MAX_SEQ_LENGTH, model.tokenizer, args.batch_size)

        # reload for pretraining
        model.set_focal_loss(alpha=class_weights, gamma=args.gamma)
        model.load_model(args.model_load, args.model_dir)
        model_saved_path = do_train(model, train_dataloader, dev_dataloader, args.epochs,
                                    optimizer, scheduler, args.dataset_size, args.early_stop,
                                    args.print_step, args.gradient_accumulation_steps,
                                    args.batch_size, args.learning_rate, model_path=PATH_CONFIG)
        test_predictions(model, test, model_saved_path[:-1] + ".csv", args.batch_size)
    elif args.model_dir:
        config = BertConfig.from_pretrained(PRETRAINED_WEIGHTS,
                                            num_labels=len(TASK_LABELS[option]),
                                            output_attentions=True)
        model = modelcreator(config, option=option, dropout=args.dropout,
                             gpu=using_GPU, seed=args.seed,
                             do_lower_case=args.do_lower_case)
        model.set_focal_loss(alpha=class_weights, gamma=args.gamma)
        model.load_model(args.model_load, args.model_dir)
        # model_dir = "./results/B64_lr1e-05_s0.01_0819_2023/"
        test_predictions(model, test, args.model_dir[:-1] + ".csv",
                         batch_size=args.batch_size)
def __init__(self, args, device, checkpoint, lamb=0.8):
    super(ExtSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.lamb = lamb
    # if args.

    # bert
    self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)

    # Extraction layer.
    self.ext_layer = ExtTransformerEncoder(
        self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
        args.ext_dropout, args.ext_layers)

    if args.encoder == 'baseline':
        bert_config = BertConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.ext_hidden_size,
            num_hidden_layers=args.ext_layers,
            num_attention_heads=args.ext_heads,
            intermediate_size=args.ext_ff_size)
        self.bert.model = BertModel(bert_config)
        self.ext_layer = Classifier(self.bert.model.config.hidden_size)

    if args.max_pos > 512:
        my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings

    # Initialize the parameters for the informativeness / relevance / novelty scores.
    hidden_size = self.bert.model.config.hidden_size
    self.W_cont = nn.Parameter(torch.Tensor(1, hidden_size))
    self.W_sim = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.Sim_layer = nn.Linear(hidden_size, hidden_size)
    self.W_rel = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.Rel_layer = nn.Linear(hidden_size, hidden_size)
    self.W_novel = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
    self.b_matrix = nn.Parameter(torch.Tensor(1, 1))
    self.q_transform = nn.Linear(100, 1)
    self.bq = nn.Parameter(torch.Tensor(1, 1))
    self.brel = nn.Parameter(torch.Tensor(1, 1))
    self.bsim = nn.Parameter(torch.Tensor(1, 1))
    self.bcont = nn.Parameter(torch.Tensor(1, 1))

    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
        print("checkpoint loaded! ")
    else:
        if args.param_init != 0.0:
            for p in self.ext_layer.parameters():
                p.data.uniform_(-args.param_init, args.param_init)
        if args.param_init_glorot:
            for p in self.ext_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in self.Rel_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in self.Sim_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            nn.init.xavier_uniform_(self.bq)
            nn.init.xavier_uniform_(self.W_cont)
            nn.init.xavier_uniform_(self.W_sim)
            nn.init.xavier_uniform_(self.W_rel)
            nn.init.xavier_uniform_(self.W_novel)
            nn.init.xavier_uniform_(self.b_matrix)
            nn.init.xavier_uniform_(self.bcont)
            nn.init.xavier_uniform_(self.brel)
            nn.init.xavier_uniform_(self.bsim)

    self.to(device)
LABEL.build_vocab(train_data)

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=DEVICE)

model = BertForSequenceClassification(
    BertConfig(vocab_size=MAX_VOCAB_SIZE,
               max_position_embeddings=512,
               intermediate_size=1024,
               hidden_size=512,
               num_attention_heads=8,
               num_hidden_layers=6,
               type_vocab_size=5,
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               num_labels=2))
model.to(DEVICE)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.bert.embeddings.word_embeddings.weight.data[PAD_IDX] = torch.zeros(512)
model.bert.embeddings.word_embeddings.weight.data[UNK_IDX] = torch.zeros(512)

print(f'Parameter: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')
def __init__(self, config):
    super(AbsSummarizer, self).__init__()
    self.config = config
    self.encoder = Bert(large=False, temp_dir='../temp', finetune=config.finetune_bert)

    if config.encoder == 'Transformer':
        '''
        "attention_probs_dropout_prob": 0.1,
        "finetuning_task": null,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "num_labels": 2,
        "output_attentions": false,
        "output_hidden_states": false,
        "pruned_heads": {},
        "torchscript": false,
        "type_vocab_size": 2,
        "vocab_size": 30522
        '''
        bert_config = BertConfig(
            self.encoder.model.config.vocab_size,
            hidden_size=config.enc_hidden_size,
            num_hidden_layers=config.enc_layers,
            num_attention_heads=config.enc_heads,
            intermediate_size=config.enc_ff_size,
            hidden_dropout_prob=config.enc_dropout,
            attention_probs_dropout_prob=config.enc_dropout)
        self.encoder.model = BertModel(bert_config)

    if config.max_pos > 512:
        my_pos_embeddings = nn.Embedding(config.max_pos, self.encoder.model.config.hidden_size)
        my_pos_embeddings.weight.data[:512] = self.encoder.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[512:] = self.encoder.model.embeddings.position_embeddings.weight.data[-1][None, :].repeat(config.max_pos - 512, 1)
        self.encoder.model.embeddings.position_embeddings = my_pos_embeddings
        print(my_pos_embeddings.weight.data.shape)

    self.vocab_size = self.encoder.model.config.vocab_size
    tgt_embeddings = nn.Embedding(self.vocab_size, self.encoder.model.config.hidden_size, padding_idx=0)
    if config.share_emb:
        tgt_embeddings.weight = copy.deepcopy(self.encoder.model.embeddings.word_embeddings.weight)

    self.decoder = TransformerDecoder(config.dec_layers,
                                      config.dec_hidden_size,
                                      heads=config.dec_heads,
                                      d_ff=config.dec_ff_size,
                                      dropout=config.dec_dropout,
                                      embeddings=tgt_embeddings)

    for module in self.decoder.modules():
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    if config.use_bert_emb:
        tgt_embeddings = nn.Embedding(self.vocab_size, self.encoder.model.config.hidden_size, padding_idx=0)
        tgt_embeddings.weight = copy.deepcopy(self.encoder.model.embeddings.word_embeddings.weight)
        self.decoder.embeddings = tgt_embeddings

    self.word_prob = WordProbLayer(config, self.vocab_size, config.dec_hidden_size,
                                   self.decoder.embeddings, copy=config.copy)
    for p in self.word_prob.parameters():
        if p.dim() > 1:
            xavier_uniform_(p)
        else:
            p.data.zero_()
        )
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
        mean_pooled_output = torch.mean(sequence_output, dim=1)
        mean_pooled_output = self.dropout(mean_pooled_output)
        logits = self.classifier(mean_pooled_output)
        outputs = (prediction_scores, seq_relationship_score, logits)
        return outputs


config = BertConfig(str(PATH_TO_CKPT_CONFIG / "config.json"))
model = BertPretrain(config, len(TARGETS))

# Prepare the extended BERT embedding
orig_bert = BertForPreTraining.from_pretrained("bert-base-cased")
orig_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
state_dict = orig_bert.state_dict()
del state_dict["cls.predictions.decoder.weight"], state_dict["cls.predictions.bias"]
orig_embedding = state_dict["bert.embeddings.word_embeddings.weight"]

extra_tokens = list(tokenizer.vocab.keys())[len(orig_tokenizer.vocab):]
new_tokens_as_orig_indices = [[i] for i in range(len(orig_tokenizer.vocab))] + [
    orig_tokenizer.encode(t, add_special_tokens=False) for t in extra_tokens
]
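# The snippet stops before the extended embedding matrix is actually built. A minimal sketch of
# the usual next step, assuming the names above (the mean-of-subwords rule and the random fallback
# are assumptions, not taken from the original code):
import torch

hidden_size = orig_embedding.size(1)
new_embedding = torch.zeros(len(tokenizer.vocab), hidden_size)
for new_idx, orig_indices in enumerate(new_tokens_as_orig_indices):
    if orig_indices:
        # Original rows are copied verbatim; each new token gets the mean of its subword vectors.
        new_embedding[new_idx] = orig_embedding[orig_indices].mean(dim=0)
    else:
        new_embedding[new_idx].normal_(mean=0.0, std=0.02)  # fall back to random init

state_dict["bert.embeddings.word_embeddings.weight"] = new_embedding
# The resized state_dict could then be loaded into the extended model, e.g.
# (exact attribute names depend on BertPretrain):
# model.bert.load_state_dict(state_dict, strict=False)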