def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
    if 'roberta' in model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_path)
        config = RobertaConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
    elif 'electra_multitask' in model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
        tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
        config = ElectraConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        config.num_regs = num_regs
        config.vocab_size = len(tokenizer)
        model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
    elif 'electra' in model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
        config = ElectraConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
    else:
        raise NotImplementedError()
    model.eval()
    model.to(self.device)
    return config, tokenizer, model
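# Hedged usage sketch (not part of the original source): _init_deep_model only reads
# `self.device`, so any object carrying a `device` attribute works here; the model path
# and label count below are illustrative.
import argparse
_host = argparse.Namespace(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
config, tokenizer, model = _init_deep_model(_host, model_type='electra',
                                            model_path='google/electra-small-discriminator',
                                            num_labels=2)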
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):
    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig.from_json_file(train_cfg)
    model_cfg = ElectraConfig.from_json_file(model_cfg)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))
    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]
    data_iter = SentPairDataLoader(data_file, train_cfg.batch_size, tokenize, max_len, pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra.
    generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained('google/electra-base-discriminator')
    # Pick the student class first, then load weights; `model_cfg` is the config
    # used by "QuantizedElectraForPreTraining".
    s_discriminator_cls = QuantizedElectraForPreTraining if quantize else ElectraForPreTraining
    s_discriminator = s_discriminator_cls.from_pretrained('google/electra-small-discriminator', config=model_cfg)
    model = DistillElectraForPreTraining(generator, t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX
    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):
    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig.from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig.from_json_file(train_cfg)
    model_cfg = ElectraConfig.from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task)  # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
    ]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX
    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        # The original unpacked the TokenIndexing pipeline class itself, which cannot work.
        # Assumed fix: each processed dataset item yields
        # (input_ids, attention_mask, token_type_ids, label_id) tensors, which we stack here.
        input_ids, attention_mask, token_type_ids, label_ids = map(torch.stack, zip(*data_set))
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask, token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise Exception("Unknown model size: expected one of 'debug', 'tiny', 'small', 'base'")

    generator_config = ElectraConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )
    discriminator_config = ElectraConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
    )
    model = Electra(args, gen_config=generator_config, dis_config=discriminator_config)
    return model
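# Hedged usage sketch (not in the original source): `args` only needs the fields that
# get_model reads; the sizes and gen_ratio below are illustrative values.
import argparse
example_args = argparse.Namespace(model_size='small', seq_length=128, vocab_size=30522, gen_ratio=4)
example_model = get_model(example_args)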
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    attention_mask = None
    if self.use_attention_mask:
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    config = ElectraConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        embedding_size=self.embedding_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
    )
    return config, input_ids, token_type_ids, attention_mask
def _load_model(self):
    config = ElectraConfig.from_pretrained(self.backbone)
    p_encoder = ElectraEncoder.from_pretrained(self.backbone, config=config).cuda()
    q_encoder = ElectraEncoder.from_pretrained(self.backbone, config=config).cuda()
    return p_encoder, q_encoder
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = ElectraConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
    )
    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path,
                                     discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError(
            "The discriminator_or_generator argument should be either 'discriminator' or 'generator'"
        )

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
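# Hedged CLI sketch (not in the original source), modeled on the usual transformers
# conversion scripts; the flag names simply mirror the function's parameters.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--config_file", type=str, required=True,
                        help="Path to the ElectraConfig JSON file.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted PyTorch weights.")
    parser.add_argument("--discriminator_or_generator", type=str, required=True,
                        help="Whether to export the 'discriminator' or the 'generator'.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file,
                                     args.pytorch_dump_path, args.discriminator_or_generator)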
def __init__(self):
    self.root_path = '..'
    self.checkpoint_path = f"{self.root_path}/checkpoint"
    self.save_ckpt_path = f"{self.checkpoint_path}/koelectra-wellnesee-text-classification.pth"
    model_name_or_path = "monologg/koelectra-base-discriminator"

    # Load the answers and categories
    self.category, self.answer = load_wellness_answer()

    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(ctx)

    # Load the saved checkpoint
    checkpoint = torch.load(self.save_ckpt_path, map_location=self.device)

    # Electra tokenizer
    self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    electra_config = ElectraConfig.from_pretrained(model_name_or_path)
    self.model = koElectraForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        config=electra_config,
        num_labels=359)
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.to(self.device)
    self.model.eval()
def load_electra_model(self):
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.output_encoded_layers = True
    args.output_attention_layers = True
    args.output_att_score = True
    args.output_att_sum = True
    self.args = args

    # Parse the config file; the teacher and student models share the same vocab.
    # Here we use the teacher's config and the fine-tuned teacher model; the student's
    # config and the distilled student model can be swapped in instead:
    #   student config: config/chinese_bert_config_L4t.json
    #   distilled student model: distil_model/gs8316.pkl
    bert_config_file_S = self.model_conf
    tuned_checkpoint_S = self.model_file

    # Load the student config; its max sequence length must not exceed the one in our configuration.
    bert_config_S = ElectraConfig.from_json_file(bert_config_file_S)
    bert_config_S.num_labels = self.num_labels

    # Load the tokenizer
    self.predict_tokenizer = BertTokenizer(vocab_file=self.vocab_file)

    # Load the model
    self.predict_model = ElectraSPC(bert_config_S)
    assert os.path.exists(tuned_checkpoint_S), "Model file does not exist, please check"
    state_dict_S = torch.load(tuned_checkpoint_S, map_location=self.device)
    self.predict_model.load_state_dict(state_dict_S)
    if self.verbose:
        print("Model loaded")
    logger.info(f"Prediction model {tuned_checkpoint_S} loaded")
def get_model_and_tokenizer(model_name, device):
    save_ckpt_path = CHECK_POINT[model_name]
    if model_name == "koelectra":
        model_name_or_path = "monologg/koelectra-base-discriminator"
        tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
    elif model_name == 'kobert':
        tokenizer = get_tokenizer()
        model = KoBERTforSequenceClassfication()
    else:
        raise ValueError(f"Unknown model name: {model_name}")

    if os.path.isfile(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        pre_epoch = checkpoint['epoch']
        # pre_loss = checkpoint['loss']
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}")

    return model, tokenizer
def __init__(self, root_path='../ai/chatbot'):
    checkpoint_path = f"{root_path}/checkpoint"
    self.model_path = f"{checkpoint_path}/koelectra-wellness-text-classification.pth"
    model_name_or_path = "monologg/koelectra-base-discriminator"

    checkpoint = torch.load(self.model_path, map_location=device)
    electra_config = ElectraConfig.from_pretrained(model_name_or_path)
    self.model = koElectraForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        config=electra_config,
        num_labels=359)
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.to(device)
    self.model.eval()
    self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    # Build the category list: one entry per distinct label index in the data file.
    self.category = []
    idx = -1
    with open(root_path + '/data/wellness_data_for_text_classification.txt', 'r') as f:
        for line in f:
            datas = line.strip().split("\t")
            if datas[1] != str(idx):
                self.category.append(datas[2])
                idx += 1
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:  # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()

    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))

    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
        probs = torch.softmax(logits, dim=-1)
        maxp, maxi = torch.max(probs, dim=-1)
        result = [(_i, _p) for _p, _i in zip(maxp, maxi)]
        all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type),
              'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(
                test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
def __init__(self, output_size=24005, device='cpu'):
    super().__init__()
    self.device = device
    config = ElectraConfig.from_pretrained('google/electra-small-discriminator')
    self.electra = AutoModel.from_config(config).to(device)
    self.output = nn.Linear(self.electra.config.hidden_size, output_size).to(device)
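# Hedged sketch (not in the original source): a minimal forward pass for the class above,
# assuming the classifier reads the first token's hidden state from the ELECTRA encoder.
def forward(self, input_ids, attention_mask=None):
    hidden = self.electra(input_ids, attention_mask=attention_mask).last_hidden_state
    return self.output(hidden[:, 0, :])  # (batch, output_size) logits from the [CLS] position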
def revise_config(config: ElectraConfig, args: argparse.Namespace):
    """Revise config as we want.

    1. Add multiplier between generator and discriminator
    2. Degree of weight sharing
        'no'        : Share nothing
        'embedding' : Share only the embedding layer
        'all'       : Share all layers
    3. Set configuration as electra-small
    """
    config.multiplier_generator_and_discriminator = args.multiplier_generator_and_discriminator
    config.weight_sharing_degree = args.weight_sharing_degree
    config.rtd_loss_weight = args.rtd_loss_weight
    config.generator_num_hidden_layers = args.generator_num_hidden_layers
    config.save_log_steps = args.save_log_steps
    return config
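# Hedged usage sketch (not part of the original source): argparse.Namespace stands in for
# the script's parsed arguments, and every value below is illustrative.
import argparse
example_config = revise_config(
    ElectraConfig.from_pretrained('google/electra-small-discriminator'),
    argparse.Namespace(
        multiplier_generator_and_discriminator=50.0,
        weight_sharing_degree='embedding',
        rtd_loss_weight=50.0,
        generator_num_hidden_layers=12,
        save_log_steps=100,
    ),
)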
def __init__(self):
    super(ElectraEncoder, self).__init__()
    self.config = ElectraConfig.from_pretrained(
        os.path.join('../pretrained', 'electra', 'config.json'))
    # The checkpoint is a TF weights file, so from_tf=True is needed to load it
    # into the PyTorch ElectraModel.
    self.net = ElectraModel.from_pretrained(
        os.path.join('../pretrained', 'electra', 'tf_model.h5'),
        config=self.config, from_tf=True)
    print(self.net)
def get_electra():
    ids = keras.layers.Input(shape=(None,), dtype=tf.int32, name='ids')
    att = keras.layers.Input(shape=(None,), dtype=tf.int32, name='att')
    tok_type_ids = keras.layers.Input(shape=(None,), dtype=tf.int32, name='tti')

    config = ElectraConfig.from_pretrained(Config.Electra.config)
    electra_model = TFElectraModel.from_pretrained(Config.Electra.model, config=config)
    x = electra_model(ids, attention_mask=att, token_type_ids=tok_type_ids)

    # First output head ('sts')
    x1 = keras.layers.Dropout(0.15)(x[0])
    x1 = keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(64, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(32, 2, padding='same')(x1)
    x1 = keras.layers.Conv1D(1, 1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1)

    # Second output head ('ets')
    x2 = keras.layers.Dropout(0.15)(x[0])
    x2 = keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(64, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(32, 2, padding='same')(x2)
    x2 = keras.layers.Conv1D(1, 1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids], outputs=[x1, x2])

    optimizer = keras.optimizers.Adam(learning_rate=6e-5)
    if Config.Train.use_amp:
        optimizer = keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
    loss = keras.losses.CategoricalCrossentropy(label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)
    return model
def bert_config(self):
    if self.bert_model_name.startswith('bert-'):
        return BertConfig.from_pretrained(self.bert_model_name, cache_dir=self.bert_cache_dir)
    # Check the specific 'xlm-roberta-' prefix before the generic 'roberta' substring,
    # otherwise XLM-R names would be caught by the RoBERTa branch.
    elif self.bert_model_name.startswith('xlm-roberta-'):
        return XLMRobertaConfig.from_pretrained(self.bert_model_name, cache_dir=self.bert_cache_dir)
    elif 'roberta' in self.bert_model_name:
        return RobertaConfig.from_pretrained(self.bert_model_name, cache_dir=self.bert_cache_dir)
    elif 'electra' in self.bert_model_name:
        return ElectraConfig.from_pretrained(self.bert_model_name, cache_dir=self.bert_cache_dir)
    else:
        raise ValueError('Unknown model: {}'.format(self.bert_model_name))
def get_config(self):
    return ElectraConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
    )
def define_config(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base",
            "monologg/kobert",
    ]:
        return BertConfig.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base",
    ]:
        return ElectraConfig.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaConfig.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelConfig.from_pretrained(name)
    else:
        raise ValueError(f"Unknown model name: {name}")
def _get_bert(model_type, model_path_dict):
    if model_type == 'bert':
        config = BertConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = BertModel.from_pretrained(model_path_dict['model'], config=config)
    elif model_type == 'electra':
        config = ElectraConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = ElectraModel.from_pretrained(model_path_dict['model'], config=config)
    elif model_type == 'roberta':
        config = RobertaConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = RobertaModel.from_pretrained(model_path_dict['model'], config=config)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    return bert, config
def __call_model_torch(self):
    if self.model_to_use.lower() == 'bert':
        self.config = BertConfig(num_labels=2)
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=self.config)
    elif self.model_to_use.lower() == 'albert':
        self.config = AlbertConfig(num_labels=2)
        self.model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1', config=self.config)
    elif self.model_to_use.lower() == 'electra':
        self.config = ElectraConfig(num_labels=2)
        self.model = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator', config=self.config)
    elif self.model_to_use.lower() == 'distilbert':
        self.config = DistilBertConfig(num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=self.config)
    else:
        print('Model not available yet.')
def load_model(dataBunch, pretrained_path, finetuned_wgts_path, device, multi_label):
    model_type = dataBunch.model_type

    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = "cpu"

    if finetuned_wgts_path:
        model_state_dict = torch.load(finetuned_wgts_path, map_location=map_location)
    else:
        model_state_dict = None

    if multi_label:
        config_class, model_class, _ = MODEL_CLASSES[model_type]
        config = config_class.from_pretrained(
            str(pretrained_path), num_labels=len(dataBunch.labels))
        model = model_class[1].from_pretrained(
            str(pretrained_path), config=config, state_dict=model_state_dict)
    else:
        if model_type == "electra":
            config = ElectraConfig.from_pretrained(
                str(pretrained_path),
                model_type=model_type,
                num_labels=len(dataBunch.labels),
            )
        else:
            config = AutoConfig.from_pretrained(
                str(pretrained_path),
                model_type=model_type,
                num_labels=len(dataBunch.labels),
            )
        model = AutoModelForSequenceClassification.from_pretrained(
            str(pretrained_path), config=config, state_dict=model_state_dict)

    return model.to(device)
def __init__(self, config: dict):
    super(Model, self).__init__()
    self.electra_cfg = ElectraConfig()
    self.electra = ElectraModel.from_pretrained(
        config["pretrained_dir"] + "electra_small.index",
        config=self.electra_cfg, from_tf=True)
    self.sentence_encoder = AttentionSentenceEncoder(
        self.electra_cfg.hidden_size, config["sent_head"],
        config["max_sents"] + 1)  # one extra slot for the CLS token
    self.img_encoder = SimpleImageEncoder(
        config["img_input_size"], config["img_output_size"],
        config["img_num"], dropout=config["dropout"])
    self.output_layer = OutputLayer(
        config["task"],
        self.electra_cfg.hidden_size + config["img_output_size"],
        config["output_size"], config["dropout"])
def __init__(self, params, name="model", **kwargs):
    super(NERwithHFBERT, self).__init__(params, name=name, **kwargs)
    self._tag_string_mapper = get_sm(self._params.tags_fn_)
    self.tag_vocab_size = self._tag_string_mapper.size() + 2
    self._tracked_layers = dict()

    if self.pretrained_bert is None:
        if self._params.use_hf_electra_model_:
            self.pretrained_bert = TFElectraModel(
                ElectraConfig.from_pretrained(params.pretrained_hf_model_, cache_dir=params.hf_cache_dir_))
        else:
            self.pretrained_bert = TFBertModel(
                BertConfig.from_pretrained(params.pretrained_hf_model_, cache_dir=params.hf_cache_dir_))
    self._dropout = tf.keras.layers.Dropout(self._params.dropout_last)

    if self._params.bet_tagging_:
        # half of the classes are used, plus the O class, SOS and EOS
        self._layer_cls = tf.keras.layers.Dense(
            int(self._tag_string_mapper.size() // 2 + 3),
            activation=tf.keras.activations.softmax,
            name="layer_cls",
        )
        self._layer_start = tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid, name="layer_start")
        self._layer_end = tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid, name="layer_end")
    elif self._params.use_crf:
        self._last_layer = tf.keras.layers.Dense(self.tag_vocab_size, name="last_layer")
        self._trans_params = tf.keras.layers.Embedding(
            self.tag_vocab_size, self.tag_vocab_size, name="trans_params"
        )  # embeddings_initializer=tf.keras.initializers.Constant(1)
        if self._params.crf_with_ner_rule:
            self._penalty_factor = tf.keras.layers.Embedding(1, 1, name="penalty_factor")
            self._penalty_absolute = tf.keras.layers.Embedding(1, 1, name="penalty_absolute")
        elif self._params.crf_with_ner_forb_trans:
            self._penalty_factor = tf.constant(0.0, name="penalty_factor", dtype=tf.float32)
            self._penalty_absolute = tf.constant(-100000.0, name="penalty_absolute", dtype=tf.float32)
        self.init_crf_with_ner_rule((self.tag_vocab_size - 3) // 2)
    else:
        self._last_layer = tf.keras.layers.Dense(
            self.tag_vocab_size, activation=tf.keras.activations.softmax, name="last_layer"
        )
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="The input training file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="The input evaluation file. If a data dir is specified, will look for the file there. "
                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--version_2_with_negative", action="store_true",
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", default=True, action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action="store_true",
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=10000, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' + str(device) + '\n')
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) + ' / device : ' + str(device) + '\n')
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")
    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(args.model_name_or_path, config=config)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=False)
    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum
    # if args.fp16 is set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations.
    # Note that running `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def get_model(args, tokenizer):
    config = ElectraConfig.from_pretrained('google/electra-base-discriminator')
    config.num_labels = 4
    config.vocab_size = tokenizer.get_vocab_size() if tokenizer else args.vocab_size
    model = ElectraForSequenceClassification(config)
    return model
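# Hedged usage sketch (not in the original source): `tokenizer` may be a
# `tokenizers.Tokenizer` (which exposes get_vocab_size()), or None to fall
# back on args.vocab_size; the vocab size below is illustrative.
import argparse
example_model = get_model(argparse.Namespace(vocab_size=30522), tokenizer=None)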
    c.lr = 1e-4
    c.layer_lr_decay = 0.8
    c.max_length = 512
elif c.size == "large":
    c.lr = 5e-5
    c.layer_lr_decay = 0.9
    c.max_length = 512
else:
    raise ValueError(f"Invalid size {c.size}")
if c.pretrained_checkpoint is None:
    c.max_length = 512  # All public models are ++, which use max_length 512

# huggingface/transformers
hf_tokenizer = ElectraTokenizerFast.from_pretrained(f"google/electra-{c.size}-discriminator")
electra_config = ElectraConfig.from_pretrained(f"google/electra-{c.size}-discriminator")

# wsc
if c.wsc_trick:
    from _utils.wsc_trick import *  # importing spacy model takes time

# logging
# This light logging callback only logs the last score, to avoid exceeding the API access limit.
if c.logger == "neptune":
    import neptune
    from fastai.callback.neptune import NeptuneCallback

    class LightNeptuneCallback(NeptuneCallback):
        def after_batch(self):
            pass
import os

import tensorflow as tf
from transformers import (
    ElectraConfig,
    ElectraTokenizer,
    TFElectraForMaskedLM,
    TFElectraForPreTraining,
)

from electra.utils import colorize_dis, colorize_gen

os.environ["CUDA_VISIBLE_DEVICES"] = ""

# TODO: Should I use bert-base-uncased?
tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")
gen_config = ElectraConfig.from_pretrained("google/electra-small-generator")
dis_config = ElectraConfig.from_pretrained("google/electra-small-discriminator")
# gen = TFElectraForMaskedLM.from_pretrained("google/electra-small-generator")
# dis = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
gen = TFElectraForMaskedLM(config=gen_config)
dis = TFElectraForPreTraining(config=dis_config)
optimizer = tf.keras.optimizers.Adam(lr=1e-4)

# Load in WikiText-2.
filename = "/fsx/wikitext/wikitext-2-raw/wiki.test.raw"
with open(filename) as infile:
    wiki_text: str = infile.read()  # length 1,288,556

# Load in text strings.
save_ckpt_path = f"{checkpoint_path}/koelectra-wellnesee-text-classification.pth"
model_name_or_path = "monologg/koelectra-base-discriminator"

# Load the answers and categories
category, answer = load_wellness_answer()

ctx = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(ctx)

# Load the saved checkpoint
checkpoint = torch.load(save_ckpt_path, map_location=device)

# Electra tokenizer
tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

electra_config = ElectraConfig.from_pretrained(model_name_or_path)
model = koElectraForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=electra_config,
    num_labels=359)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

while True:
    sent = input('\nQuestion: ')  # e.g. '요즘 기분이 우울한 느낌이에요' ("I've been feeling down lately")
    data = koelectra_input(tokenizer, sent, device, 512)
    # print(data)
    output = model(**data)