def __init__(self, split, domain, max_src_length, max_tgt_length, ignore_index=-100, n_docs=None):
    """Load one dataset split and flatten it into per-aspect examples.

    Args:
        split: Split name; used as the file stem in ``data/{domain}/{split}.json``.
        domain: Domain name; used as the sub-directory in that path.
        max_src_length: Stored on the instance as ``max_src_length``.
        max_tgt_length: Stored on the instance as ``max_tgt_length``.
        ignore_index: Stored on the instance as ``ignore_index``.
        n_docs: Keep only the first ``n_docs`` documents; ``None`` keeps all.

    Raises:
        OSError: If the split file cannot be opened.
        json.JSONDecodeError: If the split file is not valid JSON.
    """
    self.tokenizer = get_kobart_tokenizer()
    self.max_src_length = max_src_length
    self.max_tgt_length = max_tgt_length
    self.ignore_index = ignore_index
    self.bos_token = '<s>'
    self.eos_token = '</s>'

    data_path = f'data/{domain}/{split}.json'
    # Explicit UTF-8 keeps decoding independent of the platform locale, and
    # the context manager guarantees the handle is closed.
    with open(data_path, encoding='utf-8') as data_file:
        docs = json.load(data_file)
    # Slicing with None is a no-op, so n_docs=None keeps every document.
    docs = docs[:n_docs]

    # One example per (document, aspect summary) pair.
    self._examples = [
        {
            'aspect': asp_sum['aspect'],
            'rel_words': asp_sum['rel_words'],
            'document': doc['document'],
            'summary': asp_sum['summary'],
        }
        for doc in docs
        for asp_sum in doc['aspect_summaries']
    ]
def main(log_path, wiki_sup=True):
    """Generate summaries for test examples and write them next to the model.

    Loads the model from ``log_path`` and writes three files into that same
    directory: ``test.source`` (inputs), ``test.gold`` (references) and
    ``test.hypo`` (generated text). Each hypothesis is also echoed to stdout.

    Args:
        log_path: Directory passed to ``from_pretrained`` and used as the
            output directory for the three result files.
        wiki_sup: When truthy, ``supervisions/supervisor.pickle`` is loaded.
    """
    # NOTE(review): the loaded supervisor is not used anywhere below; the load
    # is kept so a missing pickle file still fails as it did before.
    supervisor = None
    if wiki_sup:
        # NOTE: pickle can execute arbitrary code on load; only point this at
        # trusted files.
        with open('supervisions/supervisor.pickle', 'rb') as sup_file:
            supervisor = pickle.load(sup_file)

    dataset = SummaryDataset(split='test', domain='earphone',
                             max_src_length=512, max_tgt_length=MAX_LEN)
    test_examples = list(dataset)

    tokenizer = get_kobart_tokenizer()
    bart = BartForConditionalGeneration.from_pretrained(f'{log_path}')

    # NOTE(review): batch offsets only range over the first 10 examples; this
    # looks like a debugging limit -- confirm before relying on full output.
    n_limited = len(test_examples[:10])

    # All three outputs are written as UTF-8 (previously only test.hypo was),
    # and the context manager closes them even if generation raises.
    with open(f'{log_path}/test.source', 'w', encoding='utf-8') as src_file, \
            open(f'{log_path}/test.gold', 'w', encoding='utf-8') as gold_file, \
            open(f'{log_path}/test.hypo', 'w', encoding='utf-8') as hypo_file:
        for i in trange(0, n_limited, BATCH_SIZE, desc='Generating'):
            batch_examples = test_examples[i:i + BATCH_SIZE]
            for example in batch_examples:
                # Examples are generated one at a time; unsqueeze(0) adds the
                # leading batch dimension.
                output = bart.generate(example['input_ids'].unsqueeze(0),
                                       max_length=MAX_LEN,
                                       num_beams=BEAM_SIZE,
                                       no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
                                       length_penalty=LEN_PENALTY)
                output = tokenizer.decode(output[0], skip_special_tokens=True)

                # Blank-line separators are flattened so each source stays on
                # a single output line.
                print(example['src'].replace('\n\n', ' ||| '), file=src_file)
                print(example['tgt'], file=gold_file)
                print(output, file=hypo_file)
                print('\n', output)
def __init__(self, hparams, **kwargs):
    """Initialise the base class, then attach the BART model and tokenizer.

    Args:
        hparams: Forwarded unchanged to the parent initialiser.
        **kwargs: Forwarded unchanged to the parent initialiser.
    """
    super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)

    # Build the generation model from whatever get_pytorch_kobart_model()
    # returns, then call .train() on it.
    kobart_source = get_pytorch_kobart_model()
    self.model = BartForConditionalGeneration.from_pretrained(kobart_source)
    self.model.train()

    # Special-token settings kept on the instance.
    self.bos_token, self.eos_token = '<s>', '</s>'
    self.pad_token_id = 0

    self.tokenizer = get_kobart_tokenizer()
def preprocess_bert(config):
    """Build and write train/valid/test features plus the label file.

    The tokenizer is chosen from ``config['emb_class']`` (and
    ``config['use_kobart']`` for BART). Labels always come from the regular
    training file; training features come from ``args.augmented_filename``
    instead when ``args.augmented`` is set. Everything is written under
    ``args.data_dir``.

    Args:
        config: Mapping providing ``'args'`` and ``'emb_class'``, plus
            ``'use_kobart'`` when ``emb_class`` is ``'bart'``. It is also
            passed through to ``build_features``.
    """
    args = config['args']
    emb_class = config['emb_class']

    # Special-token overrides as (attribute, value) pairs, applied in order.
    angle_tokens = (
        ('cls_token', '<s>'),
        ('sep_token', '</s>'),
        ('pad_token', '<pad>'),
    )
    gpt_tokens = (
        ('bos_token', '<|startoftext|>'),
        ('eos_token', '<|endoftext|>'),
        ('cls_token', '<|startoftext|>'),
        ('sep_token', '<|endoftext|>'),
        ('pad_token', '<|pad|>'),
    )

    if emb_class == 'bart' and config['use_kobart']:
        from kobart import get_kobart_tokenizer
        tokenizer = get_kobart_tokenizer()
        overrides = angle_tokens
    elif emb_class == 'gpt':
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)
        overrides = gpt_tokens
    elif emb_class == 't5':
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)
        overrides = angle_tokens
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)
        overrides = ()
    for attr_name, token in overrides:
        setattr(tokenizer, attr_name, token)

    # build labels (always from the regular training file)
    labels = build_label(os.path.join(args.data_dir, _TRAIN_FILE))

    # build features: all three splits are built before anything is written
    train_name = args.augmented_filename if args.augmented else _TRAIN_FILE
    splits = (
        ('train', train_name),
        ('valid', _VALID_FILE),
        ('test', _TEST_FILE),
    )
    built = []
    for mode, file_name in splits:
        source_path = os.path.join(args.data_dir, file_name)
        features = build_features(source_path, tokenizer, labels, config, mode=mode)
        built.append((file_name, features))

    # write features next to their source file, with the feature suffix added
    for file_name, features in built:
        write_features(features, os.path.join(args.data_dir, file_name + _FSUFFIX))

    # write labels
    write_label(labels, os.path.join(args.data_dir, _LABEL_FILE))
def __init__(self, hparam=None, text_logger=None):
    """Create the wrapped BART model and tokenizer and keep the given handles.

    Args:
        hparam: Stored as ``self._hparams``.
        text_logger: Stored as ``self._text_logger``.
    """
    super(BART, self).__init__()

    # Model is built from whatever get_pytorch_kobart_model() returns, and
    # .train() is called on it straight away.
    kobart_source = get_pytorch_kobart_model()
    self._model = BartForConditionalGeneration.from_pretrained(kobart_source)
    self._model.train()

    self.tokenizer = get_kobart_tokenizer()
    self._hparams, self._text_logger = hparam, text_logger
def __init__(self, train_file, test_file, tok, max_len=512, batch_size=8, num_workers=5):
    """Record data-file paths, loader settings and the tokenizer.

    Args:
        train_file: Stored as ``self.train_file_path``.
        test_file: Stored as ``self.test_file_path``.
        tok: Tokenizer to use; when ``None`` one is obtained from
            ``get_kobart_tokenizer()``.
        max_len: Stored as ``self.max_len``.
        batch_size: Stored as ``self.batch_size``.
        num_workers: Stored as ``self.num_workers``.
    """
    super().__init__()

    # Data locations
    self.train_file_path = train_file
    self.test_file_path = test_file

    # Loader / sequence settings
    self.max_len = max_len
    self.batch_size = batch_size
    self.num_workers = num_workers

    # Fall back to the default tokenizer only when none was supplied.
    self.tok = get_kobart_tokenizer() if tok is None else tok
def __init__(self, data, max_seq_len=128):
    """Keep the raw data and sequence limit, and fetch a tokenizer.

    Args:
        data: Stored as ``self.data`` without modification.
        max_seq_len: Stored as ``self.max_seq_len``.
    """
    self.tokenizer = get_kobart_tokenizer()
    self.max_seq_len = max_seq_len
    self.data = data
def __init__(self, ckpt_path="./n_title_epoch_3"):
    """Load a BART checkpoint, move it with ``.cuda()`` and fetch a tokenizer.

    Args:
        ckpt_path: Location passed to ``from_pretrained``.
    """
    loaded = BartForConditionalGeneration.from_pretrained(ckpt_path)
    self.model = loaded.cuda()
    self.tokenizer = get_kobart_tokenizer()
def main():
    """Resolve the checkpoint directory, set up model and data, then train.

    If the checkpoint directory already exists, the latest saved model is
    loaded from it. Otherwise the directory is created and a fresh
    seq2seq model is built from ``get_pytorch_kobart_model()``. Train and
    test data are read from two JSON files, wrapped in datasets and loaders,
    and handed to ``train``.
    """
    # Get ArgParse
    args = get_args()

    if args.checkpoint:
        # Drop a single trailing slash from the user-supplied name. The
        # earlier code appended only the final character ("/") in that case,
        # which discarded the checkpoint name entirely.
        checkpoint_name = args.checkpoint
        if checkpoint_name[-1] == "/":
            checkpoint_name = checkpoint_name[:-1]
        args.checkpoint = "./model_checkpoint/" + checkpoint_name
    else:
        args.checkpoint = "./model_checkpoint/" + gen_checkpoint_id(args)

    if os.path.isdir(args.checkpoint):
        # Checkpoint directory exists: load the last saved model.
        # EXAMPLE: "{engine_name}_{task_name}_{timestamp}/saved_checkpoint_1"
        args.checkpoint_count = checkpoint_count(args.checkpoint)
        logger = get_logger(args)
        logger.info("Checkpoint path directory exists")
        logger.info(f"Loading model from saved_checkpoint_{args.checkpoint_count}")
        model = torch.load(f"{args.checkpoint}/saved_checkpoint_{args.checkpoint_count}")
        # Advance the counter so the loaded checkpoint's number is not reused.
        args.checkpoint_count += 1
    else:
        # No checkpoint yet: create the folder and train from scratch.
        try:
            os.makedirs(args.checkpoint)
        except OSError:
            # Still best-effort, but only filesystem errors are ignored now;
            # KeyboardInterrupt and programming errors propagate.
            print("Ignoring Existing File Path ...")

        model = AutoModelForSeq2SeqLM.from_pretrained(get_pytorch_kobart_model())

        args.checkpoint_count = 0
        logger = get_logger(args)
        logger.info(f"Creating a new directory for {args.checkpoint}")

    args.logger = logger
    model.to(args.device)

    # Define Tokenizer
    tokenizer = get_kobart_tokenizer()

    # Define Optimizer: the class is looked up by name on `transformers`.
    optimizer_class = getattr(transformers, args.optimizer_class)
    optimizer = optimizer_class(model.parameters(), lr=args.learning_rate)

    # NOTE(review): the message reports args.data_dir, but the paths below are
    # hard-coded -- confirm which one is intended.
    logger.info(f"Loading data from {args.data_dir} ...")

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open("data/Brunch_accm_20210328_train.json", 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    train_context = [item['text'] for item in train_data]
    train_tag = [item['tag'] for item in train_data]

    with open("data/Brunch_accm_20210328_test.json", 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    test_context = [item['text'] for item in test_data]
    test_tag = [item['tag'] for item in test_data]

    train_dataset = SummaryDataset(train_context, train_tag, tokenizer,
                                   args.enc_max_len, args.dec_max_len,
                                   ignore_index=-100)
    test_dataset = SummaryDataset(test_context, test_tag, tokenizer,
                                  args.enc_max_len, args.dec_max_len,
                                  ignore_index=-100)

    batch_generator = SummaryBatchGenerator(tokenizer)

    train_loader = get_dataloader(
        train_dataset,
        batch_generator=batch_generator,
        batch_size=args.train_batch_size,
        shuffle=True,
    )

    test_loader = get_dataloader(
        test_dataset,
        batch_generator=batch_generator,
        batch_size=args.eval_batch_size,
        shuffle=False,
    )

    train(model, optimizer, tokenizer, train_loader, test_loader, test_tag, args)
import torch
import streamlit as st
from kobart import get_kobart_tokenizer
from transformers.models.bart import BartForConditionalGeneration


@st.cache
def load_model():
    """Return the BART model loaded from ``./translation_binary``."""
    return BartForConditionalGeneration.from_pretrained('./translation_binary')


model = load_model()
tokenizer = get_kobart_tokenizer()

# Page header and input box
st.title("KoBART Translation Test")
text = st.text_area("한글 문장 입력:")

# Echo the raw input back to the page
st.markdown("### 한글 문장")
st.write(text)

if text:
    # Newlines are removed before encoding.
    text = text.replace('\n', '')
    st.markdown("### KoBART Translation 결과")
    with st.spinner('processing..'):
        # Encode, wrap in a tensor and add a leading batch dimension.
        input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
        output = tokenizer.decode(
            model.generate(input_ids, eos_token_id=1, max_length=512, num_beams=5)[0],
            skip_special_tokens=True,
        )
    st.write(output)
def __init__(self, filepath, max_seq_len=128):
    """Read a tab-separated file into a DataFrame and fetch a tokenizer.

    Args:
        filepath: Path handed to ``pd.read_csv`` (tab separator); also stored
            as ``self.filepath``.
        max_seq_len: Stored as ``self.max_seq_len``.
    """
    table = pd.read_csv(filepath, sep='\t')

    self.filepath = filepath
    self.data = table
    self.max_seq_len = max_seq_len
    self.tokenizer = get_kobart_tokenizer()
def load_model(config, checkpoint):
    """Rebuild a model from ``config`` and restore its state from ``checkpoint``.

    Steps, in order: load labels, construct the model selected by
    ``config['emb_class']`` / ``config['enc_class']``, optionally run the
    quantization prepare+convert steps, restore state from ``checkpoint``,
    and move the model to ``args.device``.

    Args:
        config: Mapping that provides ``'args'``, ``'emb_class'`` and
            ``'enc_class'`` (plus ``'use_kobart'`` when ``emb_class`` is
            ``'bart'``). It is mutated: ``'labels'`` is always written and
            ``'quantizer'`` is written when ``args.enable_diffq`` is set.
        checkpoint: Passed to ``model.load_state_dict`` or, with
            ``args.enable_diffq``, to ``quantizer.restore_quantized_state``.

    Returns:
        The model after ``.to(args.device)``.
    """
    args = config['args']
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    if config['emb_class'] == 'glove':
        # NOTE(review): these are independent `if`s with no fallback; when
        # enc_class matches none of them, `model` is left unassigned.
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size, emb_non_trainable=True)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size, emb_non_trainable=True)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size, emb_non_trainable=True)
    else:
        # Every branch below sets bert_tokenizer, bert_config and bert_model.
        if config['emb_class'] == 'bart' and config['use_kobart']:
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
            bert_config = bert_model.config
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            bert_model = AutoModel.from_pretrained(args.bert_output_dir)
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            # Constructed from the config only (no from_pretrained here).
            bert_model = T5EncoderModel(bert_config)
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(
                args.bert_output_dir)
            bert_config = AutoConfig.from_pretrained(args.bert_output_dir)
            # Constructed from the config only (no from_pretrained here).
            bert_model = AutoModel.from_config(bert_config)
        # TextBertCNN is the default; 'cls' and 'densenet-cnn' override it.
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer, label_size)
    if args.enable_qat:
        # This path requires the CPU device.
        assert args.device == 'cpu'
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # The bare string below is disabled code kept as a no-op expression.
        ''' # fuse if applicable # model = torch.quantization.fuse_modules(model, [['']]) '''
        # Order matters: prepare -> eval -> move to cpu -> convert. This runs
        # before the checkpoint is restored further down.
        model = torch.quantization.prepare_qat(model)
        model.eval()
        model.to('cpu')
        logger.info("[Convert to quantized model with device=cpu]")
        model = torch.quantization.convert(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        # The empty-string key carries the default QAT qconfig.
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
        logger.info("[Convert to quantized model]")
        model = quantize_fx.convert_fx(model)
    if args.enable_diffq:
        # With diffq the checkpoint is restored through the quantizer, which
        # is also exposed to callers via config['quantizer'].
        quantizer = DiffQuantizer(model)
        config['quantizer'] = quantizer
        quantizer.restore_quantized_state(checkpoint)
    else:
        model.load_state_dict(checkpoint)
    model = model.to(args.device)
    # The bare string below is disabled debugging code (no-op expression).
    ''' for name, param in model.named_parameters(): print(name, param.data, param.device, param.requires_grad) '''
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[Model loaded]")
    return model
def prepare_model(config, bert_model_name_or_path=None):
    """Construct the model described by ``config``.

    Steps, in order: load labels, construct the model selected by
    ``config['emb_class']`` / ``config['enc_class']``, optionally restore a
    checkpoint from ``args.restore_path``, then optionally apply the
    quantization-aware-training prepare step(s).

    Args:
        config: Mapping that provides ``'args'``, ``'emb_class'`` and
            ``'enc_class'`` (plus ``'use_kobart'`` when ``emb_class`` is
            ``'bart'``). It is mutated: ``'labels'`` is written.
        bert_model_name_or_path: When truthy, overrides
            ``args.bert_model_name_or_path`` for the non-glove branches.

    Returns:
        The constructed model.
    """
    args = config['args']
    # The model constructors take the inverse of the command-line flag.
    emb_non_trainable = not args.embedding_trainable
    labels = load_label(args.label_path)
    label_size = len(labels)
    config['labels'] = labels
    # prepare model
    if config['emb_class'] == 'glove':
        # NOTE(review): these are independent `if`s with no fallback; when
        # enc_class matches none of them, `model` is left unassigned.
        if config['enc_class'] == 'gnb':
            model = TextGloveGNB(config, args.embedding_path, label_size)
        if config['enc_class'] == 'cnn':
            model = TextGloveCNN(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-cnn':
            model = TextGloveDensenetCNN(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
        if config['enc_class'] == 'densenet-dsa':
            model = TextGloveDensenetDSA(config, args.embedding_path, label_size, emb_non_trainable=emb_non_trainable)
    else:
        # The explicit argument wins over the value from args.
        model_name_or_path = args.bert_model_name_or_path
        if bert_model_name_or_path:
            model_name_or_path = bert_model_name_or_path
        if config['emb_class'] == 'bart' and config['use_kobart']:
            # The KoBART branch does not use model_name_or_path.
            from transformers import BartModel
            from kobart import get_kobart_tokenizer, get_pytorch_kobart_model
            bert_tokenizer = get_kobart_tokenizer()
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = BartModel.from_pretrained(get_pytorch_kobart_model())
        elif config['emb_class'] in ['gpt']:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.bos_token = '<|startoftext|>'
            bert_tokenizer.eos_token = '<|endoftext|>'
            bert_tokenizer.cls_token = '<|startoftext|>'
            bert_tokenizer.sep_token = '<|endoftext|>'
            bert_tokenizer.pad_token = '<|pad|>'
            # from_tf is enabled when the path contains ".ckpt".
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
            # 3 new tokens added, so the embedding table is resized to the
            # tokenizer's length.
            bert_model.resize_token_embeddings(len(bert_tokenizer))
        elif config['emb_class'] in ['t5']:
            from transformers import T5EncoderModel
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_tokenizer.cls_token = '<s>'
            bert_tokenizer.sep_token = '</s>'
            bert_tokenizer.pad_token = '<pad>'
            bert_model = T5EncoderModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
        else:
            bert_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            bert_model = AutoModel.from_pretrained(
                model_name_or_path,
                from_tf=bool(".ckpt" in model_name_or_path))
        bert_config = bert_model.config
        # bert model reduction
        reduce_bert_model(config, bert_model, bert_config)
        # TextBertCNN is the default; 'cls' and 'densenet-cnn' override it.
        ModelClass = TextBertCNN
        if config['enc_class'] == 'cls':
            ModelClass = TextBertCLS
        if config['enc_class'] == 'densenet-cnn':
            ModelClass = TextBertDensenetCNN
        model = ModelClass(config, bert_config, bert_model, bert_tokenizer, label_size,
                           feature_based=args.bert_use_feature_based,
                           finetune_last=args.bert_use_finetune_last)
    if args.restore_path:
        # Restoring happens before any quantization preparation below.
        checkpoint = load_checkpoint(args.restore_path)
        model.load_state_dict(checkpoint)
    if args.enable_qat:
        model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # The bare string below is disabled code kept as a no-op expression.
        ''' # fuse if applicable # model = torch.quantization.fuse_modules(model, [['']]) '''
        model = torch.quantization.prepare_qat(model)
    if args.enable_qat_fx:
        import torch.quantization.quantize_fx as quantize_fx
        # .train() is called before the FX prepare step.
        model.train()
        # The empty-string key carries the default QAT qconfig.
        qconfig_dict = {
            "": torch.quantization.get_default_qat_qconfig('fbgemm')
        }
        model = quantize_fx.prepare_qat_fx(model, qconfig_dict)
    logger.info("[model] :\n{}".format(model.__str__()))
    logger.info("[model prepared]")
    return model