def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model
    model = Transformer(config=model_config, vocab=vocab)
    checkpoint_manager = CheckpointManager(model_dir)  # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')
    model.load_state_dict(checkpoint['model_state_dict'])

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()

    while True:
        input_text = input("문장을 입력하세요: ")  # "Enter a sentence: "
        enc_input = torch.tensor(tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
        dec_input = torch.tensor([[vocab.token2idx[vocab.START_TOKEN]]])

        # Greedy decoding: append the argmax token and re-run the decoder
        # until END_TOKEN is produced or maxlen is reached.
        for i in range(model_config.maxlen):
            y_pred = model(enc_input.to(device), dec_input.to(device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            if (y_pred_ids[0, -1] == vocab.token2idx[vocab.END_TOKEN]).to(torch.device('cpu')).numpy():
                decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=tokenizer)
                break

            dec_input = torch.cat(
                [dec_input.to(torch.device('cpu')),
                 y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))], dim=-1)

            if i == model_config.maxlen - 1:
                decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=tokenizer)
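# The REPL above inlines greedy decoding. Below is a minimal sketch of the
# same loop factored into a reusable helper; the names mirror the code above,
# but this helper itself is not part of the original repo:
def greedy_decode(model, tokenizer, vocab, input_text, maxlen, device):
    """Autoregressively append the argmax token until END_TOKEN or maxlen."""
    enc_input = torch.tensor(tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
    dec_input = torch.tensor([[vocab.token2idx[vocab.START_TOKEN]]])
    y_pred = None
    for _ in range(maxlen):
        y_pred = model(enc_input.to(device), dec_input.to(device))
        next_id = y_pred.max(dim=-1)[1][0, -1].item()
        if next_id == vocab.token2idx[vocab.END_TOKEN]:
            break
        dec_input = torch.cat([dec_input, torch.tensor([[next_id]])], dim=-1)
    return enc_input, y_pred  # feed into decoding_from_result(...) as above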
def load_data(data_path):
    """Code adapted from
    https://github.com/changwookjun/Transformer/blob/25d9472155cb0788d11dbfe274526690915fe95e/data.py#L27

    :param data_path: path to a CSV file with 'Q' (question) and 'A' (answer) columns
    :return: the combined corpus (all questions followed by all answers)
    """
    # Load the data with pandas.
    data_df = pd.read_csv(data_path, header=0)
    # Pull the question and answer columns into separate lists.
    question, answer = list(data_df['Q']), list(data_df['A'])
    # Split into train and validation sets with scikit-learn.
    tr_input, val_input, tr_label, val_label = train_test_split(
        question, answer, test_size=0.05, random_state=42)

    # Write tab-separated "question<TAB>answer" lines for each split.
    data_config = Config(json_path='./data_in/config.json')
    with open(data_config.train, mode='w', encoding='utf-8') as io:
        for q, a in zip(tr_input, tr_label):
            io.write(q + "\t" + a + "\n")
    with open(data_config.validation, mode='w', encoding='utf-8') as io:
        for q, a in zip(val_input, val_label):
            io.write(q + "\t" + a + "\n")

    total_corpus = question + answer
    return total_corpus
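# Usage sketch for load_data(). The CSV file name below is an assumption (any
# Q/A CSV with 'Q' and 'A' columns works), and the train file path is whatever
# data_config.train points to:
corpus = load_data('./data_in/ChatBotData.csv')  # hypothetical file name

with open('./data_in/train.txt', encoding='utf-8') as f:  # hypothetical train path
    pairs = [line.rstrip('\n').split('\t') for line in f]  # [question, answer] pairs
print(len(corpus), len(pairs))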
def main():
    cur_path = os.path.dirname(sys.argv[0])
    if cur_path:
        os.chdir(cur_path)

    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # Load vocab & tokenizer
    tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # Load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # Load the checkpoint, stripping the "module." prefix that
    # torch.nn.DataParallel adds to parameter keys.
    model_dict = model.state_dict()
    checkpoint = torch.load("./model.bin", map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name), file=sys.stderr)
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys, strict=False)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer,
                                                      index_to_ner=index_to_ner)
    try:
        while True:
            input_text = input()
            list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
            x_input = torch.tensor(list_of_input_ids).long()
            list_of_pred_ids = model(x_input)
            list_of_ner_word = decoder_from_res(list_of_input_ids=list_of_input_ids,
                                                list_of_pred_ids=list_of_pred_ids)
            if list_of_ner_word:
                print(",".join(list_of_ner_word))
            else:
                print("/")
    except EOFError:  # stdin closed; the bare except in the original hid real errors
        print("EOF", file=sys.stderr)
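# The key-conversion loop above recurs in several entry points in this repo.
# A minimal sketch that factors it out (a hypothetical helper, not part of
# the original code):
def strip_data_parallel_prefix(state_dict, model_dict):
    """Drop the 'module.' prefix DataParallel adds and keep only known keys."""
    converted = {}
    for k, v in state_dict.items():
        new_key = k.replace("module.", '', 1)
        if new_key in model_dict:
            converted[new_key] = v
    return converted

# Usage sketch:
# model.load_state_dict(
#     strip_data_parallel_prefix(checkpoint['model_state_dict'], model.state_dict()),
#     strict=False)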
def load_generator(args):
    # Load the pretrained generator.
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    checkpoint_manager = CheckpointManager(model_dir)  # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')

    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    model_config.vocab_size = len(vocab.token2idx)

    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    # loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)

    return (Generator(model_config, vocab, checkpoint['model_state_dict']),
            tokenizer, vocab.PAD_ID, checkpoint_manager)
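# Usage sketch for load_generator(). The argument names match the Path(...)
# accesses above; the default paths are assumptions:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', default='./data_in')  # assumed default
parser.add_argument('--model_dir', default='./experiments/base_model')  # assumed default
args = parser.parse_args()

generator, tokenizer, pad_id, checkpoint_manager = load_generator(args)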
def post():
    value = request.form['input']
    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # Load vocab & tokenizer
    tok_path = "ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # Load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    model = KobertCRFViz(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # Load the checkpoint, stripping the "module." prefix left by DataParallel.
    model_dict = model.state_dict()
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer,
                                                      index_to_ner=index_to_ner)

    input_text = value
    list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
    x_input = torch.tensor(list_of_input_ids).long()
    list_of_pred_ids, _ = model(x_input)
    list_of_ner_word, decoding_ner_sentence = decoder_from_res(
        list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
    return {'word': list_of_ner_word, 'decoding': decoding_ner_sentence}
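# A minimal sketch of how post() might be wired into a Flask app. The route
# path and app setup are assumptions; only request.form['input'] and the
# dict return value come from the handler above:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    return jsonify(post())  # {'word': ..., 'decoding': ...}

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)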
def transformation():
    # Do an inference on a single batch of data.
    data = None

    # 1) INPUT: convert Korean text input to an NER code array.
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)
        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}
        decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer,
                                                          index_to_ner=index_to_ner)

        f = flask.request.get_data()
        string_f = f.decode("utf-8")
        lines = string_f.splitlines(True)

        with open("result.txt", 'w', encoding='utf-8-sig') as w:
            index = 0
            # Input lines come in groups of four; the sentence sits at offset 1
            # of each group and extra info on the next line, each after a
            # 3-character prefix.
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue
                index += 1

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                predictions = run_inference_for_single_data(list_of_input_ids[0],
                                                            ModelHandler.get_model())

                # 2) OUTPUT: convert NER codes back to Korean text (written to file).
                emission = torch.tensor(predictions['output'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
                # Rearrange tag sequences.
                list_of_pred_ids = crf.decode(emission)

                # First pass without UNK handling, then a second pass once the
                # UNK tokens have been recovered from the original text.
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids, unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids, unkTokenList=unkTokenList)

                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')  # drop '[CLS] ' / '[SEP]'

        return flask.Response(response=open("result.txt", 'r', encoding='utf-8-sig'),
                              status=200, mimetype='text/plain')
    else:
        return flask.Response(response='This predictor only supports TEXT data',
                              status=415, mimetype='text/plain')
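# Self-contained sketch of the torchcrf decode call used above, on dummy
# emissions (shapes and values are illustrative only). Note that the CRF here
# is freshly constructed, so its transition parameters are random inits; the
# decode above is therefore driven almost entirely by the emission scores:
import torch
from torchcrf import CRF

num_tags, batch_size, seq_len = 5, 1, 4
crf = CRF(num_tags=num_tags, batch_first=True)
emissions = torch.randn(batch_size, seq_len, num_tags)
best_tags = crf.decode(emissions)  # List[List[int]], one tag sequence per batch item
print(best_tags)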
def main(parser):
    args = parser.parse_args()
    model_dir = Path(args.model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    # tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    tok_path = "./tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # Load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # Load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model (pick one variant; the alternatives are kept for reference)
    # model = KobertSequenceFeatureExtractor(config=model_config, num_classes=len(ner_to_index))
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # model = KobertBiLSTMCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # model = KobertBiGRUCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # Load the checkpoint matching the chosen variant, stripping the
    # "module." prefix left by DataParallel.
    model_dict = model.state_dict()
    # checkpoint = torch.load("./experiments/base_model/best-epoch-9-step-600-acc-0.845.bin", map_location=torch.device('cpu'))
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bilstm_crf/best-epoch-15-step-2750-acc-0.992.bin", map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bigru_crf/model-epoch-18-step-3250-acc-0.997.bin", map_location=torch.device('cpu'))

    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer,
                                                      index_to_ner=index_to_ner)

    while True:
        input_text = input("문장을 입력하세요: ")  # "Enter a sentence: "
        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        x_input = torch.tensor(list_of_input_ids).long()

        ## for BERT alone
        # y_pred = model(x_input)
        # list_of_pred_ids = y_pred.max(dim=-1)[1].tolist()

        ## for BERT + CRF
        list_of_pred_ids = model(x_input)

        ## for BERT + BiLSTM-CRF & BERT + BiGRU-CRF
        # list_of_pred_ids = model(x_input, using_pack_sequence=False)

        list_of_ner_word, decoding_ner_sentence = decoder_from_res(
            list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
        print("list_of_ner_word:", list_of_ner_word)
        print("decoding_ner_sentence:", decoding_ner_sentence)
import os
import pickle

import flask
import torch
from gluonnlp.data import SentencepieceTokenizer

from data_utils.vocab_tokenizer import Tokenizer
from data_utils.pad_sequence import keras_pad_fn
from data_utils.utils import Config
from decode import DecoderFromNamedEntitySequence
from torchcrf import CRF
import tensorflow.compat.v1 as tf

# Model
prefix = '/opt/ml/'
model_path = os.path.join(prefix, 'model')
model_config = Config(json_path="config.json")


class ModelHandler(object):
    model = None
    tokenizer = None
    vocab = None
    ner_to_index = None
    index_to_ner = None
    token_to_index = None
    index_to_token = None

    @classmethod
    def get_vocab(cls):
        ...  # truncated in the original source
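# ModelHandler is cut off above at get_vocab(). A minimal sketch of the lazy
# class-level caching the handler implies — the method bodies and file paths
# below are assumptions, not the original implementation:
class ModelHandlerSketch(object):
    vocab = None
    model = None

    @classmethod
    def get_vocab(cls):
        # Load the pickled vocabulary once and cache it on the class.
        if cls.vocab is None:
            with open(os.path.join(model_path, "vocab.pkl"), 'rb') as f:
                cls.vocab = pickle.load(f)
        return cls.vocab

    @classmethod
    def get_model(cls):
        # Build and cache the model on first use; later calls reuse it.
        if cls.model is None:
            cls.model = build_model(cls.get_vocab())  # build_model is hypothetical
        return cls.model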
def transformation():
    # Do an inference on a single batch of data.
    data = None

    # 1) INPUT: convert Korean text input to an NER code array.
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)
        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}
        decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer,
                                                          index_to_ner=index_to_ner)

        # The request body is expected to be the contents of a text file
        # (e.g. NER_OY_data.txt); currently served from /opt/program.
        f = flask.request.data.decode("utf-8")
        lines = f.splitlines(True)

        index = 0
        with open("NER_OY_result.txt", 'w', encoding='utf-8-sig') as w:
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue
                index += 1

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
                x_input = torch.tensor(list_of_input_ids).long()

                data = {"instances": list_of_input_ids}
                predictions = ScoringService.predict(data)

                # 2) OUTPUT: convert NER codes back to Korean text (written to file).
                emission = torch.tensor(predictions['predictions'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
                list_of_pred_ids = crf.decode(emission)

                # First pass without UNK handling, then a second pass once the
                # UNK tokens have been recovered from the original text.
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids, unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids, unkTokenList=unkTokenList)

                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')

        # Return the result file: NER_OY_result.txt
        return flask.Response(response=open("NER_OY_result.txt", 'r', encoding='utf-8-sig'),
                              status=200, mimetype='text/plain')
    else:
        return flask.Response(response='This predictor only supports TEXT data',
                              status=415, mimetype='text/plain')
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model & Model Params
    model = Transformer(config=model_config, vocab=vocab)

    # Train & Val Datasets
    tr_ds = ChatbotDataset(data_config.train, tokenizer.list_of_string_to_arr_of_pad_token_ids)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size,
                       shuffle=True, num_workers=4, drop_last=False)
    val_ds = ChatbotDataset(data_config.validation, tokenizer.list_of_string_to_arr_of_pad_token_ids)
    val_dl = DataLoader(val_ds, batch_size=model_config.batch_size,
                        shuffle=True, num_workers=4, drop_last=False)

    # Loss
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)  # nn.NLLLoss()

    # Optim
    opt = optim.Adam(params=model.parameters(), lr=model_config.learning_rate)
    # torch.optim.SGD(params=model.parameters(), lr=model_config.learning_rate)
    # scheduler = ReduceLROnPlateau(opt, patience=5)  # Check
    scheduler = GradualWarmupScheduler(opt, multiplier=8, total_epoch=model_config.epochs)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # Save
    # writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Load a pretrained checkpoint if one exists.
    if (model_dir / 'best.tar').exists():
        print("pretrained model exists")
        checkpoint = checkpoint_manager.load_checkpoint('best.tar')
        model.load_state_dict(checkpoint['model_state_dict'])

    # Train
    for epoch in tqdm(range(model_config.epochs), desc='epoch', total=model_config.epochs):
        scheduler.step(epoch)
        print("epoch : {}, lr: {}".format(epoch, opt.param_groups[0]['lr']))
        tr_loss = 0
        tr_acc = 0
        model.train()

        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            opt.zero_grad()
            enc_input, dec_input, dec_output = map(lambda elm: elm.to(device), mb)
            y_pred = model(enc_input, dec_input)
            y_pred_copy = y_pred.detach()
            dec_output_copy = dec_output.detach()

            # Reshape for the loss computation.
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.view(-1).long()

            # Exclude padding positions from the loss.
            real_value_index = [dec_output != 0]
            mb_loss = loss_fn(y_pred[real_value_index],
                              dec_output[real_value_index])  # Input: (N, C), Target: (N)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_pred, dec_output)

            tr_loss += mb_loss.item()
            tr_acc = mb_acc.item()
            tr_loss_avg = tr_loss / (step + 1)
            tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}
            total_step = epoch * len(tr_dl) + step

            # Eval
            if total_step % model_config.summary_step == 0 and total_step != 0:
                print("train: ")
                decoding_from_result(enc_input, y_pred_copy, dec_output_copy, tokenizer)

                model.eval()
                print("eval: ")
                val_summary = evaluate(model, val_dl, {'loss': loss_fn, 'acc': acc}, device, tokenizer)
                val_loss = val_summary['loss']
                # writer.add_scalars('loss', {'train': tr_loss_avg,
                #                             'val': val_loss}, epoch * len(tr_dl) + step)
                tqdm.write('epoch : {}, step : {}, '
                           'tr_loss: {:.3f}, val_loss: {:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'
                           .format(epoch + 1, total_step, tr_summary['loss'],
                                   val_summary['loss'], tr_summary['acc'], val_summary['acc']))

                # is_best = val_loss < best_val_loss  # by loss
                # By accuracy (ideally this should use val_acc rather than train_acc).
                is_best = tr_acc > best_train_acc

                # Save
                if is_best:
                    print("[Best model Save] train_acc: {}, train_loss: {}, val_loss: {}"
                          .format(tr_summary['acc'], tr_summary['loss'], val_loss))
                    # Move to CPU before saving so the checkpoint also loads
                    # on CPU-only machines.
                    state = {
                        'epoch': epoch + 1,
                        'model_state_dict': model.to(torch.device('cpu')).state_dict(),
                        'opt_state_dict': opt.state_dict()
                    }
                    summary = {'train': tr_summary, 'validation': val_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')
                    checkpoint_manager.save_checkpoint(state, 'best.tar')
                    best_val_loss = val_loss
                    best_train_acc = tr_acc  # keep the is_best comparison meaningful

                # Restore device placement and training mode after evaluation
                # (saving moves the model to CPU).
                model.to(device)
                model.train()
            else:
                if step % 50 == 0:
                    print('epoch : {}, step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'
                          .format(epoch + 1, total_step, tr_summary['loss'], tr_summary['acc']))
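# acc() and evaluate() are defined elsewhere in the repo. A minimal sketch of
# a padding-aware token accuracy consistent with how acc() is called above
# (an assumption about its behavior, not the repo's implementation):
def acc(y_pred, dec_output, pad_id=0):
    """y_pred: (N, C) logits; dec_output: (N,) target ids; PAD positions ignored."""
    pred_ids = y_pred.max(dim=-1)[1]
    mask = dec_output != pad_id
    return (pred_ids[mask] == dec_output[mask]).float().mean()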
            if is_prev_entity is True:
                # Close the previous entity span before appending this token.
                decoding_ner_sentence += ':' + prev_entity_tag + '>' + token_str
                is_prev_entity = False
                is_there_B_before_I = False
            else:
                decoding_ner_sentence += token_str

        return list_of_ner_word, decoding_ner_sentence


import os

ABS_PATH = os.environ.get('BASEDIR')
model_dir = Path(f'{ABS_PATH}/experiments/base_model_with_crf_val')
model_config = Config(json_path=model_dir / 'config.json')

# Vocab & Tokenizer
tok_path = f"{ABS_PATH}/tokenizer_78b3253a26.model"
ptr_tokenizer = SentencepieceTokenizer(tok_path)

# Load vocab & tokenizer
with open(model_dir / "vocab.pkl", 'rb') as f:
    vocab = pickle.load(f)
tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                      pad_fn=keras_pad_fn, maxlen=None)

# load ner_to_index.json