def main(cfgpath, global_step):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tr_ds = create_dataset(tr_filepath, batch_size, True)
    val_ds = create_dataset(val_filepath, batch_size, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=MeCab().morphs, pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = params['training'].get('epochs')
    learning_rate = params['training'].get('learning_rate')
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    writer = tf.summary.create_file_writer(logdir='./runs/exp')

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()

            if tf.equal(opt.iterations % global_step, 0):
                with writer.as_default():
                    val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)
                    tf.summary.scalar('tr_loss', tr_loss / (step + 1), step=opt.iterations)
                    tf.summary.scalar('val_loss', val_loss, step=opt.iterations)
                tf.keras.backend.set_learning_phase(1)
        else:
            tr_loss /= (step + 1)
            val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt_path = proj_dir / params['filepath'].get('ckpt')
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.save(ckpt_path)

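# `evaluate` is called above but not defined in this snippet. A minimal sketch of what
# it could look like, assuming it simply averages the loss over the validation dataset
# in inference mode (mirroring the inline validation loop in the variant below); the
# actual helper in the project may differ.
def evaluate(model, dataset, loss_fn, convert2idx):
    tf.keras.backend.set_learning_phase(0)  # switch dropout etc. to inference mode
    avg_loss = 0
    for step, mb in enumerate(dataset):
        x_mb, y_mb = convert2idx(mb)
        avg_loss += loss_fn(y_mb, model(x_mb)).numpy()
    return avg_loss / (step + 1)
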
def main(cfgpath):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tr_ds = create_dataset(tr_filepath, batch_size, True)
    val_ds = create_dataset(val_filepath, batch_size, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=Okt())  # instantiate the tokenizer (the original passed the Okt class itself)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = params['training'].get('epochs')
    learning_rate = params['training'].get('learning_rate')
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()
        else:
            tr_loss /= (step + 1)

        tf.keras.backend.set_learning_phase(0)
        val_loss = 0

        for step, mb in tqdm(enumerate(val_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            mb_loss = loss_fn(y_mb, model(x_mb))
            val_loss += mb_loss.numpy()
        else:
            val_loss /= (step + 1)

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

def main(self):
    batch_size = self._batch_size
    tr_filepath = 'data/train.txt'
    val_filepath = 'data/val.txt'
    tr_ds = self.create_dataset(tr_filepath, batch_size, shuffle=True)
    val_ds = self.create_dataset(val_filepath, batch_size, shuffle=False)
    vocab = pd.read_pickle('data/vocab.pkl')
    pre_processor = PreProcessor(vocab=vocab, tokenizer=Mecab())

    # create model
    model = SmCnn()

    # create optimizer & loss_fn
    epochs = self._epochs
    learning_rate = self._learning_rate
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()
        else:
            tr_loss /= (step + 1)

        tf.keras.backend.set_learning_phase(0)
        val_loss = 0

        for step, mb in tqdm(enumerate(val_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            mb_loss = loss_fn(y_mb, model(x_mb))
            val_loss += mb_loss.numpy()
        else:
            val_loss /= (step + 1)

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

def get_preprocessor(ptr_config_info, model_config):
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    if model_config.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer, do_lower_case=False)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence)
    elif model_config.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    else:
        # guard against an unknown pretrained type, which would otherwise raise UnboundLocalError below
        raise ValueError('unknown model_config.type: {}'.format(model_config.type))
    return preprocessor

def main(cfgpath):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tst_filepath = params['filepath'].get('tst')
    tr_ds = create_dataset(tr_filepath, batch_size, False, False)
    val_ds = create_dataset(val_filepath, batch_size, False, False)
    tst_ds = create_dataset(tst_filepath, batch_size, False, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=MeCab().morphs, pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.restore(save_path=tf.train.latest_checkpoint(proj_dir / 'checkpoint'))

    # evaluation
    tr_acc = get_accuracy(model, tr_ds, pre_processor.convert2idx)
    val_acc = get_accuracy(model, val_ds, pre_processor.convert2idx)
    tst_acc = get_accuracy(model, tst_ds, pre_processor.convert2idx)
    print('tr_acc: {:.2%}, val_acc: {:.2%}, tst_acc: {:.2%}'.format(tr_acc, val_acc, tst_acc))

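# `get_accuracy` is called above but not defined in this snippet. A minimal sketch under
# the assumption that it averages SparseCategoricalAccuracy over a dataset, analogous to
# the metric usage in the metrics-based training loop further below; the actual helper
# may differ.
def get_accuracy(model, dataset, convert2idx):
    metric = tf.keras.metrics.SparseCategoricalAccuracy()
    for mb in dataset:
        x_mb, y_mb = convert2idx(mb)
        metric.update_state(y_mb, model(x_mb))
    return metric.result().numpy()
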
def predict(sentence1, sentence2):
    ptr_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/pretrained"
    data_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/data"
    caseType = "skt"
    model_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/experiments/base_model"
    checkpoint_model_file = "best_skt.tar"

    # ptr_dir = "BERT_pairwise_text_classification/pretrained"
    # data_dir = "BERT_pairwise_text_classification/data"
    # caseType = "skt"
    # model_dir = "BERT_pairwise_text_classification/experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"

    # ptr_dir = "pretrained"
    # data_dir = "data"
    # caseType = "skt"
    # model_dir = "experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"

    ptr_dir = Path(ptr_dir)
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    checkpoint_model_file = Path(checkpoint_model_file)

    ptr_config = Config(ptr_dir / 'config_skt.json')
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # vocab
    with open(os.path.join(ptr_dir, ptr_config.vocab), mode='rb') as io:
        vocab = pickle.load(io)
    ptr_tokenizer = SentencepieceTokenizer(os.path.join(ptr_dir, ptr_config.tokenizer))
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(checkpoint_model_file)
    config = BertConfig(os.path.join(ptr_dir, ptr_config.config))
    model = PairwiseClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    device = torch.device('cpu')
    model.to(device)
    transform = preprocessor.preprocess

    if model.training:
        model.eval()

    indices, token_types = [torch.tensor([elm]) for elm in transform(sentence1, sentence2)]
    with torch.no_grad():
        label = model(indices, token_types)
        label = label.max(dim=1)[1]
        label = label.numpy()[0]
    return label

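# Hypothetical call site for predict(); the sentences are placeholders and the return
# value is the argmax class index produced by PairwiseClassifier.
if __name__ == '__main__':
    label = predict('문장 1 예시입니다.', '문장 2 예시입니다.')
    print('predicted label:', label)
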
def get_preprocessor(dataset_config, coarse_split_fn, fine_split_fn):
    with open(dataset_config.fine_vocab, mode="rb") as io:
        fine_vocab = pickle.load(io)
    with open(dataset_config.coarse_vocab, mode="rb") as io:
        coarse_vocab = pickle.load(io)
    preprocessor = PreProcessor(coarse_vocab=coarse_vocab,
                                fine_vocab=fine_vocab,
                                coarse_split_fn=coarse_split_fn,
                                fine_split_fn=fine_split_fn)
    return preprocessor

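# Hypothetical call site for this get_preprocessor variant; split_morphs and split_jamos
# are the coarse/fine split functions used with the same keyword arguments in the SAN
# training script further below, and dataset_config is assumed to expose coarse_vocab /
# fine_vocab pickle paths.
preprocessor = get_preprocessor(dataset_config,
                                coarse_split_fn=split_morphs,
                                fine_split_fn=split_jamos)
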
data_config = Config(data_dir / 'config.json')
model_config = Config(model_dir / 'config.json')

# vocab
with open(ptr_config.vocab, mode='rb') as io:
    vocab = pickle.load(io)

# tokenizer
if args.type == 'etri':
    ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config.tokenizer, do_lower_case=False)
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence)
elif args.type == 'skt':
    ptr_tokenizer = SentencepieceTokenizer(ptr_config.tokenizer)
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)

# model (restore)
checkpoint_manager = CheckpointManager(model_dir)
checkpoint = checkpoint_manager.load_checkpoint('best_{}.tar'.format(args.type))
config = BertConfig(ptr_config.config)

def main():
    tr_filepath = Path.cwd() / 'data' / 'train.txt'
    val_filepath = Path.cwd() / 'data' / 'val.txt'
    with open(Path.cwd() / 'data/vocab.pkl', mode='rb') as f:
        vocab = pickle.load(f)
    tr_ds = create_dataset(str(tr_filepath), 128, shuffle=True)
    val_ds = create_dataset(str(val_filepath), 128, shuffle=False)  # don't shuffle the validation data
    tokenizer = Okt()
    pre_processor = PreProcessor(vocab=vocab, tokenizer=tokenizer)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = 10
    learning_rate = 1e-3
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    # metrics
    tr_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    tr_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='validation_loss')
    val_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='validation_accuracy')

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        # reset running metrics so each epoch reports its own averages
        tr_loss_metric.reset_states()
        tr_accuracy_metric.reset_states()
        val_loss_metric.reset_states()
        val_accuracy_metric.reset_states()

        # training data
        tf.keras.backend.set_learning_phase(1)  # train mode
        for _, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            x_mb = pre_processor.pad_sequences(x_mb, 70)
            x_mb, y_mb = pre_processor.convert_to_tensor(x_mb, y_mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss_metric.update_state(mb_loss)
            tr_accuracy_metric.update_state(y_mb, model(x_mb))
        tr_mean_loss = tr_loss_metric.result()
        tr_mean_accuracy = tr_accuracy_metric.result()

        # validation data
        tf.keras.backend.set_learning_phase(0)  # eval mode
        for _, mb in tqdm(enumerate(val_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            x_mb = pre_processor.pad_sequences(x_mb, 70)
            x_mb, y_mb = pre_processor.convert_to_tensor(x_mb, y_mb)
            mb_loss = loss_fn(y_mb, model(x_mb))
            val_loss_metric.update_state(mb_loss)
            val_accuracy_metric.update_state(y_mb, model(x_mb))
        val_mean_loss = val_loss_metric.result()
        val_mean_accuracy = val_accuracy_metric.result()

        tqdm.write('epoch : {}, tr_accuracy : {:.3f}, tr_loss : {:.3f}, val_accuracy : {:.3f}, val_loss : {:.3f}'
                   .format(epoch + 1, tr_mean_accuracy, tr_mean_loss, val_mean_accuracy, val_mean_loss))

    ckpt_path = Path.cwd() / 'checkpoint/ckpt'
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.save(ckpt_path)

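# Minimal entry point, assuming the script above is run directly; main() takes no
# arguments, so no CLI parsing is needed here.
if __name__ == '__main__':
    main()
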
# vocab
vocab = pickle.load(open(ptr_config.vocab, mode='rb'))

# tokenizer
if args.tokenizer == 'ranked':
    print('[RANKED TOKENIZER]')
    ptr_tokenizer = KBertRankedTokenizer(ptr_config.tokenizer, do_lower_case=False)
else:
    ptr_tokenizer = BertTokenizer.from_pretrained(ptr_config.tokenizer, do_lower_case=False)
    print('[BERT TOKENIZER]')

pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence,
                            subchar=args.subchar)

# model
config = BertConfig(ptr_config.config)
model = SentenceClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)
bert_pretrained = torch.load(ptr_config.bert)
model.load_state_dict(bert_pretrained, strict=False)

# training
tr_ds = Corpus(data_config.train, preprocessor.preprocess)
tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True,

model_dir = Path(args.model_dir)
data_config = Config(data_dir / 'config.json')
model_config = Config(model_dir / 'config.json')

# Vocab and Tokenizer
ptr_dir = Path("pretrained")
vocab_filepath = ptr_dir / "{}-vocab.pkl".format(args.type)
with open(vocab_filepath, mode='rb') as io:
    vocab = pickle.load(io)
ptr_tokenizer = BertTokenizer.from_pretrained(args.type, do_lower_case="uncased" in args.type)
ptr_tokenizer = Tokenizer(vocab, ptr_tokenizer.tokenize)
preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

# Load Model
config_filepath = ptr_dir / "{}-config.json".format(args.type)
config = BertConfig.from_pretrained(config_filepath, output_hidden_states=False)
model = BIIN(config, vocab, model_config.hidden_size, enc_num_layers=len(model_config.hidden_size))

# Data Loader
tr_ds = Corpus(data_config.tr_path, preprocessor.preprocess, sep='\t', doc_col='question1',

if __name__ == "__main__":
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(data_dir / "config.json")
    model_config = Config(model_dir / "config.json")

    # tokenizer
    with open(data_config.fine_vocab, mode="rb") as io:
        fine_vocab = pickle.load(io)
    with open(data_config.coarse_vocab, mode="rb") as io:
        coarse_vocab = pickle.load(io)
    preprocessor = PreProcessor(coarse_vocab=coarse_vocab,
                                fine_vocab=fine_vocab,
                                coarse_split_fn=split_morphs,
                                fine_split_fn=split_jamos)

    # model
    model = SAN(model_config.num_classes, coarse_vocab, fine_vocab, model_config.fine_embedding_dim,
                model_config.hidden_dim, model_config.multi_step, model_config.prediction_drop_ratio)

    # training
    tr_ds = Corpus(data_config.train, preprocessor.preprocess)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,