def get_processor(dataset_config):
    with open(dataset_config.source_vocab, mode="rb") as io:
        src_vocab = pickle.load(io)
    src_stemmer = Stemmer(language="ko")
    src_processor = SourceProcessor(src_vocab, src_stemmer.extract_stem)

    with open(dataset_config.target_vocab, mode="rb") as io:
        tgt_vocab = pickle.load(io)
    tgt_stemmer = Stemmer(language="en")
    tgt_processor = TargetProcessor(tgt_vocab, tgt_stemmer.extract_stem)
    return src_processor, tgt_processor
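# Usage sketch (paths hypothetical): get_processor bundles the vocab-loading
# and stemmer wiring that the train and evaluate scripts below otherwise
# repeat inline. Assumes Config and Path are imported as in those scripts.
data_config = Config(json_path=Path('data') / 'config.json')
src_processor, tgt_processor = get_processor(data_config)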
# Build the dev/test/train splits, keeping only sentence pairs where both the
# Korean and the English side have at most 30 stems.
if not data_dir.exists():
    data_dir.mkdir()

ko = []
en = []
for filepath in list_of_filepath:
    with open(filepath, mode='r', encoding='utf-8') as io:
        corpus = [sen.strip() for sen in io.readlines()]
    if 'en' in filepath.suffix:
        en.append(corpus)
    else:
        ko.append(corpus)

split_ko = Stemmer(language='ko')
split_en = Stemmer(language='en')

for ko_corpus, en_corpus, split_name in zip(ko, en, ['dev', 'test', 'train']):
    ko_idx = [idx for idx, sen in enumerate(ko_corpus)
              if len(split_ko.extract_stem(sen)) <= 30]
    en_idx = [idx for idx, sen in enumerate(en_corpus)
              if len(split_en.extract_stem(sen)) <= 30]
    intersect_idx = set(np.intersect1d(ko_idx, en_idx))
    ko_ds = [sen for idx, sen in enumerate(ko_corpus) if idx in intersect_idx]
    en_ds = [sen for idx, sen in enumerate(en_corpus) if idx in intersect_idx]
    df = pd.DataFrame({'ko': ko_ds, 'en': en_ds})
    # Persist each split as the tab-separated .txt file the vocab-building
    # script reads back; the exact save call is inferred from that reader.
    df.to_csv((data_dir / split_name).with_suffix('.txt'), sep='\t', index=False)
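# Worked example of the pair filter above: np.intersect1d keeps only the
# indices whose Korean *and* English sentences pass the 30-stem cap.
import numpy as np

ko_idx = [0, 2, 3]  # Korean sentences short enough
en_idx = [1, 2, 3]  # English sentences short enough
assert set(np.intersect1d(ko_idx, en_idx)) == {2, 3}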
parser.add_argument('--restore_file', default='best',
                    help="name of the file in --model_dir containing weights to load")
parser.add_argument('--data_name', default='test',
                    help="name of the data in --data_dir to be evaluated")

if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # processor
    with open(data_config.source_vocab, mode='rb') as io:
        src_vocab = pickle.load(io)
    ko_stemmer = Stemmer(language='ko')
    src_processor = SourceProcessor(src_vocab, ko_stemmer.extract_stem)

    with open(data_config.target_vocab, mode='rb') as io:
        tgt_vocab = pickle.load(io)
    en_stemmer = Stemmer(language='en')
    tgt_processor = TargetProcessor(tgt_vocab, en_stemmer.extract_stem)

    # model (restore)
    encoder = BidiEncoder(src_vocab, model_config.encoder_hidden_dim,
                          model_config.drop_ratio)
    decoder = AttnDecoder(tgt_vocab, model_config.method,
                          model_config.encoder_hidden_dim,
                          model_config.decoder_hidden_dim,
                          model_config.drop_ratio)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar')
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    # the decoder weights presumably sit under a parallel key; the exact key
    # name here is an assumption
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
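# A sketch of the checkpoint layout the restore code above assumes. The key
# names are inferred from the load calls; anything beyond them (e.g. also
# saving optimizer state) would be an assumption.
import torch

torch.save({'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict()},
           model_dir / 'best.tar')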
import itertools
import pickle
from collections import Counter
from pathlib import Path

import gluonnlp as nlp
import pandas as pd

from model.split import Stemmer
from model.utils import Vocab

data_dir = Path('data')
tr_filepath = (data_dir / 'train').with_suffix('.txt')
tr_dataset = pd.read_csv(tr_filepath, sep='\t')

# korean vocab: keep stems that occur at least 15 times in the training set
split_ko = Stemmer(language='ko')
count_ko = Counter(itertools.chain.from_iterable(
    tr_dataset['ko'].apply(split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(token for token, cnt in count_ko.items() if cnt >= 15)

# attach pre-trained fastText (wiki.ko) vectors via a temporary gluonnlp Vocab
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko), bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)
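# Tiny illustration of the min-count filter used above: with a threshold of
# 15, rare stems never enter the vocabulary (the counts here are made up).
from collections import Counter

counts = Counter({'하': 120, '것': 40, '희귀': 3})
assert sorted(tok for tok, cnt in counts.items() if cnt >= 15) == ['것', '하']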
help="Directory containing config.json of data") parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing config.json of model") if __name__ == '__main__': args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # processor with open(data_config.source_vocab, mode='rb') as io: src_vocab = pickle.load(io) src_stemmer = Stemmer(language='ko') src_processor = SourceProcessor(src_vocab, src_stemmer.extract_stem) with open(data_config.target_vocab, mode='rb') as io: tgt_vocab = pickle.load(io) tgt_stemmer = Stemmer(language='en') tgt_processor = TargetProcessor(tgt_vocab, tgt_stemmer.extract_stem) # model encoder = Encoder(src_vocab, model_config.encoder_hidden_dim, model_config.drop_ratio) decoder = AttnDecoder(tgt_vocab, model_config.method, model_config.encoder_hidden_dim, model_config.decoder_hidden_dim, model_config.drop_ratio)