Example #1
import pickle

from model.split import Stemmer  # import path as shown in Example #4

# SourceProcessor / TargetProcessor come from the project's own modules;
# their import path does not appear in these snippets.
def get_processor(dataset_config):
    # source (Korean) side
    with open(dataset_config.source_vocab, mode="rb") as io:
        src_vocab = pickle.load(io)
    src_stemmer = Stemmer(language="ko")
    src_processor = SourceProcessor(src_vocab, src_stemmer.extract_stem)

    # target (English) side
    with open(dataset_config.target_vocab, mode="rb") as io:
        tgt_vocab = pickle.load(io)
    tgt_stemmer = Stemmer(language="en")
    tgt_processor = TargetProcessor(tgt_vocab, tgt_stemmer.extract_stem)
    return src_processor, tgt_processor
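A minimal usage sketch, assuming the Config class from Examples #3 and #5 supplies the source_vocab / target_vocab paths:

from pathlib import Path

data_config = Config(json_path=Path('data') / 'config.json')
src_processor, tgt_processor = get_processor(data_config)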
Example #2
import numpy as np
import pandas as pd
from pathlib import Path

from model.split import Stemmer  # import path as shown in Example #4

data_dir = Path('data')  # as in Example #4
if not data_dir.exists():
    data_dir.mkdir()

ko = []
en = []

# `list_of_filepath` is assumed to hold the dev/test/train corpus files;
# parallel Korean/English files are told apart by their suffix.
for filepath in list_of_filepath:
    with open(filepath, mode='r', encoding='utf-8') as io:
        corpus = [sen.strip() for sen in io.readlines()]

    if 'en' in filepath.suffix:
        en.append(corpus)
    else:
        ko.append(corpus)

split_ko = Stemmer(language='ko')
split_en = Stemmer(language='en')

# keep only the sentence pairs whose Korean and English sides both stem
# to at most 30 tokens; the zip relies on ko/en having been filled in
# dev/test/train order
for ko_corpus, en_corpus, split_name in zip(ko, en, ['dev', 'test', 'train']):
    ko_idx = [
        idx for idx, sen in enumerate(ko_corpus)
        if len(split_ko.extract_stem(sen)) <= 30
    ]
    en_idx = [
        idx for idx, sen in enumerate(en_corpus)
        if len(split_en.extract_stem(sen)) <= 30
    ]
    intersect_idx = set(np.intersect1d(ko_idx, en_idx))
    ko_ds = [sen for idx, sen in enumerate(ko_corpus) if idx in intersect_idx]
    en_ds = [sen for idx, sen in enumerate(en_corpus) if idx in intersect_idx]
    df = pd.DataFrame({'ko': ko_ds, 'en': en_ds})
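The loop builds `df` for each split but the snippet ends before writing it out. A plausible last line of the loop body, consistent with Example #4 reading `data/train.txt` with sep='\t' (the exact save step is an assumption):

df.to_csv((data_dir / split_name).with_suffix('.txt'), sep='\t', index=False)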
Example #3
import argparse

parser = argparse.ArgumentParser()  # reconstructed; the snippet is truncated above this point
parser.add_argument('--restore_file', default='best',
                    help="name of the file in --model_dir containing weights to load")
parser.add_argument('--data_name', default='test',
                    help="name of the data in --data_dir to be evaluated")


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # processor
    with open(data_config.source_vocab, mode='rb') as io:
        src_vocab = pickle.load(io)
    ko_stemmer = Stemmer(language='ko')
    src_processor = SourceProcessor(src_vocab, ko_stemmer.extract_stem)

    with open(data_config.target_vocab, mode='rb') as io:
        tgt_vocab = pickle.load(io)
    en_stemmer = Stemmer(language='en')
    tgt_processor = TargetProcessor(tgt_vocab, en_stemmer.extract_stem)

    # model (restore)
    encoder = BidiEncoder(src_vocab, model_config.encoder_hidden_dim, model_config.drop_ratio)
    decoder = AttnDecoder(tgt_vocab, model_config.method, model_config.encoder_hidden_dim,
                          model_config.decoder_hidden_dim, model_config.drop_ratio)

    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar')
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
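The snippet is cut off here. Presumably the decoder is restored from the same checkpoint; the key name below is an assumption mirroring 'encoder_state_dict':

decoder.load_state_dict(checkpoint['decoder_state_dict'])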
Example #4
import pandas as pd
import itertools
import pickle
import gluonnlp as nlp
from collections import Counter
from pathlib import Path
from model.split import Stemmer
from model.utils import Vocab

data_dir = Path('data')
tr_filepath = (data_dir / 'train').with_suffix('.txt')
tr_dataset = pd.read_csv(tr_filepath, sep='\t')

# korean vocab: keep stems occurring at least 15 times in the training set
split_ko = Stemmer(language='ko')
count_ko = Counter(
    itertools.chain.from_iterable(tr_dataset['ko'].apply(
        split_ko.extract_stem).tolist()))
list_of_token_ko = sorted(
    token for token, count in count_ko.items() if count >= 15)

# a throwaway gluonnlp Vocab is used only to look up pretrained fastText
# (wiki.ko) vectors in the same index order as the token list
tmp_vocab = nlp.Vocab(Counter(list_of_token_ko),
                      bos_token=None,
                      eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(list_of_token_ko, bos_token=None, eos_token=None)
vocab_ko.embedding = array

with open(data_dir / 'vocab_ko.pkl', mode='wb') as io:
    pickle.dump(vocab_ko, io)  # the snippet is cut off mid-statement here; the dump is the evident intent
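This pickle is the artifact that Examples #1, #3, and #5 read back with pickle.load; a minimal round-trip check under the same paths:

with open(data_dir / 'vocab_ko.pkl', mode='rb') as io:
    vocab_restored = pickle.load(io)
assert vocab_restored.embedding.shape == array.shape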
Example #5
                    help="Directory containing config.json of data")
parser.add_argument('--model_dir',
                    default='experiments/base_model',
                    help="Directory containing config.json of model")

if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # processor
    with open(data_config.source_vocab, mode='rb') as io:
        src_vocab = pickle.load(io)
    src_stemmer = Stemmer(language='ko')
    src_processor = SourceProcessor(src_vocab, src_stemmer.extract_stem)

    with open(data_config.target_vocab, mode='rb') as io:
        tgt_vocab = pickle.load(io)
    tgt_stemmer = Stemmer(language='en')
    tgt_processor = TargetProcessor(tgt_vocab, tgt_stemmer.extract_stem)

    # model
    encoder = Encoder(src_vocab, model_config.encoder_hidden_dim,
                      model_config.drop_ratio)
    decoder = AttnDecoder(tgt_vocab, model_config.method,
                          model_config.encoder_hidden_dim,
                          model_config.decoder_hidden_dim,
                          model_config.drop_ratio)
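The snippet ends with the model definition; as in Example #3, restoring trained weights with the project's CheckpointManager would typically follow (a sketch assuming the same API and a 'best.tar' checkpoint):

checkpoint_manager = CheckpointManager(model_dir)
checkpoint = checkpoint_manager.load_checkpoint('best.tar')
encoder.load_state_dict(checkpoint['encoder_state_dict'])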