Example #1
def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    This assumes that the data has already been pre-processed with the Moses tokenizer
    Returns iterators for the training/dev dataset

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """

    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)

    # create a TranslationDataset for both the train and dev set
    train_data = datasets.TranslationDataset(exts=("train.de", "train.en"), fields=(
        SRC, TRG), path=data_path, filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)

    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # load in the Test Set
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)

    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # build the vocab using the training data
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use a custom DataIterator in order to minimize padding within a sequence
    # and to `pack` each batch as fully as possible, maximizing the computation
    # done on the GPU
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True, sort_within_batch=True, shuffle=True)

    # use a regular Iterator since we want to be able to compare
    # our translations to a gold standard file. If we use a
    # `DataIterator` then we will get our translations in shuffled/random
    # order
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size,
                            train=False, sort=False, repeat=False, device=device)

    # create Test Iterator for the test data
    test_iterator = Iterator(
        test_data, batch_size=1, train=False, sort=False, repeat=False, device=device)

    print(len(test_iterator))
    return train_iterator, dev_iterator, test_iterator, SRC, TRG
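
The snippet above relies on a batch_size_fn so that train_batch_size can be counted in tokens rather than sentences, but never shows that function. Below is a minimal sketch of such a token-counting helper, following the common pattern used with torchtext iterators; it is an assumption, not the snippet's actual code, and presumes examples expose .src and .trg token lists.

max_src_in_batch, max_tgt_in_batch = 0, 0

def batch_size_fn(new, count, sofar):
    """Return the padded token count if example `new` joins the current batch."""
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:  # first example of a fresh batch resets the running maxima
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # + <s> and </s>
    return max(count * max_src_in_batch, count * max_tgt_in_batch)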
Example #2
def get_mt_datasets(exts, fields, train_path, val_path, test_path):
    train = datasets.TranslationDataset(path=train_path,
                                        exts=exts,
                                        fields=fields)
    val = datasets.TranslationDataset(path=val_path, exts=exts, fields=fields)
    test = datasets.TranslationDataset(path=test_path,
                                       exts=exts,
                                       fields=fields)
    return train, val, test
Example #3
def build_field_dataset_vocab(data_directory, src_name, trg_name, vocab):

    tokenize = lambda x: x.split()

    # define the Field; source and target share one vocab here
    source = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        use_vocab=True,
        init_token='<sos>',
        eos_token='<eos>',
        pad_token='<pad>',
        unk_token='<unk>',
        batch_first=True,
        fix_length=50)  # include_lengths=True makes torch's pack_padded_sequence easier to use later

    # define the dataset
    train_data = datasets.TranslationDataset(
        path=data_directory,
        exts=(src_name, trg_name),
        fields=(source, source))  # source and target share a vocab, so the same Field works for both
    # build the vocabulary
    if vocab is None:
        source.build_vocab(train_data, min_freq=2)
    else:
        source.vocab = vocab

    return source, train_data
Example #4
def load_data(lang_dir, src_ext, tgt_ext, src_path=None, tgt_path=None):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<pad>'
    SRC = data.Field(
        tokenize=tokenize_es,
        init_token=BOS_WORD,
        eos_token=EOS_WORD,
        pad_token=BLANK_WORD) if src_path is None else load_field(src_path)
    TGT = data.Field(
        tokenize=tokenize_en,
        init_token=BOS_WORD,
        eos_token=EOS_WORD,
        pad_token=BLANK_WORD) if tgt_path is None else load_field(tgt_path)

    print("Loading data...")
    dataset = datasets.TranslationDataset(
        lang_dir, (src_ext, tgt_ext), (SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= 100 and len(
            vars(x)['trg']) <= 100)
    print("Data loaded!")

    train, valid, test = dataset.split(split_ratio=[0.7, 0.15, 0.15])

    if src_path is None:
        SRC.build_vocab(train.src, min_freq=2, max_size=39996)
    if tgt_path is None:
        TGT.build_vocab(train.trg, min_freq=2, max_size=39996)

    return SRC, TGT, train, valid, test
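
load_field is referenced above but not defined here. A plausible implementation (an assumption) simply unpickles a Field that was saved with dill, matching the SRC_Field.pt / TGT_Field.pt files written in Example #18:

import dill

def load_field(path):
    # unpickle a torchtext Field previously written with dill.dump
    with open(path, "rb") as f:
        return dill.load(f)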
Example #5
def load_data(lang1='de', lang2='en', directory=None):

    lang1_tokenizer = get_tokenizer(lang1)
    lang2_tokenizer = get_tokenizer(lang2)

    SRC = data.Field(tokenize=lang1_tokenizer, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=lang2_tokenizer,
                     init_token=BOS_WORD,
                     eos_token=EOS_WORD,
                     pad_token=BLANK_WORD)

    MAX_LEN = 100

    if directory:
        train, val = datasets.TranslationDataset(
            path=directory,
            exts=('.' + lang1, '.' + lang2),
            fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
                vars(x)['trg']) <= MAX_LEN).split()
    else:
        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'),
            fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
                vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
    return train, val, SRC, TGT  # TODO: find out exactly what each of these variables is
Example #6
    def __init__(self,
                 path,
                 SRC,
                 TGT,
                 exts=('.de', '.en'),
                 UNK='<unk>',
                 SOS='<s>',
                 EOS='</s>',
                 TMAX=100):
        '''

        '''
        self.ds = datasets.TranslationDataset(
            path=path,
            exts=exts,
            fields=(SRC, TGT),
            filter_pred=lambda x: len(x.src) < TMAX and len(x.trg) < TMAX)

        self.src = self.ds.src
        self.tgt = self.ds.trg

        self.SRC = SRC
        self.TGT = TGT

        self.sos = SOS
        self.unk = UNK
        self.eos = EOS
Example #7
def get_data(file_path, MIN_FREQ=2, DEVICE_SET=None):

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    PAD_WORD = "<pad>"

    field_en = data.Field(sequential=True,
                          use_vocab=True,
                          batch_first=True,
                          tokenize=str.split,
                          init_token=BOS_WORD,
                          eos_token=EOS_WORD,
                          pad_token=PAD_WORD)

    field_de = data.Field(sequential=True,
                          use_vocab=True,
                          batch_first=True,
                          tokenize=str.split,
                          init_token=BOS_WORD,
                          eos_token=EOS_WORD,
                          pad_token=PAD_WORD)

    trn = datasets.TranslationDataset(path=file_path,
                                      exts=('en', 'de'),
                                      fields=[('en', field_en),
                                              ('de', field_de)])

    field_en.build_vocab(trn.en, min_freq=MIN_FREQ)
    field_de.build_vocab(trn.de, min_freq=MIN_FREQ)

    return field_en, field_de, trn
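
Because the fields above are registered as ('en', field_en) and ('de', field_de), the examples expose .en and .de attributes rather than the usual .src/.trg. A hypothetical follow-up (the path prefix and batch size are assumptions) that respects those names when batching:

from torchtext import data

field_en, field_de, trn = get_data("data/europarl.")  # hypothetical path prefix ending in '.'
train_iter = data.BucketIterator(trn,
                                 batch_size=32,
                                 sort_key=lambda x: len(x.en),  # sort on .en, not .src
                                 sort_within_batch=True,
                                 shuffle=True)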
Example #8
def build_field_dataset_vocab(data_directory,
                              src_name,
                              trg_name,
                              vocab,
                              field_include_length=True,
                              oov=False):

    tokenize = lambda x: x.split()

    # define the Field; source and target share one vocab here
    source = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        use_vocab=True,
        init_token='<sos>',
        eos_token='<eos>',
        pad_token='<pad>',
        unk_token='<unk>',
        batch_first=True,
        fix_length=50,
        include_lengths=field_include_length
    )  # include_lengths=True makes torch's pack_padded_sequence easier to use later

    # define the dataset
    train_data = datasets.TranslationDataset(
        path=data_directory,
        exts=(src_name, trg_name),
        fields=(source, source))  # source and target share a vocab, so the same Field works for both
    # build the vocabulary
    if vocab is None:
        source.build_vocab(train_data, min_freq=2)  # tokens occurring fewer than 2 times map to <unk>
    else:
        source.vocab = vocab

    # collect the OOV words in the training set
    if oov:
        oov_words = get_oov_words(train_data, source.vocab.stoi)

    # split into training and validation sets; note: splitting with random_split would lose the fields attribute
    train_set, val_set = train_data.split(split_ratio=0.95,
                                          random_state=random.seed(1))

    BATCH_SIZE = 256
    # build iterators for the training and validation sets
    train_iterator, val_iterator = data.BucketIterator.splits(
        (train_set, val_set),
        batch_size=BATCH_SIZE,
        # shuffle=True,
        # device=device,
        sort_within_batch=True,  # if True, examples within a batch are sorted in descending order by sort_key
        sort_key=lambda x: len(x.src)  # sort by src length in descending order, mainly for the later pack/pad steps
        # repeat=False
    )
    if oov:
        return source, train_iterator, val_iterator, oov_words
    else:
        return source, train_iterator, val_iterator
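
The comments above point at include_lengths=True and torch's pack_padded_sequence. A minimal sketch of how the returned iterator would feed a packed RNN; the paths, sizes and model here are assumptions, not part of the snippet:

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

source, train_iterator, val_iterator = build_field_dataset_vocab(
    "data/corpus.", "src", "trg", vocab=None)  # hypothetical arguments

embedding = nn.Embedding(len(source.vocab), 128,
                         padding_idx=source.vocab.stoi['<pad>'])
rnn = nn.GRU(128, 256, batch_first=True)

for batch in train_iterator:
    src, src_len = batch.src  # a (padded tensor, lengths) tuple because include_lengths=True
    packed = pack_padded_sequence(embedding(src), src_len.cpu(),
                                  batch_first=True, enforce_sorted=True)
    output, hidden = rnn(packed)  # padded positions are skipped
    output, _ = pad_packed_sequence(output, batch_first=True)
    break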
Example #9
def get_dataloader(train_data_base: str, val_data_base: str,
                   test_data_base: str, ext: Dict):

    # load english
    en_vocab = Vocab()
    train_data_en = train_data_base + "." + ext["en"]
    with Path(train_data_en).open("r", encoding="utf-8") as f:
        sentences = [line.strip().split() for line in f]
    en_vocab.build_vocab(sentences)

    # load japanese
    ja_vocab = Vocab()
    train_data_ja = train_data_base + "." + ext["ja"]
    with Path(train_data_ja).open("r", encoding="utf-8") as f:
        sentences = [line.strip().split() for line in f]
    ja_vocab.build_vocab(sentences)

    src = CustomField(vocab=en_vocab,
                      bos_token=None,
                      eos_token=None,
                      lower=True,
                      tokenize=lambda x: x.strip().split(),
                      batch_first=True)

    tgt = CustomField(vocab=ja_vocab,
                      lower=False,
                      tokenize=lambda x: x.strip().split(),
                      batch_first=True)
    train_dataloader = datasets.TranslationDataset(path=train_data_base,
                                                   exts=("." + ext["en"],
                                                         "." + ext["ja"]),
                                                   fields=(src, tgt))

    val_dataloader = datasets.TranslationDataset(path=val_data_base,
                                                 exts=("." + ext["en"],
                                                       "." + ext["ja"]),
                                                 fields=(src, tgt))

    test_dataloader = data.TabularDataset(path=test_data_base + "." +
                                          ext["en"],
                                          format="tsv",
                                          fields=[('text', src)])

    return (train_dataloader, val_dataloader, test_dataloader), (en_vocab,
                                                                 ja_vocab)
Example #10
def get_dataset(dataset):
    languages = {
        "antoloji": "tr",
        "tur": "tr",
        "cz": "cz",
        "turkish": "tr",
        "eng": "en",
        "tur-lower": "tr",
        "cz-lower": "cz",
        "turkish-lower": "tr",
        "eng-lower": "en"
    }
    language = languages[dataset]
    tokenizer = get_tokenizer(language)

    def tok(seq):
        return tokenizer.EncodeAsIds(seq)

    src = data.Field(tokenize=tok,
                     init_token=1,
                     eos_token=2,
                     pad_token=3,
                     use_vocab=False)
    tgt = data.Field(tokenize=tok,
                     init_token=1,
                     eos_token=2,
                     pad_token=3,
                     use_vocab=False)
    mt_train = datasets.TranslationDataset(path='data/{}/{}.train'.format(
        language, dataset),
                                           exts=('.src', '.tgt'),
                                           fields=(src, tgt))
    mt_dev = datasets.TranslationDataset(path='data/{}/{}.dev'.format(
        language, dataset),
                                         exts=('.src', '.tgt'),
                                         fields=(src, tgt))
    mt_test = datasets.TranslationDataset(path='data/{}/{}.test'.format(
        language, dataset),
                                          exts=('.src', '.tgt'),
                                          fields=(src, tgt))
    return mt_train, mt_dev, mt_test
Example #11
    def init_dataloaders(self):
        batch_size = self.config.hp.batch_size
        project_path = self.config.firelab.project_path
        data_path_train = os.path.join(project_path, self.config.data.train)
        data_path_val = os.path.join(project_path, self.config.data.val)

        src = data.Field(batch_first=True, init_token='<bos>', eos_token='<eos>',)
        trg = data.Field(batch_first=True, init_token='<bos>', eos_token='<eos>')

        mt_train = datasets.TranslationDataset(
            path=data_path_train, exts=('.en', '.fr'), fields=(src, trg))
        mt_val = datasets.TranslationDataset(
            path=data_path_val, exts=('.en', '.fr'), fields=(src, trg))

        src.build_vocab(mt_train.src)
        trg.build_vocab(mt_train.trg)

        self.vocab_src = src.vocab
        self.vocab_trg = trg.vocab

        self.train_dataloader = data.BucketIterator(mt_train, batch_size, repeat=False)
        self.val_dataloader = data.BucketIterator(mt_val, batch_size, repeat=False)
Example #12
def get_dataset(dpath):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<blank>'
    EN = data.Field(pad_token=BLANK_WORD)
    JA = data.Field(init_token=BOS_WORD,
                    eos_token=EOS_WORD,
                    pad_token=BLANK_WORD)

    train = datasets.TranslationDataset(
            path=os.path.join(dpath, 'train'),
            exts=('.en', '.ja'),
            fields=(EN, JA))
    val = datasets.TranslationDataset(
            path=os.path.join(dpath, 'dev'),
            exts=('.en', '.ja'),
            fields=(EN, JA))

    MIN_FREQ = 2
    EN.build_vocab(train.src, min_freq=MIN_FREQ)
    JA.build_vocab(train.trg, min_freq=MIN_FREQ)
    return train, val, EN, JA
Example #13
    def load_translation(self,
                         data_path,
                         exts,
                         split_ratio=0.95,
                         batch_size=64,
                         dl_save_path=None):

        print("Loading parallel corpus [{}, {}]".format(
            data_path + exts[0], data_path + exts[1]))
        DATA = datasets.TranslationDataset(path=data_path,
                                           exts=exts,
                                           fields=(('src', self.SRC),
                                                   ('trg', self.TGT)))
        print("Successful.")

        train, valid = DATA.split(split_ratio=split_ratio)

        print("Building src and tgt vocab ...")
        self.SRC.build_vocab(train)
        self.TGT.build_vocab(train)
        self._add_index()
        print("Successful.")

        torch.save(self, dl_save_path, pickle_module=dill)
        print("The dataloader is save at {}".format(dl_save_path))

        train_iter = MyIterator(train,
                                batch_size=batch_size,
                                device=None,
                                repeat=False,
                                sort_key=lambda x: (len(x.src), len(x.trg)),
                                batch_size_fn=batch_size_fn,
                                train=True,
                                shuffle=True)
        valid_iter = MyIterator(valid,
                                batch_size=batch_size,
                                device=None,
                                repeat=False,
                                sort_key=lambda x: (len(x.src), len(x.trg)),
                                batch_size_fn=batch_size_fn,
                                train=True,
                                shuffle=True)

        return train_iter, valid_iter
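
MyIterator and batch_size_fn here come from elsewhere in the project. One possible definition of MyIterator, following the batch-pooling pattern popularised by the Annotated Transformer (an assumption, not this project's actual class):

from torchtext import data

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                # read ~100 batches worth of examples, sort them by length,
                # cut them into batches, then shuffle the resulting batches
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(sorted(p, key=self.sort_key),
                                         self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size, self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))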
Example #14
def createVocab(datafile,
                output,
                exts=('.de', '.en'),
                UNK='<unk>',
                SOS='<s>',
                EOS='</s>',
                MIN_FREQ=2,
                TMAX=100):
    import spacy
    spacy_x = spacy.load(exts[0][1:])
    spacy_y = spacy.load(exts[1][1:])

    def split_x(text):
        return [tok.text for tok in spacy_x.tokenizer(text)]

    def split_y(text):
        return [tok.text for tok in spacy_y.tokenizer(text)]

    SRC = data.Field(tokenize=split_x, unk_token=UNK, pad_token=UNK)
    TGT = data.Field(tokenize=split_y,
                     init_token=SOS,
                     eos_token=EOS,
                     unk_token=UNK,
                     pad_token=UNK)
    ds = datasets.TranslationDataset(
        path=datafile,
        exts=exts,
        fields=(SRC, TGT),
        filter_pred=lambda x: len(x.src) < TMAX and len(x.trg) < TMAX)
    SRC.build_vocab(ds.src, min_freq=MIN_FREQ)
    TGT.build_vocab(ds.trg, min_freq=MIN_FREQ)

    vocab_src = SRC.vocab.stoi
    vocab_tgt = TGT.vocab.stoi

    print('src vocab has length', len(vocab_src))
    print('tgt vocab has length', len(vocab_tgt))

    save_dict = {'src': vocab_src, 'tgt': vocab_tgt}

    import pickle
    with open(output, 'wb') as fs:
        pickle.dump(save_dict, fs)
Example #15
def load_wmt_small_dataset(args: argparse.ArgumentParser) -> LoadedDatasetType:
    src = data.Field(
        include_lengths=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
        fix_length=args.torchtext_src_fix_length,
    )

    trg = data.Field(
        include_lengths=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )

    mt_train = datasets.TranslationDataset(
        path=constants.WMT14_EN_FR_SMALL_TRAIN,
        exts=('.en', '.fr'),
        fields=(src, trg))

    return mt_train, src, trg
Example #16

# ---------- prepare dataset ----------

def len_filter(example):
    return len(example.src) <= opt.max_len and len(example.tgt) <= opt.max_len


EN = SentencePieceField(init_token=Constants.BOS_WORD,
                        eos_token=Constants.EOS_WORD,
                        batch_first=True,
                        include_lengths=True,
                        fix_length=opt.max_len + 1)

train = datasets.TranslationDataset(
    path='./data/dualgan/train',
    exts=('.billion.sp', '.use.sp'), fields=[('src', EN), ('tgt', EN)],
    filter_pred=len_filter)
val = datasets.TranslationDataset(
    path='./data/dualgan/val',
    exts=('.billion.sp', '.use.sp'), fields=[('src', EN), ('tgt', EN)],
    filter_pred=len_filter)
train_lang8, val_lang8 = Lang8.splits(
    exts=('.err.sp', '.cor.sp'), fields=[('src', EN), ('tgt', EN)],
    train='test', validation='test', test=None, filter_pred=len_filter)

# load the vocabulary (to keep it consistent)
try:
    logging.info('Load vocab from %s' % opt.load_vocab_from)
    EN.load_vocab(opt.load_vocab_from)
except FileNotFoundError:
    EN.build_vocab_from(opt.build_vocab_from)
Example #17
def main():
    parser = argparse.ArgumentParser()
    opt = options.train_options(parser)
    opt = parser.parse_args()

    opt.cuda = torch.cuda.is_available()
    opt.device = None if opt.cuda else -1

    # quick settings overrides
    opt.exp_dir = './experiment/transformer-reinforce/use_billion'
    opt.load_vocab_from = './experiment/transformer/lang8-cor2err/vocab.pt'
    opt.build_vocab_from = './data/billion/billion.30m.model.vocab'

    opt.load_D_from = opt.exp_dir
    # opt.load_D_from = None

    # dataset params
    opt.max_len = 20

    # G params
    # opt.load_G_a_from = './experiment/transformer/lang8-err2cor/'
    # opt.load_G_b_from = './experiment/transformer/lang8-cor2err/'
    opt.d_word_vec = 300
    opt.d_model = 300
    opt.d_inner_hid = 600
    opt.n_head = 6
    opt.n_layers = 3
    opt.embs_share_weight = False
    opt.beam_size = 1
    opt.max_token_seq_len = opt.max_len + 2  # includes <BOS>, <EOS>
    opt.n_warmup_steps = 4000

    # D params
    opt.embed_dim = opt.d_model
    opt.num_kernel = 100
    opt.kernel_sizes = [3, 4, 5, 6, 7]
    opt.dropout_p = 0.25

    # train params
    opt.batch_size = 1
    opt.n_epoch = 10

    if not os.path.exists(opt.exp_dir):
        os.makedirs(opt.exp_dir)
    logging.basicConfig(filename=opt.exp_dir + '/.log',
                        format=LOG_FORMAT,
                        level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler())

    logging.info('Use CUDA? ' + str(opt.cuda))
    logging.info(opt)

    # ---------- prepare dataset ----------

    def len_filter(example):
        return len(example.src) <= opt.max_len and len(
            example.tgt) <= opt.max_len

    EN = SentencePieceField(init_token=Constants.BOS_WORD,
                            eos_token=Constants.EOS_WORD,
                            batch_first=True,
                            include_lengths=True)

    train = datasets.TranslationDataset(path='./data/dualgan/train',
                                        exts=('.billion.sp', '.use.sp'),
                                        fields=[('src', EN), ('tgt', EN)],
                                        filter_pred=len_filter)
    val = datasets.TranslationDataset(path='./data/dualgan/val',
                                      exts=('.billion.sp', '.use.sp'),
                                      fields=[('src', EN), ('tgt', EN)],
                                      filter_pred=len_filter)
    train_lang8, val_lang8 = Lang8.splits(exts=('.err.sp', '.cor.sp'),
                                          fields=[('src', EN), ('tgt', EN)],
                                          train='test',
                                          validation='test',
                                          test=None,
                                          filter_pred=len_filter)

    # load the vocabulary (to keep it consistent)
    try:
        logging.info('Load vocab from %s' % opt.load_vocab_from)
        EN.load_vocab(opt.load_vocab_from)
    except FileNotFoundError:
        EN.build_vocab_from(opt.build_vocab_from)
        EN.save_vocab(opt.load_vocab_from)

    logging.info('Vocab len: %d' % len(EN.vocab))

    # sanity-check the Constants
    assert EN.vocab.stoi[Constants.BOS_WORD] == Constants.BOS
    assert EN.vocab.stoi[Constants.EOS_WORD] == Constants.EOS
    assert EN.vocab.stoi[Constants.PAD_WORD] == Constants.PAD
    assert EN.vocab.stoi[Constants.UNK_WORD] == Constants.UNK

    # ---------- init model ----------

    # G = build_G(opt, EN, EN)
    hidden_size = 512
    bidirectional = True
    encoder = EncoderRNN(len(EN.vocab),
                         opt.max_len,
                         hidden_size,
                         n_layers=1,
                         bidirectional=bidirectional)
    decoder = DecoderRNN(len(EN.vocab),
                         opt.max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         n_layers=1,
                         dropout_p=0.2,
                         use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=Constants.EOS,
                         sos_id=Constants.BOS)
    G = Seq2seq(encoder, decoder)
    for param in G.parameters():
        param.data.uniform_(-0.08, 0.08)

    # optim_G = ScheduledOptim(optim.Adam(
    #     G.get_trainable_parameters(),
    #     betas=(0.9, 0.98), eps=1e-09),
    #     opt.d_model, opt.n_warmup_steps)
    optim_G = optim.Adam(G.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-09)
    loss_G = NLLLoss(size_average=False)
    if torch.cuda.is_available():
        loss_G.cuda()

    # pretrain D first
    if opt.load_D_from:
        D = load_model(opt.load_D_from)
    else:
        D = build_D(opt, EN)
    optim_D = torch.optim.Adam(D.parameters(), lr=1e-4)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit_G = get_criterion(len(EN.vocab))
    crit_D = nn.BCELoss()

    if opt.cuda:
        G.cuda()
        D.cuda()
        crit_G.cuda()
        crit_D.cuda()

    # ---------- train ----------

    trainer_D = trainers.DiscriminatorTrainer()

    if not opt.load_D_from:
        for epoch in range(1):
            logging.info('[Pretrain D Epoch %d]' % epoch)

            pool = helper.DiscriminatorDataPool(opt.max_len, D.min_len,
                                                Constants.PAD)

            # fill the pool with data
            train_iter = data.BucketIterator(dataset=train,
                                             batch_size=opt.batch_size,
                                             device=opt.device,
                                             sort_key=lambda x: len(x.src),
                                             repeat=False)
            pool.fill(train_iter)

            # train D
            trainer_D.train(D,
                            train_iter=pool.batch_gen(),
                            crit=crit_D,
                            optimizer=optim_D)
            pool.reset()

        Checkpoint(model=D,
                   optimizer=optim_D,
                   epoch=0,
                   step=0,
                   input_vocab=EN.vocab,
                   output_vocab=EN.vocab).save(opt.exp_dir)

    def eval_D():
        pool = helper.DiscriminatorDataPool(opt.max_len, D.min_len,
                                            Constants.PAD)
        val_iter = data.BucketIterator(dataset=val,
                                       batch_size=opt.batch_size,
                                       device=opt.device,
                                       sort_key=lambda x: len(x.src),
                                       repeat=False)
        pool.fill(val_iter)
        trainer_D.evaluate(D, val_iter=pool.batch_gen(), crit=crit_D)

        # eval_D()

    # Train G
    ALPHA = 0
    for epoch in range(100):
        logging.info('[Epoch %d]' % epoch)
        train_iter = data.BucketIterator(dataset=train,
                                         batch_size=1,
                                         device=opt.device,
                                         sort_within_batch=True,
                                         sort_key=lambda x: len(x.src),
                                         repeat=False)

        for step, batch in enumerate(train_iter):
            src_seq = batch.src[0]
            src_length = batch.src[1]
            tgt_seq = src_seq[0].clone()
            # gold = tgt_seq[:, 1:]

            optim_G.zero_grad()
            loss_G.reset()

            decoder_outputs, decoder_hidden, other = G.rollout(src_seq,
                                                               None,
                                                               None,
                                                               n_rollout=1)
            for i, step_output in enumerate(decoder_outputs):
                batch_size = tgt_seq.size(0)
                # print(step_output)

                # loss_G.eval_batch(step_output.contiguous().view(batch_size, -1), tgt_seq[:, i + 1])

            softmax_output = torch.exp(
                torch.cat([x for x in decoder_outputs], dim=0)).unsqueeze(0)
            softmax_output = helper.stack(softmax_output, 8)

            print(softmax_output)
            rollout = softmax_output.multinomial(1)
            print(rollout)

            tgt_seq = helper.pad_seq(tgt_seq.data,
                                     max_len=len(decoder_outputs) + 1,
                                     pad_value=Constants.PAD)
            tgt_seq = autograd.Variable(tgt_seq)
            for i, step_output in enumerate(decoder_outputs):
                batch_size = tgt_seq.size(0)
                loss_G.eval_batch(
                    step_output.contiguous().view(batch_size, -1),
                    tgt_seq[:, i + 1])
            G.zero_grad()
            loss_G.backward()
            optim_G.step()

            if step % 100 == 0:
                pred = torch.cat([x for x in other['sequence']], dim=1)
                print('[step %d] loss_rest %.4f' %
                      (epoch * len(train_iter) + step, loss_G.get_loss()))
                print('%s -> %s' %
                      (EN.reverse(tgt_seq.data)[0], EN.reverse(pred.data)[0]))

    # Reinforce Train G
    for p in D.parameters():
        p.requires_grad = False
Example #18
import dill
from torchtext import datasets, data
from translation.data_loader import tokenize_en, tokenize_es

if __name__ == "__main__":
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<pad>'
    SRC = data.Field(tokenize=tokenize_es,
                     init_token=BOS_WORD,
                     eos_token=EOS_WORD,
                     pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en,
                     init_token=BOS_WORD,
                     eos_token=EOS_WORD,
                     pad_token=BLANK_WORD)

    dataset = datasets.TranslationDataset(
        'data/en-es/en-es_', ('en.txt', 'es.txt'), (SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= 100 and len(
            vars(x)['trg']) <= 100)

    SRC.build_vocab(dataset.src, min_freq=2, max_size=39996)
    TGT.build_vocab(dataset.trg, min_freq=2, max_size=39996)

    src_file = open("data/SRC_Field.pt", "wb")
    tgt_file = open("data/TGT_Field.pt", "wb")
    dill.dump(SRC, src_file)
    dill.dump(TGT, tgt_file)

    print("Field files generated!")
Example #19
def load_data(sum_num=30000, max_length=10):
    spacy_fr = spacy.load('fr_core_news_sm')
    spacy_en = spacy.load("en_core_web_sm")
    tokenize_eng = lambda text: [tok.text for tok in spacy_en.tokenizer(text)
                                 ][::-1]  # TODO: why is the order reversed?
    tokenize_fren = lambda text: [tok.text for tok in spacy_fr.tokenizer(text)]
    build_new_data(sum_num=sum_num, max_length=max_length)
    temp_tokenizer = lambda x: x.strip().split()

    # eng_field = data.Field(tokenize = tokenize_eng,
    #         init_token = '<sos>',
    #         eos_token = '<eos>',
    #         lower = True)

    # fren_field = data.Field(tokenize = tokenize_fren,
    #         init_token = '<sos>',
    #         eos_token = '<eos>',
    #         lower = True)

    # train, val, test = datasets.Multi30k.splits(exts = ('.de', '.en'),
    #                                                 fields = (eng_field, fren_field))

    # eng_field.build_vocab(train.src, min_freq=3)
    # fren_field.build_vocab(train.trg, min_freq=3)
    # if True: return eng_field, fren_field, (train, val, test)

    eng_field = data.Field(tokenize=tokenize_eng,
                           init_token=START_WORD,
                           eos_token=END_WORD)
    fren_field = data.Field(tokenize=tokenize_fren,
                            init_token=START_WORD,
                            eos_token=END_WORD)

    # eng_field = data.Field(sequential=True, # sequential data
    #                     use_vocab=True, # use a vocabulary
    #                     init_token=START_WORD,
    #                     eos_token=END_WORD,
    #                     fix_length=max_length, # maximum length
    #                     tokenize=tokenize, # tokenization method
    #                     unk_token=UNKNOWN_WORD, # token for out-of-vocabulary words
    #                     batch_first=True, # put the batch dimension first
    #                     include_lengths=True # return a tuple of the padded minibatch and a list of each example's length
    #                     )
    # fren_field = data.Field(sequential=True, # sequential data
    #                     use_vocab=True, # use a vocabulary
    #                     init_token=START_WORD,
    #                     eos_token=END_WORD,
    #                     fix_length=max_length, # maximum length
    #                     tokenize=tokenize, # tokenization method
    #                     unk_token=UNKNOWN_WORD,
    #                     batch_first = True,
    #                     include_lengths=True
    #                     )
    dataset = datasets.TranslationDataset(path='./data/small',
                                          exts=('.en', '.fr'),
                                          fields=(eng_field, fren_field))
    train, val, test = dataset.splits(exts=('.en', '.fr'),
                                      fields=(eng_field, fren_field),
                                      path='./data/')
    print('len(train.examples)', len(train.examples))
    print('len(val.examples)', len(val.examples))
    print('len(test.examples)', len(test.examples))
    eng_field.build_vocab(train.src, min_freq=2)
    fren_field.build_vocab(train.trg, min_freq=2)
    print('len(src_field.vocab)', len(eng_field.vocab))
    print('len(trg_field.vocab)', len(fren_field.vocab))
    return eng_field, fren_field, (train, val, test)
Example #20
def main() -> None:
    parser = get_arg_parser()
    args = parser.parse_args()
    device = "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
    print('using device {}'.format(device))

    print('loading datasets...')
    src = data.Field(include_lengths=True,
                     init_token='<sos>',
                     eos_token='<eos>',
                     batch_first=True,
                     fix_length=200)
    trg = data.Field(include_lengths=True,
                     init_token='<sos>',
                     eos_token='<eos>',
                     batch_first=True)

    if args.dataset == 'WMT':
        mt_train = datasets.TranslationDataset(
            path=constants.WMT14_EN_FR_SMALL_TRAIN,
            exts=('.en', '.fr'),
            fields=(src, trg))
        src_vocab, trg_vocab = utils.load_torchtext_wmt_small_vocab()
        src.vocab = src_vocab

        trg.vocab = trg_vocab

        mt_valid = None
    else:
        if args.dataset == 'Multi30k':
            mt_train, mt_valid, mt_test = datasets.Multi30k.splits(
                exts=('.en', '.de'),
                fields=(src, trg),
            )
        elif args.dataset == 'IWSLT':
            mt_train, mt_valid, mt_test = datasets.IWSLT.splits(
                exts=('.en', '.de'),
                fields=(src, trg),
            )
        else:
            raise Exception("Uknown dataset: {}".format(args.dataset))

        print('loading vocabulary...')

        # mt_dev shares the fields, so it shares their vocab objects
        src.build_vocab(
            mt_train,
            min_freq=args.torchtext_unk,
            max_size=args.torchtext_src_max_vocab,
        )

        trg.build_vocab(
            mt_train,
            max_size=args.torchtext_trg_max_vocab,
        )
        print('loaded vocabulary')

    # determine the correct dataset to evaluate
    eval_dataset = mt_train if args.eval_train else mt_valid
    eval_dataset = mt_test if args.eval_test else eval_dataset

    train_loader = data.BucketIterator(
        dataset=eval_dataset,
        batch_size=1,
        sort_key=lambda x: len(
            x.src),  # data.interleave_keys(len(x.src), len(x.trg)),
        sort_within_batch=True,
        device=device)

    print('model type: {}'.format(args.model_type))
    model = utils.build_model(parser, src.vocab, trg.vocab)
    if args.load_path is not None:
        model.load_state_dict(torch.load(args.load_path))
    model = model.eval()
    if args.binarize:
        print('binarizing model')
        binarized_model = Binarize(model)
        binarized_model.binarization()

    print(model)

    model_size = size_metrics.get_model_size(model)
    print("64 bit float: {}".format(
        size_metrics.get_model_size(model, 64, args.binarize)))
    print("32 bit float: {}".format(
        size_metrics.get_model_size(model, 32, args.binarize)))
    print("16 bit float: {}".format(
        size_metrics.get_model_size(model, 16, args.binarize)))
Example #21
def main():
    src_dir = "data/src"
    model_dir = "data/model"
    eval_dir = "data/eval"

    corpus = "lang8_small"

    en_emb = "glove"
    de_emb = "glove"

    seq_train = False

    emb_dim = 200
    batch_size = 1500

    # Data Loading
    vocab_file = os.path.join(model_dir, "%s.vocab" % (corpus))
    model_file = os.path.join(
        model_dir, "%s.%s.%s.transformer.pt" % (corpus, en_emb, de_emb))

    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)

    # Computing Unit
    device = torch.device("cpu")

    # Loading Data
    bos_word = '<s>'
    eos_word = '</s>'

    blank_word = '<blank>'
    min_freq = 2

    spacy_en = spacy.load('en')

    def tokenize(text):
        return [tkn.text for tkn in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize,
                      init_token=bos_word,
                      eos_token=eos_word,
                      pad_token=blank_word)

    test = datasets.TranslationDataset(path=os.path.join(src_dir, corpus),
                                       exts=('.test.src', '.test.trg'),
                                       fields=(TEXT, TEXT))
    # use the same order as original data
    test_iter = data.Iterator(test,
                              batch_size=batch_size,
                              device=device,
                              sort=False,
                              repeat=False,
                              train=False)

    random_idx = random.randint(0, len(test) - 1)
    print(test[random_idx].src)
    print(test[random_idx].trg)

    # Vocabulary

    TEXT.vocab = torch.load(vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]

    print("Load %s vocabuary; vocab size = %d" % (corpus, len(TEXT.vocab)))

    # Word Embedding

    encoder_emb, decoder_emb = get_emb(en_emb,
                                       de_emb,
                                       TEXT.vocab,
                                       device,
                                       d_model=emb_dim)

    # Translation
    model = BuildModel(len(TEXT.vocab),
                       encoder_emb,
                       decoder_emb,
                       d_model=emb_dim).to(device)
    model.load_state_dict(torch.load(model_file))
    model.eval()

    print("Predicting %s ..." % (corpus))

    src, trg, pred = [], [], []
    for batch in (rebatch(pad_idx, b) for b in test_iter):
        out = greedy_decode(model, TEXT.vocab, batch.src, batch.src_mask)
        # print("SRC OUT: ", src.shape, out.shape)
        probs = model.generator(out)
        _, prediction = torch.max(probs, dim=-1)

        source = [[TEXT.vocab.itos[word] for word in words[1:]]
                  for words in batch.src]
        target = [[TEXT.vocab.itos[word] for word in words[1:]]
                  for words in batch.trg]
        translation = [[TEXT.vocab.itos[word] for word in words]
                       for words in prediction]

        for i in range(len(translation)):
            src.append(' '.join(source[i]).split('</s>')[0])
            trg.append(' '.join(target[i]).split('</s>')[0])
            pred.append(' '.join(translation[i]).split('</s>')[0])

            # skip printing examples with unknown words in src or trg
            if '<unk>' in src[-1] or '<unk>' in trg[-1]:
                continue

            print("Source:", src[-1])
            print("Target:", trg[-1])
            print("Translation:", pred[-1])
            print()

    prefix = os.path.join(eval_dir, '%s.%s.%s.eval' % (corpus, en_emb, de_emb))
    for sentences, ext in zip([src, trg, pred], ['.src', '.trg', '.pred']):
        with open(prefix + ext, 'w+') as f:
            f.write('\n'.join(sentences))
Example #22
    resource = torch.load('data/tfm-40768.pt')
    model_dict, field = resource['model'], resource['field']

    vocab_size = len(field.vocab.stoi)
    pad_index = field.vocab.stoi['<pad>']

    model = ParallelTransformer(
        module=Transformer(vocab_size).to(device),
        device_ids=device_ids,
        output_device=device,
        dim=1
    )

    model.load_state_dict(model_dict)

    test = datasets.TranslationDataset(
        path='data/bpe.test',
        exts=('.tgl', '.en'),
        fields=(('src', field), ('trg', field))
    )

    test_iter = data.BucketIterator(
        test,
        batch_size=256,
        batch_size_fn=lambda ex, bs, sz: sz + len(ex.src), device=device,
        train=False
    )

    result = run_epoch(test_iter, model, field, device)
Example #23
    with open(file_name, "w") as file:
        for line in hypotheses:
            file.write(line + "\n")

    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
    print(bleu)


for error in range(1, 2):
    my_data = {}
    num_batches = 100
    error_per = error / 10.

    for split in ["train", "val", "test"]:
        my_data[split] = datasets.TranslationDataset(path="data/new_" + split,
                                                     exts=('.nl', '.amr'),
                                                     fields=(SRC, TRG))
    MIN_FREQ = 5
    SRC.build_vocab(my_data["train"].src, min_freq=MIN_FREQ)
    TRG.build_vocab(my_data["train"].trg, min_freq=MIN_FREQ)

    PAD_INDEX = TRG.vocab.stoi[PAD_TOKEN]

    print_data_info(my_data, SRC, TRG)
    train_iter = data.BucketIterator(my_data["train"],
                                     batch_size=100,
                                     train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x:
                                     (len(x.src), len(x.trg)),
                                     repeat=False,
Example #24
                ) #unk=0, pad=1

tgt = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_de,
                 lower=True,
                 init_token=BOS,
                 eos_token=EOS,
                 include_lengths=True,

                 ) #unk=0, pad=1, <s>=2, </s>=3

prefix_f = './escape.en-de.tok.100k'

parallel_dataset = datasets.TranslationDataset(path=prefix_f, exts=('.en', '.de'), fields=[('src', src), ('tgt', tgt)])

src.build_vocab(parallel_dataset, min_freq=5, max_size=15000)
tgt.build_vocab(parallel_dataset, min_freq=5, max_size=15000)

train, valid = parallel_dataset.split(split_ratio=0.97)

train_iter, valid_iter = data.BucketIterator.splits((train, valid), batch_size=32,
                                                    sort_key=lambda x: data.interleave_keys(len(x.src), len(x.tgt)),  # fields are named 'src' and 'tgt' above
                                                    device='cuda')

class Encoder(nn.Module):
    def __init__(self, hidden_dim: int, dropout: float, pad_idx: int):
        super().__init__()

        self.dim = hidden_dim
Example #25
import torch
from torchtext import data, datasets, vocab
import os

dataDir = '/Users/xinyi.ye/Documents/machine_translate/experiments/train4/'

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"

# Fields define how to deal with the raw data
SRC = data.Field(pad_token=BLANK_WORD)  # tokenizer defaults to str.split
TGT = data.Field(init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD)

traindataset = datasets.TranslationDataset(path=dataDir + 'train-infoq',
                                           exts=('.en', '.zh'),
                                           fields=(SRC, TGT))

pwd = os.getcwd()

SRC.build_vocab(traindataset,
                vectors=vocab.Vectors(name='cc.en.300.vec',
                                      cache=pwd + '/.vector_cache'))
TGT.build_vocab(traindataset,
                vectors=vocab.Vectors(name='cc.zh.300.vec',
                                      cache=pwd + '/.vector_cache'))
Example #26
    return [tok.text for tok in spacy_en.tokenizer(text)]


BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en,
                 init_token=BOS_WORD,
                 eos_token=EOS_WORD,
                 pad_token=BLANK_WORD)

MAX_LEN = 40
dataset = datasets.TranslationDataset(
    path='WMT14/europarl-v7',
    exts=('.de', '.en'),
    fields=(SRC, TGT),
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
        vars(x)['trg']) <= MAX_LEN)

MIN_SRC_FREQ = 9  # tokens rarer than this are dropped; setting it too low makes the vocab too large and the embedding fails
MIN_TGT_FREQ = 3
SRC.build_vocab(dataset.src, min_freq=MIN_SRC_FREQ)
TGT.build_vocab(dataset.trg, min_freq=MIN_TGT_FREQ)
len1 = SRC.vocab.__len__()
len2 = TGT.vocab.__len__()
print('build_vocab is successful')
'''
#  pass in the data
import spacy
from torchtext import datasets, data
spacy_de = spacy.load('de')  # nlp =spacy.load('de_core_news_sm')
Example #27
def main():
    args = parse_args()
    SRC_DIR = args.SRC_DIR
    MODEL_DIR = args.MODEL_DIR
    DATA = args.DATA
    EN_EMB = args.EN_EMB
    DE_EMB = args.DE_EMB
    SEQ_TRAIN = True if DE_EMB == 'elmo' else False

    # TODO currently hidden size is fixed, should be able to adjust
    #      based on src and trg embeddings respectively
    # EMB_DIM should be multiple of h (default 8), look at MultiHeadedAttention
    if 'glove' in EN_EMB:
        EMB_DIM = 200
    elif 'elmo' in EN_EMB:
        EMB_DIM = 1024
    else:
        EMB_DIM = 512

    BATCH_SIZE = 1500
    EPOCHES = 100

    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    vocab_file = os.path.join(MODEL_DIR, '%s.vocab' % (DATA))
    model_file = os.path.join(
        MODEL_DIR, '%s.%s.%s.transformer.pt' % (DATA, EN_EMB, DE_EMB))

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    # GPU to use
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = ("cpu")
    # devices = [0, 1, 2, 3]

    #####################
    #   Data Loading    #
    #####################
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    MIN_FREQ = 2

    spacy_en = spacy.load('en')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize_en,
                      init_token=BOS_WORD,
                      eos_token=EOS_WORD,
                      pad_token=BLANK_WORD)

    train = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
                                        exts=('.train.src', '.train.trg'),
                                        fields=(TEXT, TEXT))
    val = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
                                      exts=('.val.src', '.val.trg'),
                                      fields=(TEXT, TEXT))

    train_iter = MyIterator(train,
                            batch_size=BATCH_SIZE,
                            device=device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    valid_iter = MyIterator(val,
                            batch_size=BATCH_SIZE,
                            device=device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=False)

    random_idx = random.randint(0, len(train) - 1)
    print(train[random_idx].src)
    print(train[random_idx].trg)

    ###############
    #  Vocabulary  #
    ###############
    if os.path.exists(vocab_file):
        TEXT.vocab = torch.load(vocab_file)
    else:
        print("Save %s vocabuary..." % (DATA), end='\t')
        TEXT.build_vocab(train.src, min_freq=MIN_FREQ, vectors='glove.6B.200d')
        print("vocab size = %d" % (len(TEXT.vocab)))
        torch.save(TEXT.vocab, vocab_file)

    pad_idx = TEXT.vocab.stoi["<blank>"]

    #####################
    #   Word Embedding  #
    #####################
    encoder_emb, decoder_emb = get_emb(EN_EMB,
                                       DE_EMB,
                                       TEXT.vocab,
                                       device,
                                       d_model=EMB_DIM,
                                       elmo_options=options_file,
                                       elmo_weights=weight_file)

    ##########################
    #   Training the System  #
    ##########################
    model = make_model(len(TEXT.vocab),
                       encoder_emb,
                       decoder_emb,
                       d_model=EMB_DIM).to(device)
    if os.path.exists(model_file):
        print("Restart from last checkpoint...")
        model.load_state_dict(torch.load(model_file))

    criterion = LabelSmoothing(size=len(TEXT.vocab),
                               padding_idx=pad_idx,
                               smoothing=0.1).to(device)
    model_opt = NoamOpt(
        EMB_DIM, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))

    # calculate parameters
    total_params = sum(p.numel() for p in model.parameters()) // 1000000
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad) // 1000000
    rate = trainable_params / total_params
    print("Model parameters trainable (%d M) / total (%d M) = %f" %
          (trainable_params, total_params, rate))

    print("Training %s %s %s..." % (DATA, EN_EMB, DE_EMB))
    ### SINGLE GPU
    for epoch in range(EPOCHES):
        model.train()
        loss_compute = SimpleLossCompute(model.generator,
                                         criterion,
                                         opt=model_opt)
        run_epoch((rebatch(pad_idx, b) for b in train_iter),
                  model,
                  loss_compute,
                  TEXT.vocab,
                  seq_train=SEQ_TRAIN)

        model.eval()
        total_loss, total_tokens = 0, 0
        for batch in (rebatch(pad_idx, b) for b in valid_iter):
            out = greedy_decode(model,
                                TEXT.vocab,
                                batch.src,
                                batch.src_mask,
                                trg=batch.trg)
            loss = loss_compute(out, batch.trg_y, batch.ntokens)
            total_loss += loss
            total_tokens += batch.ntokens

        print("Save model...")
        torch.save(model.state_dict(), model_file)

        print("Epoch %d/%d - Loss: %f" %
              (epoch + 1, EPOCHES, total_loss / total_tokens))
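
Examples #21 and #27 both wrap torchtext batches with rebatch(pad_idx, b) before running the transformer. A sketch of what such helpers might look like in the Annotated-Transformer style; this is an assumption, not the snippets' actual code:

import numpy as np
import torch

def subsequent_mask(size):
    # mask out future positions so the decoder cannot attend to them
    attn_shape = (1, size, size)
    return torch.from_numpy(np.triu(np.ones(attn_shape), k=1).astype('uint8')) == 0

class Batch:
    "Holds a batch of examples together with the masks used during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]   # decoder input
            self.trg_y = trg[:, 1:]  # expected output, shifted by one
            self.trg_mask = (self.trg != pad).unsqueeze(-2) & \
                subsequent_mask(self.trg.size(-1)).type_as(self.src_mask)
            self.ntokens = (self.trg_y != pad).sum()

def rebatch(pad_idx, batch):
    # torchtext Fields default to time-major tensors, so transpose to batch-major
    return Batch(batch.src.transpose(0, 1), batch.trg.transpose(0, 1), pad_idx)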
Example #28
def main():
    args_parser = argparse.ArgumentParser(
        description='Tuning with graph-based parsing')
    args_parser.add_argument('--cuda', action='store_true', help='using GPU')
    args_parser.add_argument('--num_epochs',
                             type=int,
                             default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size',
                             type=int,
                             default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size',
                             type=int,
                             default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--num_layers',
                             type=int,
                             default=1,
                             help='Number of layers of RNN')
    args_parser.add_argument('--opt',
                             choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective',
                             choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--learning_rate',
                             type=float,
                             default=0.01,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate',
                             type=float,
                             default=0.05,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--clip',
                             type=float,
                             default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma',
                             type=float,
                             default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon',
                             type=float,
                             default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn',
                             nargs=2,
                             type=float,
                             default=0.1,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in',
                             type=float,
                             default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out',
                             type=float,
                             default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument('--schedule',
                             type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument(
        '--unk_replace',
        type=float,
        default=0.,
        help='The rate to replace a singleton word with UNK')
    #args_parser.add_argument('--punctuation', nargs='+', type=str, help='List of punctuations')
    args_parser.add_argument('--word_path',
                             help='path for word embedding dict')
    args_parser.add_argument(
        '--freeze',
        action='store_true',
        help='frozen the word embedding (disable fine-tuning).')
    # args_parser.add_argument('--char_path', help='path for character embedding dict')
    args_parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path',
                             help='path for saving model file.',
                             default='models/temp')
    args_parser.add_argument('--model_name',
                             help='name for saving model file.',
                             default='generator')

    args_parser.add_argument('--seq2seq_save_path',
                             default='checkpoints/seq2seq_save_model',
                             type=str,
                             help='seq2seq_save_path')
    args_parser.add_argument('--seq2seq_load_path',
                             default='checkpoints/seq2seq_save_model',
                             type=str,
                             help='seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_save_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_save_path')
    # args_parser.add_argument('--rl_finetune_network_save_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_save_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_load_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_network_load_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_load_path')

    args_parser.add_argument('--direct_eval',
                             action='store_true',
                             help='direct eval without generation process')
    args = args_parser.parse_args()

    spacy_en = spacy.load('en_core_web_sm')  # python -m spacy download en_core_web_sm
    spacy_de = spacy.load('de_core_news_sm')  # python -m spacy download de_core_news_sm
    spacy_fr = spacy.load('fr_core_news_sm')  # python -m spacy download fr_core_news_sm

    SEED = 0
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    # hardcoded to GPU index 2; for portability, prefer
    # torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device = torch.device('cuda:2')

    def tokenizer_en(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    def tokenizer_de(text):  # create a tokenizer function
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenizer_fr(text):  # create a tokenizer function
        return [tok.text for tok in spacy_fr.tokenizer(text)]

    en_field = data.Field(sequential=True,
                          tokenize=tokenizer_en,
                          lower=True,
                          fix_length=150,
                          include_lengths=True,
                          batch_first=True)  #use_vocab=False
    de_field = data.Field(sequential=True,
                          tokenize=tokenizer_de,
                          lower=True,
                          fix_length=150,
                          include_lengths=True,
                          batch_first=True)  #use_vocab=False
    fr_field = data.Field(sequential=True,
                          tokenize=tokenizer_fr,
                          lower=True,
                          fix_length=150,
                          include_lengths=True,
                          batch_first=True)  #use_vocab=False
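    # fix_length=150 above pads or truncates every sequence to exactly 150 tokens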
    print('begin loading training data-----')
    print('time: ', time.asctime(time.localtime(time.time())))
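    # NOTE: the training and validation datasets below both read the
    # 'wmt14_3/sample' prefix, presumably a small debugging split; point them at
    # the real train/dev prefixes for a full run.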
    seq2seq_train_data = MultiSourceTranslationDataset(
        path='wmt14_3/sample',
        exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))
    print('begin loading validation data-----')
    print('time: ', time.asctime(time.localtime(time.time())))
    seq2seq_dev_data = MultiSourceTranslationDataset(
        path='wmt14_3/sample',
        exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))
    print('end loading data-----')
    print('time: ', time.asctime(time.localtime(time.time())))

    fr_train_data = datasets.TranslationDataset(path='wmt14_3/train',
                                                exts=('.fr', '.fr'),
                                                fields=(fr_field, fr_field))
    print('end fr data add-----')
    print('time: ', time.asctime(time.localtime(time.time())))
    fr_field.build_vocab(fr_train_data,
                         max_size=80000)  # ,vectors="glove.6B.100d"
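    # cache the vocab on disk so later runs can reload it instead of rebuilding;
    # the immediate reload below presumably just verifies the round trip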
    with open('vocab_fr.pickle', 'wb') as f:
        pickle.dump(fr_field.vocab, f)
    print('end fr vocab save-----')
    print('time: ', time.asctime(time.localtime(time.time())))
    with open('vocab_fr.pickle', 'rb') as f:
        fr_field.vocab = pickle.load(f)
    print('end fr vocab load-----')
    print('time: ', time.asctime(time.localtime(time.time())))
Example #29
0
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--embedding-size',
                        type=int,
                        dest="embedding_size",
                        help="Embedding size",
                        default=EMBEDDING_DIM)
    parser.add_argument('--hidden-size',
                        type=int,
                        dest="hidden_size",
                        help="Hidden size",
                        default=HIDDEN_SIZE)
    parser.add_argument('--nlayers',
                        type=int,
                        dest="nlayers",
                        help="Number of RNN layers",
                        default=NUM_LAYER)
    parser.add_argument('--dropout',
                        type=float,
                        help="Dropout",
                        default=DROPOUT)
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        dest="batch_size",
                        help="Batch size",
                        default=DEFAULT_BATCH_SIZE)
    parser.add_argument('--learning-rate',
                        type=float,
                        dest="learning_rate",
                        help="Initial learning rate",
                        default=0.1)
    parser.add_argument('--learning-rate-decay',
                        type=float,
                        dest="learning_rate_decay",
                        help="Learning rate decay",
                        default=0.5)
    parser.add_argument(
        '--epochs',
        type=int,
        default=10,
        help="Start decaying every epoch after and including this epoch.")
    parser.add_argument(
        '--start-decay-at',
        dest="start_decay_at",
        type=int,
        default=3,
        help="Start decaying every epoch after and including this epoch.")
    parser.add_argument('--batches-per-print',
                        type=int,
                        dest="batches_per_print",
                        help="Number of batches per print",
                        default=100)
    parser.add_argument('-m',
                        '--model',
                        help="Path to the model file to load",
                        default=None)
    parser.add_argument('--data', help="train or test", default="train")
    cmd_args = parser.parse_args()

    src = data.Field(include_lengths=True, tokenize=list)
    tgt = data.Field(include_lengths=True, tokenize=list)
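    # tokenize=list above splits every line into individual characters, so both
    # fields operate at the character level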

    mt_train = datasets.TranslationDataset(path='data/%s' % cmd_args.data,
                                           exts=('.src', '.tgt'),
                                           fields=(src, tgt))
    mt_dev = datasets.TranslationDataset(path='data/dev',
                                         exts=('.src', '.tgt'),
                                         fields=(src, tgt))

    print("Building vocabularies..")
    src.build_vocab(mt_train)
    tgt.build_vocab(mt_train)

    print("Making batches..")
    # should the sort key sign be negative on GPU and positive on CPU?
    SIGN = -1 if CUDA_AVAILABLE else 1
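    # BucketIterator batches examples of similar length together, which keeps
    # padding per batch to a minimum; in this older torchtext API, device=-1
    # keeps tensors on the CPU and device=None selects the default GPU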
    train_iter = data.BucketIterator(dataset=mt_train,
                                     batch_size=cmd_args.batch_size,
                                     device=(None if CUDA_AVAILABLE else -1),
                                     repeat=False,
                                     sort_key=lambda x: len(x.src) * SIGN)
    dev_iter = data.BucketIterator(dataset=mt_dev,
                                   batch_size=cmd_args.batch_size,
                                   device=(None if CUDA_AVAILABLE else -1),
                                   repeat=False,
                                   train=False,
                                   sort_key=lambda x: len(x.src) * SIGN)

    print("Creating model..")
    from spacer import Spacer
    num_classes = len(tgt.vocab)
    padding_idx = tgt.vocab.stoi["<pad>"]
    model = Spacer(len(src.vocab),
                   num_classes,
                   cmd_args.embedding_size,
                   cmd_args.hidden_size,
                   cmd_args.nlayers,
                   cmd_args.dropout,
                   BIDIRECTIONAL,
                   padding_idx=padding_idx)
    if CUDA_AVAILABLE:
        model.cuda(0)

    if cmd_args.model:
        print("Loading model: {}".format(cmd_args.model))
        state_dict = torch.load(cmd_args.model)
        model.load_state_dict(state_dict)

    criterion = torch.nn.CrossEntropyLoss(ignore_index=padding_idx)
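    # ignore_index=padding_idx above keeps padded positions out of the loss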
    learning_rate = cmd_args.learning_rate

    loss_history = []
    for epoch in range(1, cmd_args.epochs + 1):
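        # after --start-decay-at, decay the learning rate whenever the last
        # validation loss got worse, then rebuild the SGD optimizer with the
        # current rate for this epoch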
        if epoch >= cmd_args.start_decay_at and len(
                loss_history) > 1 and loss_history[-1] > loss_history[-2]:
            learning_rate *= cmd_args.learning_rate_decay
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        train_losses = []
        correct_answer_count = 0
        total_question_count = 0
        total_processing_chars = 0
        start_time = time.time()
        for batch_idx, batch in enumerate(train_iter, 1):
            optimizer.zero_grad()

            inputs, src_length = batch.src
            y_ = model(inputs, src_length)
            y_ = y_.view(-1, num_classes)
            y = batch.trg[0]
            y = y.view(-1)
            loss = criterion(y_, y)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.data[0])
            _, prediction = torch.max(y_, dim=1)
            total_question_count += prediction.size()[0]
            correct_answer_count += (prediction == y).float().sum().data[0]
            total_processing_chars += torch.sum(src_length)
            if batch_idx % cmd_args.batches_per_print == 0:
                average_loss = np.mean(train_losses)
                end_time = time.time()
                cps = int(total_processing_chars / (end_time - start_time))

                print(
                    "{}-{}(BS: {}), TrainLoss: {:.4f}, Accuracy: {:.4f}, LR:{:.4f}, Time: {:.2f} s, Speed: {} chars/s"
                    .format(epoch, batch_idx, cmd_args.batch_size,
                            average_loss,
                            correct_answer_count / total_question_count,
                            learning_rate, end_time - start_time, cps))
                print("Sentence: {}".format("".join(
                    src.vocab.itos[x[0]] for x in batch.src[0].data)))
                prediction = prediction.view(-1, batch.batch_size)
                print("Prediction: {}".format("".join(
                    tgt.vocab.itos[x[0]] for x in prediction.data)))
                y = y.view(-1, batch.batch_size)
                print("Answer    : {}".format("".join(tgt.vocab.itos[x[0]]
                                                      for x in y.data)))

                train_losses = []
                correct_answer_count = 0
                total_question_count = 0
                total_processing_chars = 0
                start_time = end_time

        model.train(False)
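        # train(False) puts the model in evaluation mode (disables dropout) for the validation pass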
        cv_losses = []
        for cv_batch in dev_iter:
            inputs, src_length = cv_batch.src
            y_ = model(inputs, src_length)
            y_ = y_.view(-1, num_classes)
            y = cv_batch.trg[0]
            y = y.view(-1)
            loss = criterion(y_, y)
            cv_losses.append(loss.data[0])
        cv_average_loss = np.mean(cv_losses)
        loss_history.append(cv_average_loss)
        model.train(True)

        filename = "models/spacer_{:02d}_{:.4f}.pth".format(
            epoch, cv_average_loss)
        print("Saving a file: {}".format(filename))
        torch.save(model.state_dict(), filename)

        print("== Summary ==")
        for i, l in enumerate(loss_history, start=1):
            print("Epoch: {}, CV Loss: {}".format(i, l))
        print("")

    print('done')
Example #30
0
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

############################
# Load data
############################
print ("Loading data...")

PAD_WORD = '<blank>'
eval_batch_size = args.eval_batch_size

src = data.Field(pad_token=PAD_WORD)
trg = data.Field(pad_token=PAD_WORD)

train_data = datasets.TranslationDataset(path=args.data + '/train', exts=('.en', '.de'), fields=(src, trg))
val_data = datasets.TranslationDataset(path=args.data + '/valid', exts=('.en', '.de'), fields=(src, trg))
test_data = datasets.TranslationDataset(path=args.data + '/test', exts=('.en', '.de'), fields=(src, trg))

print ("DONE\n")

############################
# Load vocab
############################

print ("Loading vocab...")

vocab = dict(torch.load(args.dict_path, "text"))
v = vocab['tgt']
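# unknown tokens fall back to index 0, and the single target vocab is shared by
# the source and target fields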
v.stoi = defaultdict(lambda: 0, v.stoi)
src.vocab = v
trg.vocab = v