Example #1
def create_infer_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len

    test_ds = load_dataset('iwslt15', splits='test')
    src_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['en'])
    tgt_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['vi'])
    bos_id = src_vocab[src_vocab.bos_token]
    eos_id = src_vocab[src_vocab.eos_token]
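    # Reuse the eos id as the padding id.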
    pad_id = eos_id

    def convert_example(example):
        source = example['en'].split()
        target = example['vi'].split()

        source = src_vocab.to_indices(source)
        target = tgt_vocab.to_indices(target)

        return source, target

    test_ds.map(convert_example)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)

    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_infer_input,
                                                          bos_id=bos_id,
                                                          eos_id=eos_id,
                                                          pad_id=pad_id))
    return test_loader, len(src_vocab), len(tgt_vocab), bos_id, eos_id
Example #2
    def test_counter(self):
        token_to_idx = {'一万七千多': 1, '一万七千余': 2, '一万万': 3}
        vocab = Vocab(
            counter=self.counter, unk_token='[UNK]', token_to_idx=token_to_idx)
        self.check_output_equal(vocab.to_tokens(1), '一万七千多')
        self.check_output_equal(vocab.to_tokens(2), '一万七千余')
        self.check_output_equal(vocab.to_tokens(3), '一万万')
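A minimal, self-contained sketch of the pattern exercised by this test; the counter contents are invented, and token_to_idx pins the three tokens to the exact indices the assertions check:

from collections import Counter
from paddlenlp.data import Vocab

counter = Counter(['一万七千多', '一万七千多', '一万七千余', '一万万'])
vocab = Vocab(counter=counter, unk_token='[UNK]',
              token_to_idx={'一万七千多': 1, '一万七千余': 2, '一万万': 3})
print(vocab.to_tokens(1))  # 一万七千多
print(vocab['一万万'])       # 3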
Example #3
def init_lstm_var(args):
    if args.language == 'ch':
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab.char",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')
    else:
        vocab = Vocab.load_vocabulary("../task/similarity/simnet/vocab_QQP",
                                      unk_token='[UNK]',
                                      pad_token='[PAD]')

    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
    model = SimNet(network='lstm', vocab_size=len(vocab), num_classes=2)

    dev_ds = SimilarityData().read(os.path.join(args.data_dir, 'dev'))
    dev_examples = preprocess_data(dev_ds.data,
                                   tokenizer,
                                   language=args.language)
    batches = [
        dev_examples[idx:idx + args.batch_size]
        for idx in range(0, len(dev_examples), args.batch_size)
    ]

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # query_ids
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    return model, tokenizer, batches, batchify_fn, vocab, dev_ds
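A hedged consumption sketch for the objects returned above. It assumes each entry produced by preprocess_data is a (query_ids, title_ids, query_seq_len, title_seq_len) tuple, matching the Tuple(...) collate function, and that SimNet's forward accepts the four collated arrays:

# Sketch only; `args` and the sample layout are assumptions.
model, tokenizer, batches, batchify_fn, vocab, dev_ds = init_lstm_var(args)
query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn(batches[0])
logits = model(paddle.to_tensor(query_ids), paddle.to_tensor(title_ids),
               paddle.to_tensor(query_seq_lens), paddle.to_tensor(title_seq_lens))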
Example #4
    def __init__(self, args={}):
        super(TransformerReader, self).__init__()

        dataset = load_dataset('wmt14ende', splits='test')
        if not args.benchmark:
            self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
        else:
            self.vocab = Vocab.load_vocabulary(
                **dataset.vocab_info["benchmark"])
        self.src_vocab = self.trg_vocab = self.vocab

        def convert_samples(samples):
            source = []
            for sample in samples:
                src = sample.split()
                source.append(self.src_vocab.to_indices(src))

            return source

        self.tokenize = convert_samples
        self.to_tokens = self.trg_vocab.to_tokens
        self.feed_keys = ["src_word"]
        self.bos_idx = args.bos_idx
        self.eos_idx = args.eos_idx
        self.pad_idx = args.bos_idx
        self.pad_seq = args.pad_seq
        self.word_pad = Pad(self.pad_idx)
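A hedged usage sketch for the reader above; the sentences are placeholders, and appending eos before padding follows the usual Transformer inference convention rather than anything shown here:

reader = TransformerReader(args)  # `args` is assumed to carry bos/eos/pad settings
src_ids = reader.tokenize(["Das ist ein Test .", "Hallo Welt ."])
src_word = reader.word_pad([ids + [reader.eos_idx] for ids in src_ids])  # pad to equal length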
Example #5
    def test_json(self):
        token_to_idx = {'一万七千多': 1, '一万七千余': 2, '一万万': 3}
        vocab = Vocab(
            counter=self.counter, unk_token='[UNK]', token_to_idx=token_to_idx)
        json_str = vocab.to_json()
        copied_vocab = Vocab.from_json(json_str)
        for key, value in copied_vocab.token_to_idx.items():
            self.check_output_equal(value, vocab[key])
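A self-contained sketch of the same round trip that also writes the JSON to disk, since from_json accepts either a JSON string or a file path (the counter contents are invented):

from collections import Counter
from paddlenlp.data import Vocab

vocab = Vocab(counter=Counter(['一万七千多', '一万七千余', '一万万']), unk_token='[UNK]')
with open('vocab.json', 'w', encoding='utf-8') as f:
    f.write(vocab.to_json())                  # serialize to a JSON string
restored = Vocab.from_json('vocab.json')      # reload from the file path
assert restored.token_to_idx == vocab.token_to_idx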
Example #6
def load_vocab(vocab_dir):
    """load vocabs"""
    word_vocab = Vocab.from_json(os.path.join(vocab_dir, "word_vocab.json"))
    rel_vocab = Vocab.from_json(os.path.join(vocab_dir, "rel_vocab.json"))
    feat_vocab_path = os.path.join(vocab_dir, "feat_vocab.json")
    if os.path.exists(feat_vocab_path):
        feat_vocab = Vocab.from_json(feat_vocab_path)
    else:
        feat_vocab = None
    return word_vocab, feat_vocab, rel_vocab
Example #7
def create_data_loader(args, places=None):
    datasets = load_dataset('wmt14ende', splits=('train', 'dev'))
    if not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(
            **datasets[0].vocab_info["benchmark"])
    trg_vocab = src_vocab

    # Round the vocabulary size up to the nearest multiple of pad_factor so the
    # embedding and output-projection dimensions stay hardware friendly.
    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    data_loaders = [None] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))
        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=(i == 0),
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)

        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx,
                                                    pad_seq=args.pad_seq),
                                 num_workers=0)
        data_loaders[i] = data_loader
    return data_loaders
Example #8
    def __init__(self,
                 max_length: int = 256,
                 max_out_len: int = 256,
                 beam_size: int = 5):
        super(MTTransformer, self).__init__()
        bpe_codes_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', '2M.zh2en.dict4bpe.zh')
        src_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.zh')
        trg_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en',
                                      'assets', 'vocab.en')
        checkpoint = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  'transformer.pdparams')

        self.max_length = max_length
        self.beam_size = beam_size
        self.tokenizer = MTTokenizer(bpe_codes_file=bpe_codes_file,
                                     lang_src=self.lang_config['source'],
                                     lang_trg=self.lang_config['target'])
        self.src_vocab = Vocab.load_vocabulary(
            filepath=src_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.trg_vocab = Vocab.load_vocabulary(
            filepath=trg_vocab_file,
            unk_token=self.vocab_config['unk_token'],
            bos_token=self.vocab_config['bos_token'],
            eos_token=self.vocab_config['eos_token'])
        self.src_vocab_size = (len(self.src_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.trg_vocab_size = (len(self.trg_vocab) + self.vocab_config['pad_factor'] - 1) \
            // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
        self.transformer = InferTransformerModel(
            src_vocab_size=self.src_vocab_size,
            trg_vocab_size=self.trg_vocab_size,
            bos_id=self.vocab_config['bos_id'],
            eos_id=self.vocab_config['eos_id'],
            max_length=self.max_length + 1,
            max_out_len=max_out_len,
            beam_size=self.beam_size,
            **self.model_config)

        state_dict = paddle.load(checkpoint)

        # Regenerate the position encoding table for this module's max_length,
        # which may differ from the length used during training
        state_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])
        state_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            self.max_length + 1, self.model_config['d_model'])

        self.transformer.set_state_dict(state_dict)
Example #9
def create_infer_loader(args):
    if args.test_file is not None:
        dataset = load_dataset('wmt14ende',
                               data_files=[args.test_file],
                               splits=['test'])
    else:
        dataset = load_dataset('wmt14ende', splits='test')

    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    elif not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx,
                                                pad_seq=args.pad_seq,
                                                dtype=args.input_dtype),
                             num_workers=args.num_workers,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
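A hedged decoding sketch: the second return value is the target vocab's to_tokens, which maps predicted ids back to BPE tokens; the id list below is invented, and stripping the "@@ " separators is the usual subword-nmt convention for this dataset:

test_loader, to_tokens = create_infer_loader(args)
ids = [132, 87, 6, 1]                         # hypothetical model output
tokens = to_tokens(ids)                       # ids -> BPE tokens
print(" ".join(tokens).replace("@@ ", ""))    # undo BPE for readable text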
Example #10
def create_data_loader(batch_size, num_steps, data_path):
    train_ds, valid_ds, test_ds = load_dataset('ptb',
                                               splits=('train', 'valid',
                                                       'test'))

    train_examples = [
        train_ds[i]['sentence'].split() for i in range(len(train_ds))
    ]
    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')

    # Because the sentences in the PTB dataset are consecutive, we concatenate
    # all texts in the dataset and fold them into chunks whose number of rows
    # equals the batch size. For example:
    #
    #   Sentence1: we're talking about years ago before anyone heard of asbestos having
    #              any questionable properties.
    #   Sentence2: there is no asbestos in our products now.
    #   Batch_size: 5
    #   Grouped_text: [["we're", "talking", "about", "years"],
    #                  ["ago", "before", "anyone", "heard"],
    #                  ["of", "asbestos", "having", "any"],
    #                  ["questionable", "properties", "there", "is"],
    #                  ["no", "asbestos", "in", "our"]]
    #
    def group_texts(examples):
        concat_examples = []
        for example in examples:
            concat_examples += example['sentence'].split() + ['</eos>']

        concat_examples = vocab.to_indices(concat_examples)

        max_seq_len = len(concat_examples) // batch_size
        reshaped_examples = np.asarray(concat_examples[0:batch_size *
                                                       max_seq_len],
                                       dtype='int64').reshape(
                                           (batch_size, max_seq_len))
        encoded_examples = []
        for i in range(max_seq_len // num_steps):
            encoded_examples.append(
                (np.copy(reshaped_examples[:,
                                           i * num_steps:(i + 1) * num_steps]),
                 np.copy(reshaped_examples[:, i * num_steps +
                                           1:(i + 1) * num_steps + 1])))

        return encoded_examples

    train_ds.map(group_texts, batched=True)
    valid_ds.map(group_texts, batched=True)
    test_ds.map(group_texts, batched=True)

    train_loader = paddle.io.DataLoader(train_ds,
                                        return_list=True,
                                        batch_size=None)
    valid_loader = paddle.io.DataLoader(valid_ds,
                                        return_list=True,
                                        batch_size=None)
    test_loader = paddle.io.DataLoader(test_ds,
                                       return_list=True,
                                       batch_size=None)
    return train_loader, valid_loader, test_loader, len(vocab)
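Because batching already happens inside group_texts, the loaders are built with batch_size=None and each item they yield is a ready-made (inputs, labels) pair of shape [batch_size, num_steps]. A minimal consumption sketch (the hyperparameter values are assumed):

train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
    batch_size=20, num_steps=35, data_path=None)
for inputs, labels in train_loader:
    # inputs and labels are int64 tensors of shape [batch_size, num_steps]
    break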
Example #11
def main():
    # Load vocab.
    vocab = Vocab.load_vocabulary(args.vocab_path)
    label_map = {0: 'negative', 1: 'positive'}

    # Construct the network.
    model = ppnlp.models.Senta(
        network=args.network, vocab_size=len(vocab), num_classes=len(label_map))

    # Load model parameters.
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    model.eval()

    inputs = [paddle.static.InputSpec(shape=[None, None], dtype="int64")]
    # Convert to static graph with specific input description
    if args.network in [
            "lstm", "bilstm", "gru", "bigru", "rnn", "birnn", "bilstm_attn"
    ]:
        inputs.append(paddle.static.InputSpec(
            shape=[None], dtype="int64"))  # seq_len

    model = paddle.jit.to_static(model, input_spec=inputs)
    # Save in static graph model.
    paddle.jit.save(model, args.output_path)
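As a hedged follow-up, the exported static-graph model can later be reloaded from the same prefix for inference:

loaded_model = paddle.jit.load(args.output_path)  # path assumed to match the save above
loaded_model.eval()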
Example #12
    def __init__(self,
                 sentencepiece_model_file,
                 do_lower_case=True,
                 encoding="utf8",
                 unk_token="<unk>",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):

        if not os.path.isfile(sentencepiece_model_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(sentencepiece_model_file))
        self.encoding = encoding
        mod = try_import('sentencepiece')
        self.sp_model = mod.SentencePieceProcessor()
        if os.path.isfile(sentencepiece_model_file):
            self.sp_model.Load(sentencepiece_model_file)
        vocab_dict = {}
        for id in range(self.sp_model.get_piece_size()):
            vocab_dict[self.sp_model.id_to_piece(id)] = id
        self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
        self.start_word_tokens = np.array([
            self.vocab._idx_to_token[i][0] == "▁"
            for i in range(0, len(self.vocab))
        ])
        self.unk_token = unk_token
        self.mask_id = vocab_dict[mask_token]
        self.unk_id = vocab_dict[unk_token]
        self.cls_id = vocab_dict[cls_token]
        self.sep_id = vocab_dict[sep_token]
        self.pad_id = vocab_dict[pad_token] if pad_token in vocab_dict else 0

        unk_token = AddedToken(unk_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   unk_token, str) else unk_token
        pad_token = AddedToken(pad_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   pad_token, str) else pad_token
        cls_token = AddedToken(cls_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   cls_token, str) else cls_token
        sep_token = AddedToken(sep_token,
                               lstrip=False, rstrip=False) if isinstance(
                                   sep_token, str) else sep_token

        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token,
                                lstrip=True, rstrip=False) if isinstance(
                                    mask_token, str) else mask_token

        self._build_special_tokens_map_extended(sep_token=sep_token,
                                                cls_token=cls_token,
                                                unk_token=unk_token,
                                                pad_token=pad_token,
                                                mask_token=mask_token)
Example #13
def create_infer_loader(args):
    dataset = load_dataset('wmt14ende', splits='test')
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx),
                             num_workers=0,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
Example #14
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
Example #15
    def __init__(self,
                 sentencepiece_model_file,
                 do_lower_case=True,
                 encoding="utf8",
                 unk_token="<unk>",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]"):

        if not os.path.isfile(sentencepiece_model_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(sentencepiece_model_file))
        self.encoding = encoding
        mod = try_import('sentencepiece')
        self.sp_model = mod.SentencePieceProcessor()
        if os.path.isfile(sentencepiece_model_file):
            self.sp_model.Load(sentencepiece_model_file)
        vocab_dict = {}
        for id in range(self.sp_model.get_piece_size()):
            vocab_dict[self.sp_model.id_to_piece(id)] = id
        self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
        self.start_word_tokens = np.array([
            self.vocab._idx_to_token[i][0] == "▁"
            for i in range(0, len(self.vocab))
        ])
        self.unk_token = unk_token
        self.mask_id = vocab_dict[mask_token]
        self.unk_id = vocab_dict[unk_token]
        self.cls_id = vocab_dict[cls_token]
        self.sep_id = vocab_dict[sep_token]
Example #16
def main():
    # Load vocab.
    if not os.path.exists(args.vocab_path):
        raise RuntimeError('The vocab file cannot be found at path %s' %
                           args.vocab_path)

    vocab = Vocab.load_vocabulary(args.vocab_path)
    label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # Construct the network.
    vocab_size = len(vocab)
    num_classes = len(label_map)
    pad_token_id = vocab.to_indices('[PAD]')

    model = TextCNNModel(vocab_size,
                         num_classes,
                         padding_idx=pad_token_id,
                         ngram_filter_sizes=(1, 2, 3))

    # Load model parameters.
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    model.eval()

    inputs = [paddle.static.InputSpec(shape=[None, None], dtype="int64")]

    model = paddle.jit.to_static(model, input_spec=inputs)
    # Save in static graph model.
    paddle.jit.save(model, args.output_path)
Example #17
def create_infer_loader(args, use_all_vocab=False):
    data_files = None
    if args.root != "None" and os.path.exists(args.root):
        data_files = {
            'test': (os.path.join(args.root, "newstest2014.tok.bpe.33708.en"),
                     os.path.join(args.root, "newstest2014.tok.bpe.33708.de"))
        }

    dataset = load_dataset('wmt14ende', data_files=data_files, splits='test')
    if use_all_vocab:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor
    )
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=partial(
            prepare_infer_input,
            bos_idx=args.bos_idx,
            eos_idx=args.eos_idx,
            pad_idx=args.bos_idx,
            pad_seq=args.pad_seq),
        num_workers=0,
        return_list=True)
    return data_loader, trg_vocab.to_tokens
Example #18
def create_train_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len

    train_ds, dev_ds = load_dataset('iwslt15', splits=('train', 'dev'))
    src_vocab = Vocab.load_vocabulary(**train_ds.vocab_info['en'])
    tgt_vocab = Vocab.load_vocabulary(**train_ds.vocab_info['vi'])
    bos_id = src_vocab[src_vocab.bos_token]
    eos_id = src_vocab[src_vocab.eos_token]
    pad_id = eos_id

    def convert_example(example):
        source = example['en'].split()[:max_len]
        target = example['vi'].split()[:max_len]

        source = src_vocab.to_indices(source)
        target = tgt_vocab.to_indices(target)

        return source, target

    # Bucketing key: the length of the source-side token ids of sample x.
    key = (lambda x, data_source: len(data_source[x][0]))

    # Truncate and convert example to ids
    train_ds = train_ds.map(convert_example, lazy=False)
    dev_ds = dev_ds.map(convert_example, lazy=False)

    # Shuffle, then sort by source length within a buffer of batch_size * 20
    # examples so that similarly sized sequences share a batch (less padding).
    train_batch_sampler = SamplerHelper(train_ds).shuffle().sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)

    dev_batch_sampler = SamplerHelper(dev_ds).sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)

    train_loader = paddle.io.DataLoader(train_ds,
                                        batch_sampler=train_batch_sampler,
                                        collate_fn=partial(prepare_train_input,
                                                           bos_id=bos_id,
                                                           eos_id=eos_id,
                                                           pad_id=pad_id))

    dev_loader = paddle.io.DataLoader(dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=partial(prepare_train_input,
                                                         bos_id=bos_id,
                                                         eos_id=eos_id,
                                                         pad_id=pad_id))

    return train_loader, dev_loader, len(src_vocab), len(tgt_vocab), pad_id
Example #19
    def __init__(self,
                 embedding_name=EMBEDDING_NAME_LIST[0],
                 unknown_token=UNK_TOKEN,
                 unknown_token_vector=None,
                 extended_vocab_path=None,
                 trainable=True,
                 keep_extended_vocab_only=False):
        vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
        if not osp.exists(vector_path):
            # download
            url = EMBEDDING_URL_ROOT + "/" + embedding_name + ".tar.gz"
            get_path_from_url(url, EMBEDDING_HOME)

        logger.info("Loading token embedding...")
        vector_np = np.load(vector_path)
        self.embedding_dim = vector_np['embedding'].shape[1]
        self.unknown_token = unknown_token
        if unknown_token_vector is not None:
            unk_vector = np.array(unknown_token_vector).astype(
                paddle.get_default_dtype())
        else:
            unk_vector = np.random.normal(scale=0.02,
                                          size=self.embedding_dim).astype(
                                              paddle.get_default_dtype())
        pad_vector = np.array([0] * self.embedding_dim).astype(
            paddle.get_default_dtype())
        if extended_vocab_path is not None:
            embedding_table = self._extend_vocab(extended_vocab_path,
                                                 vector_np, pad_vector,
                                                 unk_vector,
                                                 keep_extended_vocab_only)
            trainable = True
        else:
            embedding_table = self._init_without_extend_vocab(
                vector_np, pad_vector, unk_vector)

        self.vocab = Vocab.from_dict(self._word_to_idx,
                                     unk_token=unknown_token,
                                     pad_token=PAD_TOKEN)
        self.num_embeddings = embedding_table.shape[0]
        # Initialize the underlying nn.Embedding layer.
        super(TokenEmbedding,
              self).__init__(self.num_embeddings,
                             self.embedding_dim,
                             padding_idx=self._word_to_idx[PAD_TOKEN])
        self.weight.set_value(embedding_table)
        self.set_trainable(trainable)
        logger.info("Finish loading embedding vector.")
        s = ("Token Embedding info:"
             "\nUnknown index: {}"
             "\nUnknown token: {}"
             "\nPadding index: {}"
             "\nPadding token: {}"
             "\nShape: {}").format(self._word_to_idx[self.unknown_token],
                                   self.unknown_token,
                                   self._word_to_idx[PAD_TOKEN], PAD_TOKEN,
                                   self.weight.shape)
        logger.info(s)
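A hedged usage sketch for a TokenEmbedding built this way; the embedding name is one of PaddleNLP's pretrained vector sets and is assumed to be downloadable in your environment:

from paddlenlp.embeddings import TokenEmbedding

token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
vector = token_embedding.search("中国")          # look up one token's vector
idx = token_embedding.vocab.to_indices("中国")   # the Vocab built in __init__ above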
Example #20
def create_infer_loader(args):
    dataset = load_dataset(read,
                           src_path=args.predict_file,
                           tgt_path=None,
                           is_predict=True,
                           lazy=False)

    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])
    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])

    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample['src'].split()
        target = sample['tgt'].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx),
                             num_workers=2,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
Example #21
def adapt_vocab_size(args):
    dataset = load_dataset('wmt14ende', splits='test')
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
Example #22
def create_infer_loader(args, places=None):
    data_files = {
        'test': args.predict_file,
    }
    dataset = load_dataset(read,
                           src_tgt_file=data_files['test'],
                           only_src=True,
                           lazy=False)

    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])

    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])

    args.src_vocab_size = len(src_vocab)
    args.trg_vocab_size = len(trg_vocab)

    def convert_samples(sample):
        source = [item.strip() for item in sample['src'].split()]
        source = src_vocab.to_indices(source) + [args.eos_idx]
        target = [args.bos_idx]
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(batch_size=args.batch_size,
                                                 drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             places=places,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                pad_idx=args.bos_idx),
                             num_workers=0,
                             return_list=True)

    return data_loader, trg_vocab.to_tokens
Example #23
def main():
    args = parse_args()

    predictor = Predictor.create_predictor(args)
    test_loader, src_vocab_size, tgt_vocab_size, bos_id, eos_id = create_infer_loader(
        args)
    tgt_vocab = Vocab.load_vocabulary(**test_loader.dataset.vocab_info['vi'])
    trg_idx2word = tgt_vocab.idx_to_token

    predictor.predict(test_loader, args.infer_output_file, trg_idx2word,
                      bos_id, eos_id)
Example #24
def adapt_vocab_size(args):
    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    else:
        dataset = load_dataset('wmt14ende', splits='test')
        if not args.benchmark:
            src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
        else:
            src_vocab = Vocab.load_vocabulary(
                **dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x:
        (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
Example #25
    def __init__(self, args, is_chinese):
        bpe_parser = subword_nmt.create_apply_bpe_parser()
        bpe_args = bpe_parser.parse_args(args=['-c', args.src_bpe_dict])
        self.bpe = subword_nmt.BPE(bpe_args.codes, bpe_args.merges,
                                   bpe_args.separator, None,
                                   bpe_args.glossaries)
        self.is_chinese = is_chinese

        self.src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                               bos_token=args.special_token[0],
                                               eos_token=args.special_token[1],
                                               unk_token=args.special_token[2])

        self.trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                               bos_token=args.special_token[0],
                                               eos_token=args.special_token[1],
                                               unk_token=args.special_token[2])

        args.src_vocab_size = len(self.src_vocab)
        args.trg_vocab_size = len(self.trg_vocab)
        self.args = args
Example #26
def create_infer_loader(batch_size=128):
    test_ds = load_dataset('couplet', splits='test')
    vocab = Vocab.load_vocabulary(**test_ds.vocab_info)
    pad_id = vocab[vocab.eos_token]
    trans_func = partial(convert_example, vocab=vocab)
    test_ds = test_ds.map(trans_func, lazy=False)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)

    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_input,
                                                          pad_id=pad_id))
    return test_loader, vocab
Example #27
def do_predict(args):
    device = paddle.set_device(args.device)

    test_loader, src_vocab_size, tgt_vocab_size, bos_id, eos_id = create_infer_loader(
        args)
    tgt_vocab = Vocab.load_vocabulary(**test_loader.dataset.vocab_info['vi'])

    trg_idx2word = tgt_vocab.idx_to_token

    model = paddle.Model(
        Seq2SeqAttnInferModel(src_vocab_size,
                              tgt_vocab_size,
                              args.hidden_size,
                              args.hidden_size,
                              args.num_layers,
                              args.dropout,
                              bos_id=bos_id,
                              eos_id=eos_id,
                              beam_size=args.beam_size,
                              max_out_len=256))

    model.prepare()

    # Load the trained model
    assert args.init_from_ckpt, (
        "Please set init_from_ckpt to load the inference model.")
    model.load(args.init_from_ckpt)

    cand_list = []
    with io.open(args.infer_output_file, 'w', encoding='utf-8') as f:
        for data in test_loader():
            with paddle.no_grad():
                finished_seq = model.predict_batch(inputs=data)[0]
            finished_seq = finished_seq[:, :, np.newaxis] if len(
                finished_seq.shape) == 2 else finished_seq
            finished_seq = np.transpose(finished_seq, [0, 2, 1])
            for ins in finished_seq:
                for beam_idx, beam in enumerate(ins):
                    id_list = post_process_seq(beam, bos_id, eos_id)
                    word_list = [trg_idx2word[id] for id in id_list]
                    sequence = " ".join(word_list) + "\n"
                    f.write(sequence)
                    cand_list.append(word_list)
                    break

    bleu = BLEU()
    for i, data in enumerate(test_loader.dataset.data):
        ref = data['vi'].split()
        bleu.add_inst(cand_list[i], [ref])
    print("BLEU score is %s." % bleu.score())
Example #28
    def __init__(self,
                 bpe_codes_fpath,
                 src_vocab_fpath,
                 trg_vocab_fpath,
                 special_token=["<s>", "<e>", "<unk>"]):
        bpe_parser = subword_nmt.create_apply_bpe_parser()
        bpe_args = bpe_parser.parse_args(args=['-c', bpe_codes_fpath])
        self.bpe = subword_nmt.BPE(bpe_args.codes, bpe_args.merges,
                                   bpe_args.separator, None,
                                   bpe_args.glossaries)

        self.src_vocab = Vocab.load_vocabulary(src_vocab_fpath,
                                               bos_token=special_token[0],
                                               eos_token=special_token[1],
                                               unk_token=special_token[2])

        self.trg_vocab = Vocab.load_vocabulary(trg_vocab_fpath,
                                               bos_token=special_token[0],
                                               eos_token=special_token[1],
                                               unk_token=special_token[2])

        self.src_vocab_size = len(self.src_vocab)
        self.trg_vocab_size = len(self.trg_vocab)
Example #29
    def get_vocab(cls, root=None):
        """
        Load vocab from vocab files. If the vocab files don't exist, they will
        be downloaded.

        Args:
            root (str, optional): Data directory of the dataset. If not provided,
                the dataset will be saved in `~/.paddlenlp/datasets/machine_translation`.
                If provided, an md5 check is performed, and the dataset is
                downloaded to the default directory if the check fails. Default: None.
        Returns:
            tuple: Source vocab and target vocab.

        Examples:
            .. code-block:: python

                from paddlenlp.datasets import IWSLT15
                (src_vocab, tgt_vocab) = IWSLT15.get_vocab()
        """

        root = cls._download_data(root=root)
        src_vocab_filename, tgt_vocab_filename, _, _ = cls.VOCAB_INFO
        src_file_path = os.path.join(root, src_vocab_filename)
        tgt_file_path = os.path.join(root, tgt_vocab_filename)

        src_vocab = Vocab.load_vocabulary(filepath=src_file_path,
                                          unk_token=cls.UNK_TOKEN,
                                          pad_token=cls.PAD_TOKEN,
                                          bos_token=cls.BOS_TOKEN,
                                          eos_token=cls.EOS_TOKEN)

        tgt_vocab = Vocab.load_vocabulary(filepath=tgt_file_path,
                                          unk_token=cls.UNK_TOKEN,
                                          pad_token=cls.PAD_TOKEN,
                                          bos_token=cls.BOS_TOKEN,
                                          eos_token=cls.EOS_TOKEN)
        return (src_vocab, tgt_vocab)
Example #30
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue',
                                        task_name,
                                        splits=["train", "dev"])
    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']

    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader