def create_infer_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len
    test_ds = load_dataset('iwslt15', splits='test')
    src_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['en'])
    tgt_vocab = Vocab.load_vocabulary(**test_ds.vocab_info['vi'])
    bos_id = src_vocab[src_vocab.bos_token]
    eos_id = src_vocab[src_vocab.eos_token]
    pad_id = eos_id

    def convert_example(example):
        source = example['en'].split()
        target = example['vi'].split()
        source = src_vocab.to_indices(source)
        target = tgt_vocab.to_indices(target)
        return source, target

    test_ds.map(convert_example)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)
    test_loader = paddle.io.DataLoader(
        test_ds,
        batch_sampler=test_batch_sampler,
        collate_fn=partial(
            prepare_infer_input, bos_id=bos_id, eos_id=eos_id, pad_id=pad_id))
    return test_loader, len(src_vocab), len(tgt_vocab), bos_id, eos_id
def test_counter(self):
    token_to_idx = {'一万七千多': 1, '一万七千余': 2, '一万万': 3}
    vocab = Vocab(
        counter=self.counter, unk_token='[UNK]', token_to_idx=token_to_idx)
    self.check_output_equal(vocab.to_tokens(1), '一万七千多')
    self.check_output_equal(vocab.to_tokens(2), '一万七千余')
    self.check_output_equal(vocab.to_tokens(3), '一万万')
def init_lstm_var(args):
    if args.language == 'ch':
        vocab = Vocab.load_vocabulary(
            "../task/similarity/simnet/vocab.char",
            unk_token='[UNK]',
            pad_token='[PAD]')
    else:
        vocab = Vocab.load_vocabulary(
            "../task/similarity/simnet/vocab_QQP",
            unk_token='[UNK]',
            pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
    model = SimNet(network='lstm', vocab_size=len(vocab), num_classes=2)

    dev_ds = SimilarityData().read(os.path.join(args.data_dir, 'dev'))
    dev_examples = preprocess_data(
        dev_ds.data, tokenizer, language=args.language)
    batches = [
        dev_examples[idx:idx + args.batch_size]
        for idx in range(0, len(dev_examples), args.batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # query_ids
        Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)),  # title_ids
        Stack(dtype="int64"),  # query_seq_lens
        Stack(dtype="int64"),  # title_seq_lens
    ): [data for data in fn(samples)]

    return model, tokenizer, batches, batchify_fn, vocab, dev_ds
def __init__(self, args={}):
    super(TransformerReader, self).__init__()
    dataset = load_dataset('wmt14ende', splits=('test'))
    if not args.benchmark:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        self.vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    self.src_vocab = self.trg_vocab = self.vocab

    def convert_samples(samples):
        source = []
        for sample in samples:
            src = sample.split()
            source.append(self.src_vocab.to_indices(src))
        return source

    self.tokenize = convert_samples
    self.to_tokens = self.trg_vocab.to_tokens
    self.feed_keys = ["src_word"]
    self.bos_idx = args.bos_idx
    self.eos_idx = args.eos_idx
    self.pad_idx = args.bos_idx
    self.pad_seq = args.pad_seq
    self.word_pad = Pad(self.pad_idx)
def test_json(self):
    token_to_idx = {'一万七千多': 1, '一万七千余': 2, '一万万': 3}
    vocab = Vocab(
        counter=self.counter, unk_token='[UNK]', token_to_idx=token_to_idx)
    json_str = vocab.to_json()
    copied_vocab = Vocab.from_json(json_str)
    for key, value in copied_vocab.token_to_idx.items():
        self.check_output_equal(value, vocab[key])
def load_vocab(vocab_dir):
    """load vocabs"""
    word_vocab = Vocab.from_json(os.path.join(vocab_dir, "word_vocab.json"))
    rel_vocab = Vocab.from_json(os.path.join(vocab_dir, "rel_vocab.json"))

    feat_vocab_path = os.path.join(vocab_dir, "feat_vocab.json")
    if os.path.exists(feat_vocab_path):
        feat_vocab = Vocab.from_json(feat_vocab_path)
    else:
        feat_vocab = None
    return word_vocab, feat_vocab, rel_vocab
def create_data_loader(args, places=None):
    datasets = load_dataset('wmt14ende', splits=('train', 'dev'))
    if not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    data_loaders = [None] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))

        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)

        data_loader = DataLoader(
            dataset=dataset,
            places=places,
            batch_sampler=batch_sampler,
            collate_fn=partial(
                prepare_train_input,
                bos_idx=args.bos_idx,
                eos_idx=args.eos_idx,
                pad_idx=args.bos_idx,
                pad_seq=args.pad_seq),
            num_workers=0)
        data_loaders[i] = data_loader
    return data_loaders
def __init__(self,
             max_length: int = 256,
             max_out_len: int = 256,
             beam_size: int = 5):
    super(MTTransformer, self).__init__()
    bpe_codes_file = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  '2M.zh2en.dict4bpe.zh')
    src_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  'vocab.zh')
    trg_vocab_file = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                                  'vocab.en')
    checkpoint = os.path.join(MODULE_HOME, 'transformer_zh_en', 'assets',
                              'transformer.pdparams')

    self.max_length = max_length
    self.beam_size = beam_size
    self.tokenizer = MTTokenizer(bpe_codes_file=bpe_codes_file,
                                 lang_src=self.lang_config['source'],
                                 lang_trg=self.lang_config['target'])
    self.src_vocab = Vocab.load_vocabulary(
        filepath=src_vocab_file,
        unk_token=self.vocab_config['unk_token'],
        bos_token=self.vocab_config['bos_token'],
        eos_token=self.vocab_config['eos_token'])
    self.trg_vocab = Vocab.load_vocabulary(
        filepath=trg_vocab_file,
        unk_token=self.vocab_config['unk_token'],
        bos_token=self.vocab_config['bos_token'],
        eos_token=self.vocab_config['eos_token'])
    self.src_vocab_size = (len(self.src_vocab) + self.vocab_config['pad_factor'] - 1) \
        // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
    self.trg_vocab_size = (len(self.trg_vocab) + self.vocab_config['pad_factor'] - 1) \
        // self.vocab_config['pad_factor'] * self.vocab_config['pad_factor']
    self.transformer = InferTransformerModel(
        src_vocab_size=self.src_vocab_size,
        trg_vocab_size=self.trg_vocab_size,
        bos_id=self.vocab_config['bos_id'],
        eos_id=self.vocab_config['eos_id'],
        max_length=self.max_length + 1,
        max_out_len=max_out_len,
        beam_size=self.beam_size,
        **self.model_config)

    state_dict = paddle.load(checkpoint)

    # To avoid exceeding the sequence length seen in training, reset the
    # position encoding table to max_length + 1.
    state_dict["encoder.pos_encoder.weight"] = position_encoding_init(
        self.max_length + 1, self.model_config['d_model'])
    state_dict["decoder.pos_encoder.weight"] = position_encoding_init(
        self.max_length + 1, self.model_config['d_model'])

    self.transformer.set_state_dict(state_dict)
def create_infer_loader(args):
    if args.test_file is not None:
        dataset = load_dataset('wmt14ende',
                               data_files=[args.test_file],
                               splits=['test'])
    else:
        dataset = load_dataset('wmt14ende', splits=('test'))
    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    elif not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx,
                                                pad_seq=args.pad_seq,
                                                dtype=args.input_dtype),
                             num_workers=args.num_workers,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
def create_data_loader(batch_size, num_steps, data_path):
    train_ds, valid_ds, test_ds = load_dataset(
        'ptb', splits=('train', 'valid', 'test'))

    train_examples = [
        train_ds[i]['sentence'].split() for i in range(len(train_ds))
    ]
    vocab = Vocab.build_vocab(train_examples, eos_token='</eos>')

    # Because the sentences in PTB dataset might be consecutive, we need to concatenate
    # all texts from our dataset and fold them into chunks while the number of rows is
    # equal to batch size. For example:
    #
    #   Sentence1: we're talking about years ago before anyone heard of asbestos having
    #              any questionable properties.
    #   Sentence2: there is no asbestos in our products now.
    #   Batch_size: 5
    #   Grouped_text: [["we're", "talking", "about", "years"],
    #                  ["ago", "before", "anyone", "heard"],
    #                  ["of", "asbestos", "having", "any"],
    #                  ["questionable", "properties", "there", "is"],
    #                  ["no", "asbestos", "in", "our"]]
    #
    def group_texts(examples):
        concat_examples = []
        for example in examples:
            concat_examples += example['sentence'].split() + ['</eos>']

        concat_examples = vocab.to_indices(concat_examples)

        max_seq_len = len(concat_examples) // batch_size
        reshaped_examples = np.asarray(
            concat_examples[0:batch_size * max_seq_len],
            dtype='int64').reshape((batch_size, max_seq_len))

        encoded_examples = []
        for i in range(max_seq_len // num_steps):
            encoded_examples.append(
                (np.copy(reshaped_examples[:, i * num_steps:(i + 1) * num_steps]),
                 np.copy(reshaped_examples[:, i * num_steps + 1:(i + 1) *
                                           num_steps + 1])))

        return encoded_examples

    train_ds.map(group_texts, batched=True)
    valid_ds.map(group_texts, batched=True)
    test_ds.map(group_texts, batched=True)

    train_loader = paddle.io.DataLoader(train_ds,
                                        return_list=True,
                                        batch_size=None)
    valid_loader = paddle.io.DataLoader(valid_ds,
                                        return_list=True,
                                        batch_size=None)
    test_loader = paddle.io.DataLoader(test_ds,
                                       return_list=True,
                                       batch_size=None)
    return train_loader, valid_loader, test_loader, len(vocab)
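# Illustrative usage sketch (not part of the original snippet): consume the PTB
# loaders built by create_data_loader above. The batch_size/num_steps values are
# assumptions chosen only for illustration; data_path is accepted but unused by
# the function above. Each yielded batch is an (inputs, labels) pair of int64
# tensors of shape [batch_size, num_steps], where labels are the inputs shifted
# right by one token.
train_loader, valid_loader, test_loader, vocab_size = create_data_loader(
    batch_size=20, num_steps=35, data_path=None)
for inputs, labels in train_loader:
    print(inputs.shape, labels.shape)  # both roughly [20, 35]
    break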
def main():
    # Load vocab.
    vocab = Vocab.load_vocabulary(args.vocab_path)
    label_map = {0: 'negative', 1: 'positive'}

    # Construct the network.
    model = ppnlp.models.Senta(network=args.network,
                               vocab_size=len(vocab),
                               num_classes=len(label_map))

    # Load model parameters.
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    model.eval()

    inputs = [paddle.static.InputSpec(shape=[None, None], dtype="int64")]
    # Convert to static graph with specific input description.
    if args.network in [
            "lstm", "bilstm", "gru", "bigru", "rnn", "birnn", "bilstm_attn"
    ]:
        inputs.append(paddle.static.InputSpec(shape=[None],
                                              dtype="int64"))  # seq_len

    model = paddle.jit.to_static(model, input_spec=inputs)
    # Save in static graph model.
    paddle.jit.save(model, args.output_path)
def __init__(self,
             sentencepiece_model_file,
             do_lower_case=True,
             encoding="utf8",
             unk_token="<unk>",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]",
             **kwargs):
    if not os.path.isfile(sentencepiece_model_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the "
            "vocabulary from a pretrained model please use "
            "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(sentencepiece_model_file))
    self.encoding = encoding
    mod = try_import('sentencepiece')
    self.sp_model = mod.SentencePieceProcessor()
    if os.path.isfile(sentencepiece_model_file):
        self.sp_model.Load(sentencepiece_model_file)

    vocab_dict = {}
    for id in range(self.sp_model.get_piece_size()):
        vocab_dict[self.sp_model.id_to_piece(id)] = id
    self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
    self.start_word_tokens = np.array([
        self.vocab._idx_to_token[i][0] == "▁"
        for i in range(0, len(self.vocab))
    ])
    self.unk_token = unk_token
    self.mask_id = vocab_dict[mask_token]
    self.unk_id = vocab_dict[unk_token]
    self.cls_id = vocab_dict[cls_token]
    self.sep_id = vocab_dict[sep_token]
    self.pad_id = vocab_dict[pad_token] if pad_token in vocab_dict else 0

    unk_token = AddedToken(unk_token, lstrip=False,
                           rstrip=False) if isinstance(unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, lstrip=False,
                           rstrip=False) if isinstance(pad_token, str) else pad_token
    cls_token = AddedToken(cls_token, lstrip=False,
                           rstrip=False) if isinstance(cls_token, str) else cls_token
    sep_token = AddedToken(sep_token, lstrip=False,
                           rstrip=False) if isinstance(sep_token, str) else sep_token
    # The mask token behaves like a normal word, i.e. it includes the space
    # before it.
    mask_token = AddedToken(mask_token, lstrip=True,
                            rstrip=False) if isinstance(mask_token, str) else mask_token

    self._build_special_tokens_map_extended(sep_token=sep_token,
                                            cls_token=cls_token,
                                            unk_token=unk_token,
                                            pad_token=pad_token,
                                            mask_token=mask_token)
def create_infer_loader(args):
    dataset = load_dataset('wmt14ende', splits=('test'))
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx),
                             num_workers=0,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
def init_lstm_var(args):
    vocab = Vocab.load_vocabulary(args.vocab_path,
                                  unk_token='[UNK]',
                                  pad_token='[PAD]')
    tokenizer = CharTokenizer(vocab, args.language, '../../punctuations')
    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

    trans_fn = partial(convert_example,
                       tokenizer=tokenizer,
                       is_test=True,
                       language=args.language)

    # Init attention layer
    lstm_hidden_size = 196
    attention = SelfInteractiveAttention(hidden_size=2 * lstm_hidden_size)
    model = BiLSTMAttentionModel(attention_layer=attention,
                                 vocab_size=len(tokenizer.vocab),
                                 lstm_hidden_size=lstm_hidden_size,
                                 num_classes=2,
                                 padding_idx=padding_idx)

    # Reads data and generates mini-batches.
    dev_ds = Senti_data().read(args.data_dir)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=padding_idx),  # input_ids
        Stack(dtype="int64"),  # seq_len
    ): [data for data in fn(samples)]

    dev_loader = create_dataloader(dev_ds,
                                   trans_fn=trans_fn,
                                   batch_size=args.batch_size,
                                   mode='validation',
                                   batchify_fn=batchify_fn)

    return model, tokenizer, dev_loader
def __init__(self,
             sentencepiece_model_file,
             do_lower_case=True,
             encoding="utf8",
             unk_token="<unk>",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]"):
    if not os.path.isfile(sentencepiece_model_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the "
            "vocabulary from a pretrained model please use "
            "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(sentencepiece_model_file))
    self.encoding = encoding
    mod = try_import('sentencepiece')
    self.sp_model = mod.SentencePieceProcessor()
    if os.path.isfile(sentencepiece_model_file):
        self.sp_model.Load(sentencepiece_model_file)

    vocab_dict = {}
    for id in range(self.sp_model.get_piece_size()):
        vocab_dict[self.sp_model.id_to_piece(id)] = id
    self.vocab = Vocab.from_dict(vocab_dict, unk_token=unk_token)
    self.start_word_tokens = np.array([
        self.vocab._idx_to_token[i][0] == "▁"
        for i in range(0, len(self.vocab))
    ])
    self.unk_token = unk_token
    self.mask_id = vocab_dict[mask_token]
    self.unk_id = vocab_dict[unk_token]
    self.cls_id = vocab_dict[cls_token]
    self.sep_id = vocab_dict[sep_token]
def main():
    # Load vocab.
    if not os.path.exists(args.vocab_path):
        raise RuntimeError('The vocab_path can not be found in the path %s' %
                           args.vocab_path)
    vocab = Vocab.load_vocabulary(args.vocab_path)
    label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # Construct the network.
    vocab_size = len(vocab)
    num_classes = len(label_map)
    pad_token_id = vocab.to_indices('[PAD]')
    model = TextCNNModel(vocab_size,
                         num_classes,
                         padding_idx=pad_token_id,
                         ngram_filter_sizes=(1, 2, 3))

    # Load model parameters.
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    model.eval()

    inputs = [paddle.static.InputSpec(shape=[None, None], dtype="int64")]
    model = paddle.jit.to_static(model, input_spec=inputs)
    # Save in static graph model.
    paddle.jit.save(model, args.output_path)
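# Illustrative follow-up (not part of the original snippet): a model exported
# with paddle.jit.save, as in main() above, can be reloaded for inference with
# paddle.jit.load using the same path prefix.
static_model = paddle.jit.load(args.output_path)
static_model.eval()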
def create_infer_loader(args, use_all_vocab=False):
    data_files = None
    if args.root != "None" and os.path.exists(args.root):
        data_files = {
            'test': (os.path.join(args.root, "newstest2014.tok.bpe.33708.en"),
                     os.path.join(args.root, "newstest2014.tok.bpe.33708.de"))
        }

    dataset = load_dataset('wmt14ende', data_files=data_files, splits=('test'))
    if use_all_vocab:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx,
                                                pad_seq=args.pad_seq),
                             num_workers=0,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
def create_train_loader(args):
    batch_size = args.batch_size
    max_len = args.max_len

    train_ds, dev_ds = load_dataset('iwslt15', splits=('train', 'dev'))
    src_vocab = Vocab.load_vocabulary(**train_ds.vocab_info['en'])
    tgt_vocab = Vocab.load_vocabulary(**train_ds.vocab_info['vi'])
    bos_id = src_vocab[src_vocab.bos_token]
    eos_id = src_vocab[src_vocab.eos_token]
    pad_id = eos_id

    def convert_example(example):
        source = example['en'].split()[:max_len]
        target = example['vi'].split()[:max_len]
        source = src_vocab.to_indices(source)
        target = tgt_vocab.to_indices(target)
        return source, target

    key = (lambda x, data_source: len(data_source[x][0]))

    # Truncate and convert examples to ids.
    train_ds = train_ds.map(convert_example, lazy=False)
    dev_ds = dev_ds.map(convert_example, lazy=False)

    train_batch_sampler = SamplerHelper(train_ds).shuffle().sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)
    dev_batch_sampler = SamplerHelper(dev_ds).sort(
        key=key, buffer_size=batch_size * 20).batch(batch_size=batch_size)

    train_loader = paddle.io.DataLoader(train_ds,
                                        batch_sampler=train_batch_sampler,
                                        collate_fn=partial(prepare_train_input,
                                                           bos_id=bos_id,
                                                           eos_id=eos_id,
                                                           pad_id=pad_id))
    dev_loader = paddle.io.DataLoader(dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=partial(prepare_train_input,
                                                         bos_id=bos_id,
                                                         eos_id=eos_id,
                                                         pad_id=pad_id))

    return train_loader, dev_loader, len(src_vocab), len(tgt_vocab), pad_id
def __init__(self,
             embedding_name=EMBEDDING_NAME_LIST[0],
             unknown_token=UNK_TOKEN,
             unknown_token_vector=None,
             extended_vocab_path=None,
             trainable=True,
             keep_extended_vocab_only=False):
    vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
    if not osp.exists(vector_path):
        # Download the pretrained embedding file if it is not cached locally.
        url = EMBEDDING_URL_ROOT + "/" + embedding_name + ".tar.gz"
        get_path_from_url(url, EMBEDDING_HOME)

    logger.info("Loading token embedding...")
    vector_np = np.load(vector_path)
    self.embedding_dim = vector_np['embedding'].shape[1]
    self.unknown_token = unknown_token
    if unknown_token_vector is not None:
        unk_vector = np.array(unknown_token_vector).astype(
            paddle.get_default_dtype())
    else:
        unk_vector = np.random.normal(
            scale=0.02,
            size=self.embedding_dim).astype(paddle.get_default_dtype())
    pad_vector = np.array([0] * self.embedding_dim).astype(
        paddle.get_default_dtype())

    if extended_vocab_path is not None:
        embedding_table = self._extend_vocab(extended_vocab_path, vector_np,
                                             pad_vector, unk_vector,
                                             keep_extended_vocab_only)
        trainable = True
    else:
        embedding_table = self._init_without_extend_vocab(
            vector_np, pad_vector, unk_vector)

    self.vocab = Vocab.from_dict(self._word_to_idx,
                                 unk_token=unknown_token,
                                 pad_token=PAD_TOKEN)
    self.num_embeddings = embedding_table.shape[0]
    # Initialize the underlying embedding layer and load the table.
    super(TokenEmbedding, self).__init__(self.num_embeddings,
                                         self.embedding_dim,
                                         padding_idx=self._word_to_idx[PAD_TOKEN])
    self.weight.set_value(embedding_table)
    self.set_trainable(trainable)
    logger.info("Finish loading embedding vector.")

    s = ("Token Embedding info:"
         "\nUnknown index: {}"
         "\nUnknown token: {}"
         "\nPadding index: {}"
         "\nPadding token: {}"
         "\nShape: {}").format(self._word_to_idx[self.unknown_token],
                               self.unknown_token,
                               self._word_to_idx[PAD_TOKEN], PAD_TOKEN,
                               self.weight.shape)
    logger.info(s)
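# Illustrative usage sketch (not part of the original snippet): look up vectors
# with a TokenEmbedding constructed as above. The embedding name below is an
# assumed example of a pretrained embedding shipped with PaddleNLP.
import paddle
from paddlenlp.embeddings import TokenEmbedding

token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
ids = token_embedding.vocab.to_indices(["中国", "人民"])
vectors = token_embedding(paddle.to_tensor([ids]))  # shape: [1, 2, 300]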
def create_infer_loader(args):
    dataset = load_dataset(read,
                           src_path=args.predict_file,
                           tgt_path=None,
                           is_predict=True,
                           lazy=False)

    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])
    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample['src'].split()
        target = sample['tgt'].split()
        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(
        batch_size=args.infer_batch_size, drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                bos_idx=args.bos_idx,
                                                eos_idx=args.eos_idx,
                                                pad_idx=args.bos_idx),
                             num_workers=2,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
def adapt_vocab_size(args):
    dataset = load_dataset('wmt14ende', splits=('test'))
    src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
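# Worked example (not part of the original snippet): the padding_vocab lambda
# above rounds the vocabulary size up to the nearest multiple of pad_factor.
# The values below are assumptions used only for illustration.
pad_factor = 8      # assumed; the real value comes from args.pad_factor
vocab_len = 33708   # e.g. roughly the size of the wmt14ende BPE vocab
padded = (vocab_len + pad_factor - 1) // pad_factor * pad_factor
assert padded == 33712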
def create_infer_loader(args, places=None):
    data_files = {
        'test': args.predict_file,
    }
    dataset = load_dataset(read,
                           src_tgt_file=data_files['test'],
                           only_src=True,
                           lazy=False)

    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])
    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                      bos_token=args.special_token[0],
                                      eos_token=args.special_token[1],
                                      unk_token=args.special_token[2])
    args.src_vocab_size = len(src_vocab)
    args.trg_vocab_size = len(trg_vocab)

    def convert_samples(sample):
        source = [item.strip() for item in sample['src'].split()]
        source = src_vocab.to_indices(source) + [args.eos_idx]
        target = [args.bos_idx]
        return source, target

    dataset = dataset.map(convert_samples, lazy=False)

    batch_sampler = SamplerHelper(dataset).batch(batch_size=args.batch_size,
                                                 drop_last=False)

    data_loader = DataLoader(dataset=dataset,
                             places=places,
                             batch_sampler=batch_sampler,
                             collate_fn=partial(prepare_infer_input,
                                                pad_idx=args.bos_idx),
                             num_workers=0,
                             return_list=True)
    return data_loader, trg_vocab.to_tokens
def main():
    args = parse_args()

    predictor = Predictor.create_predictor(args)
    test_loader, src_vocab_size, tgt_vocab_size, bos_id, eos_id = create_infer_loader(
        args)
    tgt_vocab = Vocab.load_vocabulary(**test_loader.dataset.vocab_info['vi'])
    trg_idx2word = tgt_vocab.idx_to_token

    predictor.predict(test_loader, args.infer_output_file, trg_idx2word, bos_id,
                      eos_id)
def adapt_vocab_size(args):
    if args.vocab_file is not None:
        src_vocab = Vocab.load_vocabulary(filepath=args.vocab_file,
                                          unk_token=args.unk_token,
                                          bos_token=args.bos_token,
                                          eos_token=args.eos_token)
    else:
        dataset = load_dataset('wmt14ende', splits=('test'))
        if not args.benchmark:
            src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["bpe"])
        else:
            src_vocab = Vocab.load_vocabulary(**dataset.vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor)
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
def __init__(self, args, is_chinese):
    bpe_parser = subword_nmt.create_apply_bpe_parser()
    bpe_args = bpe_parser.parse_args(args=['-c', args.src_bpe_dict])
    self.bpe = subword_nmt.BPE(bpe_args.codes, bpe_args.merges,
                               bpe_args.separator, None, bpe_args.glossaries)
    self.is_chinese = is_chinese

    self.src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath,
                                           bos_token=args.special_token[0],
                                           eos_token=args.special_token[1],
                                           unk_token=args.special_token[2])
    self.trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath,
                                           bos_token=args.special_token[0],
                                           eos_token=args.special_token[1],
                                           unk_token=args.special_token[2])
    args.src_vocab_size = len(self.src_vocab)
    args.trg_vocab_size = len(self.trg_vocab)
    self.args = args
def create_infer_loader(batch_size=128):
    test_ds = load_dataset('couplet', splits='test')
    vocab = Vocab.load_vocabulary(**test_ds.vocab_info)
    pad_id = vocab[vocab.eos_token]

    trans_func = partial(convert_example, vocab=vocab)
    test_ds = test_ds.map(trans_func, lazy=False)
    test_batch_sampler = SamplerHelper(test_ds).batch(batch_size=batch_size)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=partial(prepare_input,
                                                          pad_id=pad_id))
    return test_loader, vocab
def do_predict(args):
    device = paddle.set_device(args.device)

    test_loader, src_vocab_size, tgt_vocab_size, bos_id, eos_id = create_infer_loader(
        args)
    tgt_vocab = Vocab.load_vocabulary(**test_loader.dataset.vocab_info['vi'])
    trg_idx2word = tgt_vocab.idx_to_token

    model = paddle.Model(
        Seq2SeqAttnInferModel(src_vocab_size,
                              tgt_vocab_size,
                              args.hidden_size,
                              args.hidden_size,
                              args.num_layers,
                              args.dropout,
                              bos_id=bos_id,
                              eos_id=eos_id,
                              beam_size=args.beam_size,
                              max_out_len=256))
    model.prepare()

    # Load the trained model.
    assert args.init_from_ckpt, (
        "Please set reload_model to load the infer model.")
    model.load(args.init_from_ckpt)

    cand_list = []
    with io.open(args.infer_output_file, 'w', encoding='utf-8') as f:
        for data in test_loader():
            with paddle.no_grad():
                finished_seq = model.predict_batch(inputs=data)[0]
            finished_seq = finished_seq[:, :, np.newaxis] if len(
                finished_seq.shape) == 2 else finished_seq
            finished_seq = np.transpose(finished_seq, [0, 2, 1])
            for ins in finished_seq:
                for beam_idx, beam in enumerate(ins):
                    id_list = post_process_seq(beam, bos_id, eos_id)
                    word_list = [trg_idx2word[id] for id in id_list]
                    sequence = " ".join(word_list) + "\n"
                    f.write(sequence)
                    cand_list.append(word_list)
                    break

    bleu = BLEU()
    for i, data in enumerate(test_loader.dataset.data):
        ref = data['vi'].split()
        bleu.add_inst(cand_list[i], [ref])
    print("BLEU score is %s." % bleu.score())
def __init__(self,
             bpe_codes_fpath,
             src_vocab_fpath,
             trg_vocab_fpath,
             special_token=["<s>", "<e>", "<unk>"]):
    bpe_parser = subword_nmt.create_apply_bpe_parser()
    bpe_args = bpe_parser.parse_args(args=['-c', bpe_codes_fpath])
    self.bpe = subword_nmt.BPE(bpe_args.codes, bpe_args.merges,
                               bpe_args.separator, None, bpe_args.glossaries)

    self.src_vocab = Vocab.load_vocabulary(src_vocab_fpath,
                                           bos_token=special_token[0],
                                           eos_token=special_token[1],
                                           unk_token=special_token[2])
    self.trg_vocab = Vocab.load_vocabulary(trg_vocab_fpath,
                                           bos_token=special_token[0],
                                           eos_token=special_token[1],
                                           unk_token=special_token[2])
    self.src_vocab_size = len(self.src_vocab)
    self.trg_vocab_size = len(self.trg_vocab)
def get_vocab(cls, root=None):
    """
    Load vocab from vocab files. If the vocab files don't exist, they will
    be downloaded.

    Args:
        root (str, optional): Data directory of the dataset. If not provided,
            the dataset will be saved in
            `~/.paddlenlp/datasets/machine_translation`. If provided, an md5
            check is performed, and the dataset is downloaded to the default
            directory if the check fails. Default: None.

    Returns:
        tuple: Source vocab and target vocab.

    Examples:

        .. code-block:: python

            from paddlenlp.datasets import IWSLT15
            (src_vocab, tgt_vocab) = IWSLT15.get_vocab()
    """
    root = cls._download_data(root=root)
    src_vocab_filename, tgt_vocab_filename, _, _ = cls.VOCAB_INFO
    src_file_path = os.path.join(root, src_vocab_filename)
    tgt_file_path = os.path.join(root, tgt_vocab_filename)

    src_vocab = Vocab.load_vocabulary(filepath=src_file_path,
                                      unk_token=cls.UNK_TOKEN,
                                      pad_token=cls.PAD_TOKEN,
                                      bos_token=cls.BOS_TOKEN,
                                      eos_token=cls.EOS_TOKEN)
    tgt_vocab = Vocab.load_vocabulary(filepath=tgt_file_path,
                                      unk_token=cls.UNK_TOKEN,
                                      pad_token=cls.PAD_TOKEN,
                                      bos_token=cls.BOS_TOKEN,
                                      eos_token=cls.EOS_TOKEN)
    return (src_vocab, tgt_vocab)
def create_data_loader_for_small_model(task_name,
                                       vocab_path,
                                       model_name=None,
                                       batch_size=64,
                                       max_seq_length=128,
                                       shuffle=True):
    """Data loader for bi-lstm, not bert."""
    if task_name == 'chnsenticorp':
        train_ds, dev_ds = load_dataset(task_name, splits=["train", "dev"])
    else:
        train_ds, dev_ds = load_dataset('glue', task_name,
                                        splits=["train", "dev"])

    if task_name == 'chnsenticorp':
        vocab = Vocab.load_vocabulary(
            vocab_path,
            unk_token='[UNK]',
            pad_token='[PAD]',
            bos_token=None,
            eos_token=None,
        )
        pad_val = vocab['[PAD]']
    else:
        vocab = BertTokenizer.from_pretrained(model_name)
        pad_val = vocab.pad_token_id

    trans_fn = partial(convert_small_example,
                       task_name=task_name,
                       vocab=vocab,
                       max_seq_length=max_seq_length,
                       is_test=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=pad_val),  # input_ids
        Stack(dtype="int64"),  # seq_len
        Stack(dtype="int64")  # label
    ): fn(samples)

    train_ds = train_ds.map(trans_fn, lazy=True)
    dev_ds = dev_ds.map(trans_fn, lazy=True)

    train_data_loader, dev_data_loader = create_dataloader(
        train_ds, dev_ds, batch_size, batchify_fn, shuffle)

    return train_data_loader, dev_data_loader