def __init__(self, src_vocab=None, tgt_vocab=None, max_src_len=None, max_tgt_len=None):
    self.src_vocab = src_vocab
    self.tgt_vocab = tgt_vocab
    self.max_src_len = max_src_len
    self.max_tgt_len = max_tgt_len
    self.bert_src_tokenizer = BERTTokenizer(src_vocab)
    self.bert_tgt_tokenizer = BERTTokenizer(tgt_vocab)
def __init__(self, en_vocab=None, ch_vocab=None, max_en_len=None, max_ch_len=None):
    self.en_vocab = en_vocab
    self.ch_vocab = ch_vocab
    self.max_en_len = max_en_len
    self.max_ch_len = max_ch_len
    self.bert_en_tokenizer = BERTTokenizer(en_vocab)
    self.bert_ch_tokenizer = BERTTokenizer(ch_vocab)
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256,
             root=os.path.join(get_home_dir(), 'models')):
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    # Don't download the pretrained models if we have a parameter path
    self.bert, self.vocab = gluonnlp.model.get_model(
        model, dataset_name=self.dataset_name, pretrained=params_path is None,
        ctx=self.ctx, use_pooler=False, use_decoder=False,
        use_classifier=False, root=root)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)
    lower = 'uncased' in self.dataset_name
    self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                           max_seq_length=self.max_seq_length,
                                           pair=False)
def data_loader(self, sentences, shuffle=False):
    tokenizer = BERTTokenizer(self.vocab)
    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=self.max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    return DataLoader(dataset=dataset, batch_size=self.batch_size,
                      shuffle=shuffle)
def main():
    """Main function."""
    logging.info('loading vocab file')
    vocab_obj = nlp.Vocab.from_json(open(args.vocab_file, 'rt').read())
    tokenizer = BERTTokenizer(vocab=vocab_obj, lower=args.do_lower_case)

    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(input_pattern))

    logging.info('*** Reading from input files ***')
    for input_file in input_files:
        logging.info('  %s', input_file)

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob,
        args.max_predictions_per_seq, rng)

    output_files = args.output_file.split(',')
    logging.info('*** Writing to output files ***')
    for output_file in output_files:
        logging.info('  %s', output_file)

    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq, output_files)
def get_bert_datasets(class_labels, vectorizer, train_ds, dev_ds, batch_size,
                      max_len, bert_model_name='bert_12_768_12',
                      bert_dataset='book_corpus_wiki_en_uncased', pad=False,
                      use_bert_vocab=False, ctx=mx.cpu()):
    bert, bert_vocabulary = get_model(
        name=bert_model_name, dataset_name=bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=True, use_decoder=False, use_classifier=False)
    do_lower_case = 'uncased' in bert_dataset
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)
    trans = BERTDatasetTransform(
        bert_tokenizer, max_len, class_labels=class_labels, label_alias=None,
        pad=pad, pair=False, has_label=True, vectorizer=vectorizer,
        bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0)
    train_data, dev_data, test_data, num_train_examples = preprocess_data(
        trans, class_labels, train_ds, dev_ds, batch_size, max_len, pad)
    return train_data, dev_data, num_train_examples, bert, bert_vocabulary
def main():
    """Main function."""
    time_start = time.time()
    logging.info('loading vocab file from dataset: %s', args.vocab)
    vocab_obj = nlp.data.utils._load_pretrained_vocab(args.vocab)
    tokenizer = BERTTokenizer(vocab=vocab_obj, lower='uncased' in args.vocab)

    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
    logging.info('*** Reading from %d input files ***', len(input_files))
    for input_file in input_files:
        logging.info('  %s', input_file)

    num_outputs = args.num_outputs
    assert len(input_files) >= num_outputs, \
        'Number of outputs must not exceed the number of inputs'

    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    rng = random.Random(args.random_seed)
    nworker = args.num_workers

    # calculate the number of splits
    file_splits = []
    split_size = (len(input_files) + num_outputs - 1) // num_outputs
    for i in range(num_outputs - 1):
        file_splits.append(input_files[i * split_size:(i + 1) * split_size])
    file_splits.append(input_files[(num_outputs - 1) * split_size:])

    # prepare workload
    suffix = 'npz' if args.format == 'numpy' else 'rec'
    count = 0
    map_args = []
    pool_args = (tokenizer, args.max_seq_length, args.dupe_factor,
                 args.short_seq_prob, args.masked_lm_prob,
                 args.max_predictions_per_seq, rng)
    for i, file_split in enumerate(file_splits):
        out = os.path.join(output_dir,
                           'part-{}.{}'.format(str(i).zfill(3), suffix))
        count += len(file_split)
        map_args.append((file_split, out) + pool_args)

    # sanity check
    assert count == len(input_files)

    # dispatch to workers
    if nworker > 0:
        pool = Pool(nworker)
        pool.map(create_training_instances, map_args)
    else:
        for map_arg in map_args:
            create_training_instances(map_arg)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256):
    """
    Encoding from BERT model.

    Parameters
    ----------
    ctx : Context.
        running BertEmbedding on which gpu device id.
    dtype: str
        data type to use for the model.
    model : str, default bert_12_768_12.
        pre-trained BERT model
    dataset_name : str, default book_corpus_wiki_en_uncased.
        pre-trained model dataset
    params_path: str, default None
        path to a parameters file to load instead of the pretrained model.
    max_seq_length : int, default 25
        max length of each sequence
    batch_size : int, default 256
        batch size
    """
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    # Don't download the pretrained model if we have a parameter path
    pretrained = params_path is None
    self.bert, self.vocab = gluonnlp.model.get_model(
        model, dataset_name=self.dataset_name, pretrained=pretrained,
        ctx=self.ctx, use_pooler=False, use_decoder=False,
        use_classifier=False)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)
    lower = 'uncased' in self.dataset_name
    self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(
        tokenizer=self.tokenizer, max_seq_length=self.max_seq_length,
        pair=False)
def test_bert_dataset_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    label_cls = 0
    vocab_tokens = ['is', 'this', 'jack', '##son', '##ville', '?',
                    'no', 'it', 'is', 'not']

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = BERTTokenizer(vocab=bert_vocab)

    # test BERTDatasetTransform for classification task
    bert_cls_dataset_t = BERTDatasetTransform(tokenizer, 15,
                                              labels=[label_cls],
                                              pad=True, pair=True,
                                              label_dtype='int32')
    token_ids, length, type_ids, label_ids = bert_cls_dataset_t(
        (text_a, text_b, label_cls))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15,), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_ids == np.array([label_cls], dtype=np.int32))

    # test BERTDatasetTransform for regression task
    label_reg = 0.2
    bert_reg_dataset_t = BERTDatasetTransform(tokenizer, 15, pad=True,
                                              pair=True, label_dtype='float32')
    token_ids, length, type_ids, label_reg_val = bert_reg_dataset_t(
        (text_a, text_b, label_reg))
    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_reg_val == np.array([label_reg], dtype=np.float32))
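# Companion sketch for the test above: the single-sentence (pair=False)
# layout produced by BERTSentenceTransform, which most snippets in this
# section rely on. The toy vocabulary is illustrative only; special-token
# placement follows the test above.
from gluonnlp.data import BERTTokenizer, BERTSentenceTransform, count_tokens
from gluonnlp.vocab import BERTVocab

vocab = BERTVocab(count_tokens(['is', 'this', 'jack', '##son', '##ville', '?']))
transform = BERTSentenceTransform(BERTTokenizer(vocab), max_seq_length=8,
                                  pair=False)
token_ids, valid_length, segment_ids = transform(('is this jacksonville ?',))
# token_ids    -> [CLS] is this jack ##son ##ville ? [SEP],
#                 padded/truncated to max_seq_length=8 (here it fits exactly)
# valid_length -> 8 (content tokens plus [CLS]/[SEP])
# segment_ids  -> all zeros, since pair=False
print(token_ids, valid_length, segment_ids)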
def summarize(sentences, transformer, src_vocab, tgt_vocab):
    tokenizer = BERTTokenizer(src_vocab)
    sentences = tokenizer(sentences)
    sent_idx = src_vocab.to_indices(sentences)
    sent_idx = nd.array([sent_idx])
    Y_h = _summarize(transformer, sent_idx, tgt_vocab)
    Y_h = Y_h[0].asnumpy().tolist()
    Y_h = list(map(int, Y_h))
    predict = tgt_vocab.to_tokens(Y_h)
    return predict
def load_dataset_bert(json_file, voc_size, json_text_key="text",
                      json_sp_key="sp_vec", max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    bert_base, vocab = nlp.model.get_model(
        bert_model, dataset_name=dname, pretrained=True, ctx=ctx,
        use_pooler=True, use_decoder=False, use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    with io.open(json_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            if json_text_key:
                js = json.loads(line)
                line = js[json_text_key]
            if len(line.split(' ')) > 4:
                ids, lens, segs = transform((line,))  # create BERT-ready inputs
                x_ids.append(ids)
                x_val_lens.append(lens)
                x_segs.append(segs)
                ## Now, get the sparse vector
                ndocs += 1
                sp_vec_els = js[json_sp_key]
                n_pairs, inds, vs = get_single_vec(sp_vec_els)
                cumulative += n_pairs
                total_num_words += sum(vs)
                indptrs.append(cumulative)
                values.extend(vs)
                indices.extend(inds)
    csr_mat = mx.nd.sparse.csr_matrix((values, indices, indptrs),
                                      shape=(ndocs, voc_size))
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'),
        csr_mat.tostype('default'))
    return data_train, bert_base, vocab, csr_mat
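# Usage sketch for load_dataset_bert above. The path and vocabulary size are
# hypothetical; each input line is assumed to be a JSON object whose "text"
# field holds the document and whose "sp_vec" field holds the sparse
# bag-of-words entries consumed by get_single_vec, matching the
# json_text_key/json_sp_key defaults. Requires pretrained weights to be
# downloadable.
data_train, bert_base, vocab, csr_mat = load_dataset_bert(
    'train.jsonl',   # hypothetical file
    voc_size=2000,   # hypothetical bag-of-words vocabulary size
    max_len=64)
print(len(data_train), csr_mat.shape)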
def get_bert_datasets(class_labels, vectorizer, train_ds, dev_ds, batch_size,
                      max_len, aux_ds=None, bert_model_name='bert_12_768_12',
                      bert_dataset='book_corpus_wiki_en_uncased', pad=False,
                      use_bert_vocab=False, label_alias=None, num_classes=None,
                      ctx=mx.cpu()):
    if class_labels is None and num_classes is None:
        raise Exception('Must provide class_labels or num_classes')
    bert, bert_vocabulary = get_model(
        name=bert_model_name, dataset_name=bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=True, use_decoder=False, use_classifier=False)
    do_lower_case = 'uncased' in bert_dataset
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)
    trans = BERTDatasetTransform(
        bert_tokenizer, max_len, class_labels=class_labels,
        label_alias=label_alias, pad=pad, pair=False, has_label=True,
        vectorizer=vectorizer,
        bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0,
        num_classes=num_classes)
    train_data, num_train_examples = preprocess_seq_data(
        trans, class_labels, train_ds, batch_size, max_len, train_mode=True,
        pad=pad, aux_dataset=aux_ds)
    dev_data, _ = preprocess_seq_data(trans, class_labels, dev_ds, batch_size,
                                      max_len, train_mode=False, pad=pad)
    return train_data, dev_data, num_train_examples, bert, bert_vocabulary
def _load_dataset_bert(line_gen, voc_size, max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    ## This is really only needed here to get the vocab;
    ## the GluonNLP API doesn't enable that directly
    bert_base, vocab = nlp.model.get_model(
        bert_model, dataset_name=dname, pretrained=True, ctx=ctx,
        use_pooler=True, use_decoder=False, use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    for t in line_gen:
        if isinstance(t, tuple):
            line = t[0]
            sp_vec_els = t[1]
        else:
            line = t
            sp_vec_els = None
        ids, lens, segs = transform((line,))  # create BERT-ready inputs
        x_ids.append(ids)
        x_val_lens.append(lens)
        x_segs.append(segs)
        ## Now, get the sparse vector
        ndocs += 1
        if sp_vec_els:
            pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += len(pairs)
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    if len(indices) > 0:
        csr_mat = mx.nd.sparse.csr_matrix(
            (values, indices, indptrs),
            shape=(ndocs, voc_size)).tostype('default')
    else:
        csr_mat = None
    return x_ids, x_val_lens, x_segs, bert_base, vocab, csr_mat
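# Usage sketch for _load_dataset_bert above (hedged: the input format is an
# assumption — the exact element layout expected by get_single_vec is not
# shown in this section, so the (index, value) pairs below are illustrative;
# csr construction assumes every document carries a sparse vector, or none).
lines = [('first example document text', [(0, 2.0), (5, 1.0)]),
         ('second example document text', [(3, 1.0)])]
x_ids, x_val_lens, x_segs, bert_base, vocab, csr_mat = \
    _load_dataset_bert(iter(lines), voc_size=10, max_len=32)
print(len(x_ids), csr_mat)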
def __init__(self, model, bert_vocab, max_length, bow_vocab=None, ctx=mx.cpu()):
    super().__init__(ctx)
    self.model = model
    self.bert_base = model.bert
    self.tokenizer = BERTTokenizer(bert_vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, max_length,
                                           pair=False)
    self.bow_vocab = bow_vocab
def get_dual_bert_datasets(class_labels, vectorizer, train_ds1, train_ds2,
                           model_name, dataset, batch_size, dev_bs, max_len1,
                           max_len2, pad, use_bert_vocab=False, shuffle=True,
                           ctx=mx.cpu()):
    bert, bert_vocabulary = get_model(
        name=model_name, dataset_name=dataset, pretrained=True, ctx=ctx,
        use_pooler=True, use_decoder=False, use_classifier=False)
    do_lower_case = 'uncased' in dataset
    bert_tokenizer = BERTTokenizer(bert_vocabulary, lower=do_lower_case)
    # transformation for data train and dev
    trans1 = BERTDatasetTransform(
        bert_tokenizer, max_len1, class_labels=class_labels, label_alias=None,
        pad=pad, pair=False, has_label=True, vectorizer=vectorizer,
        bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0)
    trans2 = BERTDatasetTransform(
        bert_tokenizer, max_len2, class_labels=class_labels, label_alias=None,
        pad=pad, pair=False, has_label=True, vectorizer=vectorizer,
        bert_vocab_size=len(bert_vocabulary) if use_bert_vocab else 0)
    #train_data, num_train_examples = preprocess_data_metriclearn(
    #    trans, class_labels, train_ds1, train_ds2, batch_size, max_len, pad)
    batch_size = len(train_ds2)
    a_train_data, num_train_examples, b_train_data = \
        preprocess_data_metriclearn_separate(
            trans1, trans2, class_labels, train_ds1, train_ds2, batch_size,
            shuffle=shuffle)
    return a_train_data, num_train_examples, bert, b_train_data, bert_vocabulary
def __init__(self, model, bert_vocab, max_length, bow_vocab=None,
             pre_vectorizer=None, ctx=mx.cpu()):
    super().__init__(ctx)
    self.model = model
    self.bert_base = model.bert
    self.tokenizer = BERTTokenizer(bert_vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, max_length,
                                           pair=False)
    self.bow_vocab = bow_vocab
    self.vectorizer = pre_vectorizer or TMNTVectorizer(
        initial_vocabulary=bow_vocab)
def word_piece_tokenizer(sentences):
    ctx = ghp.ctx
    model = 'bert_12_768_12'
    dataset_name = 'book_corpus_wiki_en_uncased'
    max_seq_length = ghp.max_seq_len
    batch_size = 256
    _, vocab = gluonnlp.model.get_model(model, dataset_name=dataset_name,
                                        pretrained=True, ctx=ctx,
                                        use_pooler=False, use_decoder=False,
                                        use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                             shuffle=False)

    batches = []
    for token_ids, _, _ in data_loader:
        token_ids = token_ids.as_in_context(ctx)
        for token_id in token_ids.asnumpy():
            batches.append(token_id)

    cut_results = []
    for token_ids in batches:
        tokens = []
        for token_id in token_ids:
            if token_id == 1:        # [PAD]: the sequence is finished
                break
            if token_id in (2, 3):   # skip [CLS] and [SEP]
                continue
            token = vocab.idx_to_token[token_id]
            if token.startswith('##'):
                # merge a word-piece continuation into the previous token
                token = token[2:]
                tokens[-1] += token
            else:
                tokens.append(token)
        cut_results.append(tokens)
    return cut_results
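# Usage sketch for word_piece_tokenizer above (hedged: assumes ghp.ctx and
# ghp.max_seq_len are configured and pretrained weights are available).
# Because '##' continuations are merged back, sub-word splits disappear
# from the output:
sentences = ['is this jacksonville ?']
print(word_piece_tokenizer(sentences))
# expected along the lines of: [['is', 'this', 'jacksonville', '?']]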
def build_model(self, args, model_args, ctx, dataset=None, vocab=None):
    dataset = model_args.model_name
    if model_args.model_type == 'bert':
        model_name = 'bert_12_768_12'
    elif model_args.model_type == 'bertl':
        model_name = 'bert_24_1024_16'
    elif model_args.model_type == 'roberta':
        model_name = 'roberta_12_768_12'
    elif model_args.model_type == 'robertal':
        model_name = 'roberta_24_1024_16'
    else:
        raise NotImplementedError
    self.is_roberta = model_args.model_type.startswith('roberta')
    pretrained = args.model_params is None
    bert, vocabulary = nlp.model.get_model(
        name=model_name, dataset_name=dataset, pretrained=pretrained, ctx=ctx,
        use_pooler=not self.is_roberta, use_decoder=False,
        use_classifier=False)
    if args.model_params:
        bert.load_parameters(args.model_params, ctx=ctx, cast_dtype=True,
                             ignore_extra=True)
    if args.fix_bert_weights:
        bert.collect_params('.*weight|.*bias').setattr('grad_req', 'null')
    if vocab:
        vocabulary = vocab
    do_lower_case = 'uncased' in dataset
    task_name = args.task_name
    num_classes = self.task.num_classes()
    if self.is_roberta:
        model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
        self.tokenizer = nlp.data.GPT2BPETokenizer()
    else:
        model = BERTClassifier(bert, num_classes=num_classes,
                               dropout=model_args.dropout)
        self.tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)
    return model, vocabulary
def __init__(self, param_file=None, config_file=None, vocab_file=None,
             model_dir=None, ctx=mx.cpu()):
    super().__init__(ctx)
    if model_dir is not None:
        param_file = os.path.join(model_dir, 'model.params')
        vocab_file = os.path.join(model_dir, 'vocab.json')
        config_file = os.path.join(model_dir, 'model.config')
    with open(config_file) as f:
        config = json.loads(f.read())
    with open(vocab_file) as f:
        voc_js = f.read()
    self.bow_vocab = nlp.Vocab.from_json(voc_js)
    self.ctx = ctx
    self.bert_base, self.vocab = nlp.model.get_model(
        'bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased',
        pretrained=True, ctx=ctx, use_pooler=True, use_decoder=False,
        use_classifier=False)  # , output_attention=True
    self.latent_dist = config['latent_distribution']['dist_type']
    self.n_latent = config['n_latent']
    self.kappa = config['latent_distribution']['kappa']
    self.pad_id = self.vocab[self.vocab.padding_token]
    self.max_sent_len = config['sent_size']
    self.model = BertBowVED(self.bert_base, self.bow_vocab,
                            latent_distrib=self.latent_dist,
                            n_latent=self.n_latent, kappa=self.kappa,
                            batch_size=1)
    self.tokenizer = BERTTokenizer(self.vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, self.max_sent_len,
                                           pair=False)
    self.model.load_parameters(str(param_file), allow_missing=False,
                               ignore_extra=True)
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # vocabulary
    logging.info('loading vocab file from dataset: %s', args.vocab)
    vocab = nlp.data.utils._load_pretrained_vocab(args.vocab, root=output_dir,
                                                  cls=nlp.vocab.BERTVocab)
    tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.vocab)

    # count the number of input files
    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
    logging.info('*** Reading from %d input files ***', len(input_files))
    for input_file in input_files:
        logging.info('\t%s', input_file)
    num_outputs = min(args.num_outputs, len(input_files))

    create_training_instances(input_files, tokenizer, args.max_seq_length,
                              args.short_seq_prob, args.masked_lm_prob,
                              args.max_predictions_per_seq, vocab,
                              args.dupe_factor, args.num_workers,
                              num_outputs=num_outputs, output_dir=output_dir)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    nlp.utils.mkdir(output_dir)

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s',
                     args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are '
                          'provided. The vocabulary will be loaded based on '
                          '--sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                             num_best=args.sp_nbest,
                                             alpha=args.sp_alpha,
                                             lower=not args.cased)
    else:
        logging.info('loading vocab file from pre-defined dataset: %s',
                     args.dataset_name)
        vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name,
                                                      root=output_dir,
                                                      cls=nlp.vocab.BERTVocab)
        tokenizer = BERTTokenizer(vocab=vocab,
                                  lower='uncased' in args.dataset_name)

    # count the number of input files
    input_files = []
    for input_pattern in args.input_file.split(','):
        input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
    for input_file in input_files:
        logging.info('\t%s', input_file)
    num_inputs = len(input_files)
    num_outputs = min(args.num_outputs, len(input_files))
    logging.info('*** Reading from %d input files ***', num_inputs)

    # calculate the number of splits
    file_splits = []
    split_size = (num_inputs + num_outputs - 1) // num_outputs
    for i in range(num_outputs):
        split_start = i * split_size
        split_end = min(num_inputs, (i + 1) * split_size)
        file_splits.append(input_files[split_start:split_end])

    # prepare workload
    count = 0
    process_args = []
    for i, file_split in enumerate(file_splits):
        output_file = os.path.join(output_dir,
                                   'part-{}.npz'.format(str(i).zfill(3)))
        count += len(file_split)
        process_args.append((file_split, tokenizer, args.max_seq_length,
                             args.short_seq_prob, args.masked_lm_prob,
                             args.max_predictions_per_seq,
                             args.whole_word_mask, vocab, args.dupe_factor,
                             1, None, output_file))

    # sanity check
    assert count == len(input_files)

    # dispatch to workers
    nworker = args.num_workers
    if nworker > 1:
        pool = Pool(nworker)
        pool.map(create_training_instances, process_args)
    else:
        for process_arg in process_args:
            create_training_instances(process_arg)

    time_end = time.time()
    logging.info('Time cost=%.1f', time_end - time_start)
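# Example invocation for the data-creation script above (hedged: the script
# name is hypothetical and the flag names are assumed to mirror the args.*
# attributes used in main(); paths are illustrative).
#
#   python create_pretraining_data.py \
#       --input_file 'corpus/*.txt' \
#       --output_dir out \
#       --dataset_name book_corpus_wiki_en_uncased \
#       --max_seq_length 128 --max_predictions_per_seq 20 \
#       --masked_lm_prob 0.15 --short_seq_prob 0.1 --dupe_factor 5 \
#       --num_outputs 4 --num_workers 4 --random_seed 12345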
logging.info('loading bert params from {0}'.format(pretrained_bert_parameters))
model.bert.load_parameters(pretrained_bert_parameters, ctx=ctx,
                           ignore_extra=True)
if model_parameters:
    logging.info('loading model params from {0}'.format(model_parameters))
    model.load_parameters(model_parameters, ctx=ctx)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
logging.info(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

# data processing
do_lower_case = 'uncased' in dataset
bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)

def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len):
    """Data preparation function."""
    # transformation
    trans = BERTDatasetTransform(
        tokenizer, max_len, labels=task.get_labels(), pad=False,
        pair=task.is_pair,
        label_dtype='float32' if not task.get_labels() else 'int32')
    data_train = task('train').transform(trans, lazy=False)
    data_train_len = data_train.transform(
np.random.seed(123)
mx.random.seed(123)

dropout_prob = 0.1
ctx = mx.gpu(args.id)
bert_model, bert_vocab = nlp.model.get_model(
    name='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True, ctx=ctx, use_pooler=True, use_decoder=False,
    use_classifier=False, dropout=dropout_prob, embed_dropout=dropout_prob)
tokenizer = BERTTokenizer(bert_vocab, lower=True)

abstract_emb = np.zeros((max_embs, 768), dtype=np.float32)
paper_map = []
fp = open(args.file, 'r')
start = time.time()
for i, line in enumerate(fp):
    paper_id, abstract = line.split('\t')
    paper_id = int(paper_id)
    tokens = tokenizer(abstract)
    if len(tokens) > 512:
        print('paper {} has strings with {} tokens'.format(paper_id,
                                                           len(tokens)))
        tokens = tokens[0:512]
class BertEmbedding:
    """
    Encoding from BERT model.

    Parameters
    ----------
    ctx : Context.
        running BertEmbedding on which gpu device id.
    dtype: str
        data type to use for the model.
    model : str, default bert_12_768_12.
        pre-trained BERT model
    dataset_name : str, default book_corpus_wiki_en_uncased.
        pre-trained model dataset
    params_path: str, default None
        path to a parameters file to load instead of the pretrained model.
    max_seq_length : int, default 25
        max length of each sequence
    batch_size : int, default 256
        batch size
    sentencepiece : str, default None
        Path to the sentencepiece .model file for both tokenization and vocab
    root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet'
        Location for keeping the model parameters.
    """

    def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased', params_path=None,
                 max_seq_length=25, batch_size=256, sentencepiece=None,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name
        # use sentencepiece vocab and a checkpoint:
        # we need to set dataset_name to None, otherwise it uses the
        # downloaded vocab
        if params_path and sentencepiece:
            dataset_name = None
        else:
            dataset_name = self.dataset_name
        if sentencepiece:
            vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
        else:
            vocab = None
        self.bert, self.vocab = gluonnlp.model.get_model(
            model, dataset_name=dataset_name, pretrained=params_path is None,
            ctx=self.ctx, use_pooler=False, use_decoder=False,
            use_classifier=False, root=root, vocab=vocab)
        self.bert.cast(self.dtype)
        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True,
                                      cast_dtype=True)
        lower = 'uncased' in self.dataset_name
        if sentencepiece:
            self.tokenizer = BERTSPTokenizer(sentencepiece, self.vocab,
                                             lower=lower)
        else:
            self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer, max_seq_length=self.max_seq_length,
            pair=False)

    def __call__(self, sentences, oov_way='avg'):
        # forward the caller's oov_way instead of hard-coding 'avg'
        return self.embedding(sentences, oov_way=oov_way)

    def embedding(self, sentences, oov_way='avg'):
        """
        Get tokens, tokens embedding

        Parameters
        ----------
        sentences : List[str]
            sentences for encoding.
        oov_way : str, default avg.
            use **avg**, **sum** or **last** to get token embedding for those
            out of vocabulary words

        Returns
        -------
        List[(List[str], List[ndarray])]
            List of tokens, and tokens embedding
        """
        data_iter = self.data_loader(sentences=sentences)
        batches = []
        for token_ids, valid_length, token_types in data_iter:
            token_ids = token_ids.as_in_context(self.ctx)
            valid_length = valid_length.as_in_context(self.ctx)
            token_types = token_types.as_in_context(self.ctx)
            sequence_outputs = self.bert(token_ids, token_types,
                                         valid_length.astype(self.dtype))
            for token_id, sequence_output in zip(token_ids.asnumpy(),
                                                 sequence_outputs.asnumpy()):
                batches.append((token_id, sequence_output))
        return self.oov(batches, oov_way)

    def data_loader(self, sentences, shuffle=False):
        """Load, tokenize and prepare the input sentences."""
        dataset = BertEmbeddingDataset(sentences, self.transform)
        return DataLoader(dataset=dataset, batch_size=self.batch_size,
                          shuffle=shuffle)

    def oov(self, batches, oov_way='avg'):
        """
        How to handle oov. Also filter out [CLS], [SEP] tokens.

        Parameters
        ----------
        batches : List[(tokens_id, sequence_outputs)].
            batch token_ids shape is (max_seq_length,),
            sequence_outputs shape is (max_seq_length, dim)
        oov_way : str
            use **avg**, **sum** or **last** to get token embedding for those
            out of vocabulary words

        Returns
        -------
        List[(List[str], List[ndarray])]
            List of tokens, and tokens embedding
        """
        sentences = []
        padding_idx, cls_idx, sep_idx = None, None, None
        if self.vocab.padding_token:
            padding_idx = self.vocab[self.vocab.padding_token]
        if self.vocab.cls_token:
            cls_idx = self.vocab[self.vocab.cls_token]
        if self.vocab.sep_token:
            sep_idx = self.vocab[self.vocab.sep_token]
        for token_ids, sequence_outputs in batches:
            tokens = []
            tensors = []
            oov_len = 1
            for token_id, sequence_output in zip(token_ids, sequence_outputs):
                # [PAD] token, sequence is finished.
                if padding_idx and token_id == padding_idx:
                    break
                # [CLS], [SEP]
                if cls_idx and token_id == cls_idx:
                    continue
                if sep_idx and token_id == sep_idx:
                    continue
                token = self.vocab.idx_to_token[token_id]
                if not self.tokenizer.is_first_subword(token):
                    tokens.append(token)
                    if oov_way == 'last':
                        tensors[-1] = sequence_output
                    else:
                        tensors[-1] += sequence_output
                    if oov_way == 'avg':
                        oov_len += 1
                else:  # in-vocab token: average the trailing oov run first
                    if oov_len > 1:
                        tensors[-1] /= oov_len
                        oov_len = 1
                    tokens.append(token)
                    tensors.append(sequence_output)
            if oov_len > 1:  # if the whole sentence ends in one oov run
                tensors[-1] /= oov_len
            sentences.append((tokens, tensors))
        return sentences
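# Quick sketch of the three oov_way modes handled above (hedged: illustrative
# only; assumes pretrained weights can be downloaded). For a word split into
# word pieces, e.g. 'jacksonville' -> jack ##son ##ville, the piece vectors
# are combined as follows:
#   'avg'  -> summed and divided by the length of the run
#   'sum'  -> summed
#   'last' -> only the vector of the final piece is kept
be = BertEmbedding(max_seq_length=32)
for mode in ('avg', 'sum', 'last'):
    tokens, vectors = be.embedding(['is this jacksonville ?'],
                                   oov_way=mode)[0]
    print(mode, tokens, len(vectors))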
def train(args):
    ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
    dataset_name = 'book_corpus_wiki_en_cased' if args.cased \
        else 'book_corpus_wiki_en_uncased'
    bert_model, bert_vocab = nlp.model.get_model(
        name=args.bert_model, dataset_name=dataset_name, pretrained=True,
        ctx=ctx, use_pooler=True, use_decoder=False, use_classifier=False,
        dropout=args.dropout_prob, embed_dropout=args.dropout_prob)
    tokenizer = BERTTokenizer(bert_vocab, lower=not args.cased)
    if args.dataset == 'atis':
        train_data = ATISDataset('train')
        dev_data = ATISDataset('dev')
        test_data = ATISDataset('test')
        intent_vocab = train_data.intent_vocab
        slot_vocab = train_data.slot_vocab
    elif args.dataset == 'snips':
        train_data = SNIPSDataset('train')
        dev_data = SNIPSDataset('dev')
        test_data = SNIPSDataset('test')
        intent_vocab = train_data.intent_vocab
        slot_vocab = train_data.slot_vocab
    else:
        raise NotImplementedError
    print('Dataset {}'.format(args.dataset))
    print('   #Train/Dev/Test = {}/{}/{}'.format(len(train_data),
                                                 len(dev_data),
                                                 len(test_data)))
    print('   #Intent = {}'.format(len(intent_vocab)))
    print('   #Slot = {}'.format(len(slot_vocab)))
    # Display an example
    print('Display A Sample')
    print_sample(test_data, 1)
    print('-' * 80)

    idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                          subword_tokenizer=tokenizer,
                                          slot_vocab=slot_vocab,
                                          cased=args.cased)
    train_data_bert = train_data.transform(idsl_transform, lazy=False)
    dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
    test_data_bert = test_data.transform(idsl_transform, lazy=False)

    # Construct the DataLoader
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(),    # Subword ID
        nlp.data.batchify.Pad(),    # Subword Mask
        nlp.data.batchify.Pad(),    # Beginning of subword
        nlp.data.batchify.Pad(),    # Tag IDs
        nlp.data.batchify.Stack(),  # Intent Label
        nlp.data.batchify.Stack())  # Valid Length
    train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
        [len(ele) for ele in train_data_bert], batch_size=args.batch_size,
        mult=20, shuffle=True)
    train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                         num_workers=4,
                                         batch_sampler=train_batch_sampler,
                                         batchify_fn=batchify_fn)
    dev_loader = gluon.data.DataLoader(dataset=dev_data_bert, num_workers=4,
                                       batch_size=args.batch_size,
                                       batchify_fn=batchify_fn, shuffle=False)
    test_loader = gluon.data.DataLoader(dataset=test_data_bert, num_workers=4,
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        shuffle=False)

    # Build the network and loss functions
    intent_pred_loss = gluon.loss.SoftmaxCELoss()
    if args.use_focal:
        slot_pred_loss = SoftmaxFocalLoss(batch_axis=[0, 1],
                                          alpha=args.focal_alpha,
                                          gamma=args.focal_gamma)
    else:
        slot_pred_loss = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])
    net = BERTForICSL(bert_model, num_intent_classes=len(intent_vocab),
                      num_slot_classes=len(slot_vocab),
                      dropout_prob=args.dropout_prob)
    net.slot_tagger.initialize(ctx=ctx, init=mx.init.Normal(0.02))
    net.intent_classifier.initialize(ctx=ctx, init=mx.init.Normal(0.02))
    net.hybridize()
    intent_pred_loss.hybridize()
    slot_pred_loss.hybridize()

    # Build the trainer
    trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                            {'learning_rate': args.learning_rate,
                             'wd': args.wd},
                            update_on_kvstore=False)
    params = [p for p in net.collect_params().values()
              if p.grad_req != 'null']
    step_num = 0
    num_train_steps = int(len(train_batch_sampler) * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_ratio)
    best_dev_sf1 = -1
    for epoch_id in range(args.epochs):
        avg_train_intent_loss = 0.0
        avg_train_slot_loss = 0.0
        nsample = 0
        nslot = 0
        ntoken = 0
        train_epoch_start = time.time()
        for token_ids, mask, selected, slot_ids, intent_label, valid_length \
                in train_loader:
            ntoken += valid_length.sum().asscalar()
            token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
            mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
            slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
            intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
            valid_length = mx.nd.array(valid_length,
                                       ctx=ctx).astype(np.float32)
            batch_nslots = mask.sum().asscalar()
            batch_nsample = token_ids.shape[0]
            # Set learning rate warm-up
            step_num += 1
            if step_num < num_warmup_steps:
                new_lr = args.learning_rate * step_num / num_warmup_steps
            else:
                offset = ((step_num - num_warmup_steps) * args.learning_rate /
                          (num_train_steps - num_warmup_steps))
                new_lr = args.learning_rate - offset
            trainer.set_learning_rate(new_lr)
            with mx.autograd.record():
                intent_scores, slot_scores = net(token_ids, valid_length)
                intent_loss = intent_pred_loss(intent_scores, intent_label)
                slot_loss = slot_pred_loss(slot_scores, slot_ids,
                                           mask.expand_dims(axis=-1))
                intent_loss = intent_loss.mean()
                slot_loss = slot_loss.sum() / batch_nslots
                loss = intent_loss + args.slot_loss_mult * slot_loss
            loss.backward()
            trainer.update(1.0)
            avg_train_intent_loss += intent_loss.asscalar() * batch_nsample
            avg_train_slot_loss += slot_loss.asscalar() * batch_nslots
            nsample += batch_nsample
            nslot += batch_nslots
        train_epoch_end = time.time()
        avg_train_intent_loss /= nsample
        avg_train_slot_loss /= nslot
        print('[Epoch {}] train intent/slot = {:.3f}/{:.3f}, '
              '#token per second={:.0f}'.format(
                  epoch_id, avg_train_intent_loss, avg_train_slot_loss,
                  ntoken / (train_epoch_end - train_epoch_start)))
        avg_dev_intent_loss, avg_dev_slot_loss, dev_intent_acc, \
            dev_slot_f1, dev_pred_slots, dev_gt_slots \
            = evaluation(ctx, dev_loader, net, intent_pred_loss,
                         slot_pred_loss, slot_vocab)
        print('[Epoch {}] dev intent/slot = {:.3f}/{:.3f}, '
              'slot f1 = {:.2f}, intent acc = {:.2f}'.format(
                  epoch_id, avg_dev_intent_loss, avg_dev_slot_loss,
                  dev_slot_f1 * 100, dev_intent_acc * 100))
        if dev_slot_f1 > best_dev_sf1:
            best_dev_sf1 = dev_slot_f1
            avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
                test_slot_f1, test_pred_slots, test_gt_slots \
                = evaluation(ctx, test_loader, net, intent_pred_loss,
                             slot_pred_loss, slot_vocab)
            print('[Epoch {}] test intent/slot = {:.3f}/{:.3f}, '
                  'slot f1 = {:.2f}, intent acc = {:.2f}'.format(
                      epoch_id, avg_test_intent_loss, avg_test_slot_loss,
                      test_slot_f1 * 100, test_intent_acc * 100))
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            net.save_parameters(os.path.join(args.save_dir,
                                             'best_valid.params'))
    print('Evaluate the best model:')
    net.load_parameters(os.path.join(args.save_dir, 'best_valid.params'))
    avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
        test_slot_f1, test_pred_slots, test_gt_slots \
        = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss,
                     slot_vocab)
    print('Best validation model --> Slot F1={:.2f}, '
          'Intent acc={:.2f}'.format(test_slot_f1 * 100,
                                     test_intent_acc * 100))
    with open(os.path.join(args.save_dir, 'test_error.txt'), 'w') as of:
        of.write('{} {}\n'.format(test_slot_f1, test_intent_acc))
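# Side sketch of the learning-rate schedule used in the training loop above:
# linear warm-up to the base rate, then linear decay to zero. Standalone and
# illustrative; the names mirror the variables in train().
def lr_at_step(step_num, base_lr, num_warmup_steps, num_train_steps):
    """Return the learning rate for a given (1-indexed) optimizer step."""
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = ((step_num - num_warmup_steps) * base_lr
              / (num_train_steps - num_warmup_steps))
    return base_lr - offset

# e.g. with base_lr=2e-5, 100 warm-up steps and 1000 total steps:
#   lr_at_step(50, 2e-5, 100, 1000)  -> 1e-5  (half-way through warm-up)
#   lr_at_step(550, 2e-5, 100, 1000) -> 1e-5  (half-way through decay)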
def translate(args):
    gpu_idx = args.gpu
    if not gpu_idx:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(gpu_idx - 1)
    en_bert, en_vocab = gluonnlp.model.get_model(
        args.bert_model, dataset_name=args.en_bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False)
    _, ch_vocab = gluonnlp.model.get_model(
        args.bert_model, dataset_name=args.ch_bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False)
    mt_model = MTModel_Hybird(en_vocab=en_vocab, ch_vocab=ch_vocab,
                              embedding_dim=args.mt_emb_dim,
                              model_dim=args.mt_model_dim,
                              head_num=args.mt_head_num,
                              layer_num=args.mt_layer_num,
                              ffn_dim=args.mt_ffn_dim,
                              dropout=args.mt_dropout,
                              att_dropout=args.mt_att_dropout,
                              ffn_dropout=args.mt_ffn_dropout, ctx=ctx)
    en_bert.load_parameters(args.en_bert_model_params_path, ctx=ctx)
    mt_model.load_parameters(args.mt_model_params_path, ctx=ctx)

    en_bert_tokenizer = BERTTokenizer(en_vocab)
    ch_bert_tokenizer = BERTTokenizer(ch_vocab)
    while True:
        trans = input("input:")
        trans = en_bert_tokenizer(trans)
        trans = [en_vocab.cls_token] + trans + [en_vocab.sep_token]
        trans_valid_len = len(trans)
        if args.max_en_len and len(trans) > args.max_en_len:
            trans = trans[0:args.max_en_len]
        aim = [BOS]
        trans = en_vocab[trans]
        aim = ch_vocab[aim]
        aim = nd.array([aim], ctx=ctx)
        trans = nd.array([trans], ctx=ctx)
        trans_valid_len = nd.array([trans_valid_len], ctx=ctx)
        trans_token_types = nd.zeros_like(trans)
        batch_size = 1
        beam_size = 6
        en_bert_outputs = en_bert(trans, trans_token_types, trans_valid_len)
        mt_outputs = mt_model(en_bert_outputs, trans, aim)
        en_bert_outputs = nd.broadcast_axes(en_bert_outputs, axis=0,
                                            size=beam_size)
        trans = nd.broadcast_axes(trans, axis=0, size=beam_size)
        targets = None
        for n in range(0, args.max_ch_len):
            aim, targets = beam_search(mt_outputs[:, n, :], targets=targets,
                                       max_seq_len=args.max_ch_len, ctx=ctx,
                                       beam_width=beam_size)
            mt_outputs = mt_model(en_bert_outputs, trans, aim)
        predict = aim.asnumpy().tolist()
        predict_strs = []
        for pred in predict:
            predict_token = [ch_vocab.idx_to_token[int(idx)] for idx in pred]
            predict_str = ""
            sub_token = []
            for token in predict_token:
                # if token in ["[CLS]", EOS, "[SEP]"]:
                #     continue
                if len(sub_token) == 0:
                    sub_token.append(token)
                elif token[:2] != "##" and len(sub_token) != 0:
                    predict_str += "".join(sub_token) + " "
                    sub_token = [token]
                else:
                    if token[:2] == "##":
                        token = token.replace("##", "")
                    sub_token.append(token)
                if token == EOS:
                    if len(sub_token) != 0:
                        predict_str += "".join(sub_token) + " "
                    break
            predict_strs.append(predict_str.replace("[SEP]", "")
                                .replace("[CLS]", "").replace(EOS, ""))
        for predict_str in predict_strs:
            print(predict_str)
def translate(args):
    gpu_idx = args.gpu
    if not gpu_idx:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(gpu_idx - 1)
    src_bert, src_vocab = gluonnlp.model.get_model(
        args.bert_model, dataset_name=args.src_bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False)
    _, tgt_vocab = gluonnlp.model.get_model(
        args.bert_model, dataset_name=args.tgt_bert_dataset, pretrained=True,
        ctx=ctx, use_pooler=False, use_decoder=False, use_classifier=False)
    mt_model = MTModel_Hybird(src_vocab=src_vocab, tgt_vocab=tgt_vocab,
                              embedding_dim=args.mt_emb_dim,
                              model_dim=args.mt_model_dim,
                              head_num=args.mt_head_num,
                              layer_num=args.mt_layer_num,
                              ffn_dim=args.mt_ffn_dim,
                              dropout=args.mt_dropout,
                              att_dropout=args.mt_att_dropout,
                              ffn_dropout=args.mt_ffn_dropout, ctx=ctx)
    src_bert.load_parameters(args.bert_model_params_path, ctx=ctx)
    mt_model.load_parameters(args.mt_model_params_path, ctx=ctx)

    src_bert_tokenizer = BERTTokenizer(src_vocab)
    tgt_bert_tokenizer = BERTTokenizer(tgt_vocab)
    while True:
        src = input("input:")
        src = src_bert_tokenizer(src)
        src = [src_vocab.cls_token] + src + [src_vocab.sep_token]
        src_valid_len = len(src)
        if args.max_src_len and len(src) > args.max_src_len:
            src = src[0:args.max_src_len]
        tgt = [BOS]
        src = src_vocab[src]
        tgt = tgt_vocab[tgt]
        tgt = nd.array([tgt], ctx=ctx)
        src = nd.array([src], ctx=ctx)
        src_valid_len = nd.array([src_valid_len], ctx=ctx)
        src_token_types = nd.zeros_like(src)
        beam_size = 6
        src_bert_outputs = src_bert(src, src_token_types, src_valid_len)
        mt_outputs = mt_model(src_bert_outputs, src, tgt)
        src_bert_outputs = nd.broadcast_axes(src_bert_outputs, axis=0,
                                             size=beam_size)
        src = nd.broadcast_axes(src, axis=0, size=beam_size)
        targets = None
        for n in range(0, args.max_tgt_len):
            tgt, targets = beam_search(mt_outputs[:, n, :], targets=targets,
                                       max_seq_len=args.max_tgt_len, ctx=ctx,
                                       beam_width=beam_size)
            mt_outputs = mt_model(src_bert_outputs, src, tgt)
        predict = tgt.asnumpy().tolist()
        predict_strs = []
        for pred in predict:
            predict_token = [tgt_vocab.idx_to_token[int(idx)] for idx in pred]
            predict_str = ""
            sub_token = []
            for token in predict_token:
                # if token in ["[CLS]", EOS, "[SEP]"]:
                #     continue
                if len(sub_token) == 0:
                    sub_token.append(token)
                elif token[:2] != "##" and len(sub_token) != 0:
                    predict_str += "".join(sub_token) + " "
                    sub_token = [token]
                else:
                    if token[:2] == "##":
                        token = token.replace("##", "")
                    sub_token.append(token)
                if token == EOS:
                    if len(sub_token) != 0:
                        predict_str += "".join(sub_token) + " "
                    break
            predict_strs.append(predict_str.replace("[SEP]", "")
                                .replace("[CLS]", "").replace(EOS, ""))
        for predict_str in predict_strs:
            print(predict_str)
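# The '##' merging loops in the two translate() variants above can be read as
# a small standalone helper; a sketch of the equivalent core logic
# (illustrative only, and ignoring the EOS/special-token handling of the
# originals):
def merge_wordpieces(tokens):
    """Merge WordPiece continuations ('##xx') back into whole words."""
    words = []
    for token in tokens:
        if token.startswith('##') and words:
            words[-1] += token[2:]
        else:
            words.append(token)
    return words

# merge_wordpieces(['jack', '##son', '##ville', '?']) -> ['jacksonville', '?']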
if __name__ == '__main__':
    # random seed
    seed = args.seed
    np.random.seed(seed)
    random.seed(seed)
    mx.random.seed(seed)

    ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
        [mx.gpu(int(x)) for x in args.gpus.split(',')]

    model, nsp_loss, mlm_loss, vocabulary = get_model(ctx)

    lower = 'uncased' in args.dataset_name
    tokenizer = BERTTokenizer(vocabulary, lower=lower)

    store = mx.kv.create(args.kvstore)

    if args.ckpt_dir:
        ckpt_dir = os.path.expanduser(args.ckpt_dir)
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

    if args.data:
        data_train = get_dataset(args.data, args.batch_size, len(ctx), True,
                                 store)
        train(data_train, model, nsp_loss, mlm_loss, len(tokenizer.vocab),
              ctx, store)
    if args.data_eval:
        data_eval = get_dataset(args.data_eval, args.batch_size_eval,
                                len(ctx), False, store)
def __init__(self, ch_vocab=None, max_seq_len=None, istrain=True):
    self.ch_vocab = ch_vocab
    self.max_seq_len = max_seq_len
    self.istrain = istrain
    # The BERT tokenizer is not actually used later on; results seemed
    # better without it.
    self.tokenizer = BERTTokenizer(ch_vocab)