def train_model(self, config, reporter):
    """Primary training call used for model training/evaluation by autogluon model selection
    or for training one-off models.

    Parameters:
        config (:class:`tmnt.configuration.TMNTConfigBOW`): TMNT configuration for bag-of-words models
        reporter (:class:`autogluon.core.scheduler.reporter.Reporter`): object for reporting model evaluations to scheduler

    Returns:
        (tuple): Tuple containing:
            - model (:class:`tmnt.estimator.SeqBowEstimator`): variational BERT encoder-decoder model with trained parameters
            - obj (float): scaled objective of the fit model
            - v_res: validation results (NPMI coherence on the validation set, perplexity on
              validation data, and topic redundancy of the top 5 terms for each topic)
    """
    ctx_list = self._get_mxnet_visible_gpus() if self.use_gpu else [mx.cpu()]
    ctx = ctx_list[0]
    vectorizer = TMNTVectorizer(vocab_size=4000, text_key="text", label_key="label")
    _, _ = vectorizer.fit_transform_json(self.train_data_or_path)
    classes = list(vectorizer.label_map) if config.use_labels else None
    tr_ds = JsonlDataset(self.train_data_or_path, txt_key="text", label_key="label")
    val_ds = JsonlDataset(self.test_data_or_path, txt_key="text", label_key="label")
    aux_ds = JsonlDataset(self.aux_data_or_path, txt_key="text", label_key="label") \
        if self.aux_data_or_path else None
    bert_model_name = config.bert_model_name
    bert_dataset = config.bert_dataset
    batch_size = config.batch_size
    max_seq_len = config.max_seq_len
    tr_dataset, val_dataset, num_examples, bert_base, bert_vocab = \
        get_bert_datasets(classes, vectorizer, tr_ds, val_ds, batch_size, max_seq_len,
                          aux_ds=aux_ds, bert_model_name=bert_model_name,
                          bert_dataset=bert_dataset, ctx=ctx)
    n_labels = len(classes) if classes else 0
    logging.info('Number of labels: {}'.format(n_labels))
    logging.info('Number of examples: {}'.format(num_examples))
    seq_ved_estimator = SeqBowEstimator.from_config(config, bert_base, vectorizer.get_vocab(),
                                                    n_labels=n_labels,  # handles the no-label case (classes is None)
                                                    log_interval=self.log_interval,
                                                    reporter=reporter, ctx=ctx)
    obj, v_res = \
        seq_ved_estimator.fit_with_validation(tr_dataset, val_dataset, num_examples,
                                              aux_data=(aux_ds is not None))
    return seq_ved_estimator, obj, v_res
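
# Illustrative sketch only (not from the library): invoking train_model directly for a
# one-off training run outside of autogluon model selection. Here `trainer` is assumed to
# be an already-constructed instance of the class this method belongs to, `config` a single
# sampled configuration, and passing `reporter=None` for scheduler-free use is an assumption.
estimator, obj, v_res = trainer.train_model(config, None)
logging.info('Scaled objective: {}; validation results: {}'.format(obj, v_res))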
def prepare_bert_via_json(json_file, max_len, bow_vocab_size=1000, vectorizer=None,
                          json_text_key="text", json_label_key=None, ctx=mx.cpu()):
    with io.open(json_file, 'r', encoding='utf-8') as fp:
        content = [json.loads(line)[json_text_key] for line in fp]
    x_ids, x_val_lens, x_segs, bert_base, bert_vocab, _ = _load_dataset_bert(content, 0, max_len, ctx)
    tf_vectorizer = vectorizer or TMNTVectorizer(text_key=json_text_key, label_key=json_label_key,
                                                 vocab_size=bow_vocab_size)
    X, y = tf_vectorizer.transform_json(json_file) if vectorizer else tf_vectorizer.fit_transform_json(json_file)
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'),
        mx.nd.sparse.csr_matrix(X, dtype='float32').tostype('default'))
    return data_train, X, tf_vectorizer, bert_base, bert_vocab, y
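
# Usage sketch for prepare_bert_via_json. The path 'train.jsonl' is a placeholder for a
# JSON-lines file in which each line has a "text" field (and optionally a "label" field),
# matching the json_text_key/json_label_key arguments above.
data_train, X, vec, bert_base, bert_vocab, y = \
    prepare_bert_via_json('train.jsonl', max_len=128, bow_vocab_size=2000,
                          json_text_key='text', json_label_key='label')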
def prepare_bert(content, max_len, bow_vocab_size=1000, vectorizer=None, ctx=mx.cpu()):
    """
    Utility function that takes text content (e.g. a list of document strings), a maximum
    sequence length and a BOW vocabulary size, and returns a ``data_train`` dataset that a
    ``SeqBowEstimator`` object can consume in its call to ``fit_with_validation``. Also
    returns the BOW matrix as a SciPy sparse matrix along with the (possibly fitted)
    vectorizer, the BERT base model and the BERT vocabulary.
    """
    x_ids, x_val_lens, x_segs, bert_base, bert_vocab, _ = _load_dataset_bert(content, 0, max_len, ctx)
    tf_vectorizer = vectorizer or TMNTVectorizer(vocab_size=bow_vocab_size)
    X, _ = tf_vectorizer.transform(content) if vectorizer else tf_vectorizer.fit_transform(content)
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'),
        mx.nd.sparse.csr_matrix(X, dtype='float32').tostype('default'))
    return data_train, X, tf_vectorizer, bert_base, bert_vocab
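
# Usage sketch for prepare_bert with an in-memory list of documents (the strings are
# placeholders). The returned data_train is the dataset a SeqBowEstimator would consume in
# fit_with_validation, as described in the docstring above.
docs = ["first example document for the topic model",
        "a second, slightly longer example document"]
data_train, X, vec, bert_base, bert_vocab = prepare_bert(docs, max_len=64, bow_vocab_size=1000)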
def __init__(self, model, bert_vocab, max_length, bow_vocab=None, pre_vectorizer=None, ctx=mx.cpu()):
    super().__init__(ctx)
    self.model = model
    self.bert_base = model.bert
    self.tokenizer = BERTTokenizer(bert_vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, max_length, pair=False)
    self.bow_vocab = bow_vocab
    self.vectorizer = pre_vectorizer or TMNTVectorizer(initial_vocabulary=bow_vocab)
def prepare_dataset_sequence(content, max_len, labels=None, tokenizer=None, bow_vocab_size=1000,
                             vectorizer=None, ctx=mx.cpu()):
    tf_vectorizer = vectorizer or TMNTVectorizer(vocab_size=bow_vocab_size)
    if tokenizer is None:
        tokenizer = nlp.data.SacreMosesTokenizer()
    X, _ = tf_vectorizer.transform(content) if vectorizer else tf_vectorizer.fit_transform(content)
    vocab = tf_vectorizer.get_vocab()
    #x_ids, x_val_lens = _load_dataset_sequence(content, max_len, tokenizer, vocab)
    x_ids, x_val_lens = _load_bow_identical_sequence(X, max_len)
    if labels is not None:
        larr = mx.nd.array(labels, dtype='float32')
    else:
        larr = mx.nd.full(len(x_ids), -1)
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.sparse.csr_matrix(X, dtype='float32').tostype('default'),
        larr)
    return data_train, tf_vectorizer.get_vocab(), X
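
# Usage sketch for prepare_dataset_sequence. Labels are optional; when omitted the label
# column is filled with -1, as in the implementation above.
docs = ["one short document", "another short document"]
seq_train, bow_vocab, X_bow = prepare_dataset_sequence(docs, max_len=32, bow_vocab_size=500)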
class BowVAEInferencer(BaseInferencer):
    """Inferencer for trained bag-of-words VAE topic models."""

    def __init__(self, model, ctx=mx.cpu()):
        super().__init__(ctx)
        self.max_batch_size = 16
        self.vocab = model.vocabulary
        self.vectorizer = TMNTVectorizer(initial_vocabulary=model.vocabulary)
        self.n_latent = model.n_latent
        self.model = model
        if isinstance(model, CovariateBowVAEModel):
            self.covar_model = True
            self.n_covars = model.n_covars
            self.covar_net_layers = model.covar_net_layers
        else:
            self.covar_model = False

    @classmethod
    def from_saved(cls, param_file=None, config_file=None, vocab_file=None, model_dir=None, ctx=mx.cpu()):
        if model_dir is not None:
            param_file = os.path.join(model_dir, 'model.params')
            vocab_file = os.path.join(model_dir, 'vocab.json')
            config_file = os.path.join(model_dir, 'model.config')
        with open(config_file) as f:
            config = json.loads(f.read())
        with open(vocab_file) as f:
            voc_js = f.read()
        vocab = nlp.Vocab.from_json(voc_js)
        n_latent = config['n_latent']
        enc_dim = config['enc_hidden_dim']
        lat_distrib = config['latent_distribution']['dist_type']
        n_encoding_layers = config.get('num_enc_layers', 0)
        enc_dr = float(config.get('enc_dr', 0.0))
        emb_size = config['derived_info']['embedding_size']
        if 'n_covars' in config:
            n_covars = config['n_covars']
            covar_net_layers = config.get('covar_net_layers')
            model = CovariateBowVAEModel(covar_net_layers,
                                         enc_dim, emb_size, n_encoding_layers, enc_dr, False,
                                         vocabulary=vocab, n_covars=n_covars,
                                         latent_distrib=lat_distrib, n_latent=n_latent, ctx=ctx)
        else:
            model = BowVAEModel(enc_dim, emb_size, n_encoding_layers, enc_dr, False,
                                vocabulary=vocab, latent_distrib=lat_distrib,
                                n_latent=n_latent, ctx=ctx)
        model.load_parameters(str(param_file), allow_missing=False)
        return cls(model, ctx)

    def get_model_details(self, sp_vec_file):
        data_csr, labels = load_svmlight_file(sp_vec_file)
        data_csr = mx.nd.sparse.csr_matrix(data_csr, dtype='float32')
        ## 1) K x W matrix of P(term|topic) probabilities
        w = self.model.decoder.collect_params().get('weight').data().transpose()  ## (K x W)
        w_pr = mx.nd.softmax(w, axis=1)
        ## 2) D x K matrix over the test data of topic probabilities
        covars = labels if self.covar_model else None
        dt_matrix = self.encode_data(data_csr, covars, use_probs=True)
        ## 3) D-length vector of document sizes
        doc_lengths = data_csr.sum(axis=1)
        ## 4) vocab (in same order as W columns)
        ## 5) frequency of each word w_i \in W over the test corpus
        term_cnts = data_csr.sum(axis=0)
        return w_pr, dt_matrix, doc_lengths, term_cnts

    def get_pyldavis_details(self, sp_vec_file):
        w_pr, dt_matrix, doc_lengths, term_cnts = self.get_model_details(sp_vec_file)
        d1 = w_pr.asnumpy().tolist()
        d2 = list(map(lambda x: x.asnumpy().tolist(), dt_matrix))
        d3 = doc_lengths.asnumpy().tolist()
        d5 = term_cnts.asnumpy().tolist()
        d4 = list(map(lambda i: self.vocab.idx_to_token[i], range(len(self.vocab.idx_to_token))))
        d = {'topic_term_dists': d1, 'doc_topic_dists': d2, 'doc_lengths': d3,
             'vocab': d4, 'term_frequency': d5}
        return d

    def get_umap_embeddings(self, data, umap_metric='euclidean'):
        encs = self.encode_data(data, None)
        encs2 = np.array([enc.asnumpy() for enc in encs])
        um = umap.UMAP(n_neighbors=4, min_dist=0.1, metric=umap_metric)
        return um.fit_transform(encs2)

    def plot_to(self, embeddings, labels, f=None):
        import matplotlib.pyplot as plt
        plt.scatter(*embeddings.T, c=labels, s=0.8, alpha=0.9, cmap='coolwarm')
        if f is None:
            plt.show()
        else:
            plt.savefig(f)

    def export_full_model_inference_details(self, sp_vec_file, ofile):
        d = self.get_pyldavis_details(sp_vec_file)
        with io.open(ofile, 'w') as fp:
            json.dump(d, fp, sort_keys=True, indent=4)

    def encode_vec_file(self, sp_vec_file, use_probs=False):
        data_mat, labels = load_svmlight_file(sp_vec_file, n_features=len(self.vocab))
        return self.encode_data(data_mat, labels, use_probs=use_probs), labels

    def encode_texts(self, texts, use_probs=False, include_bn=False):
        X, _ = self.vectorizer.transform(texts)
        encodings = self.encode_data(X, None, use_probs=use_probs, include_bn=include_bn)
        return encodings

    def _get_data_iterator(self, data_mat, labels):
        x_size = data_mat.shape[0] * data_mat.shape[1]
        if x_size <= MAX_DESIGN_MATRIX and isinstance(data_mat, scipy.sparse.csr.csr_matrix):
            data_mat = mx.nd.sparse.csr_matrix(data_mat, dtype='float32')
        elif isinstance(data_mat, mx.nd.NDArray):
            data_mat = mx.nd.array(data_mat, dtype='float32')
        batch_size = min(data_mat.shape[0], self.max_batch_size)
        last_batch_size = data_mat.shape[0] % batch_size
        covars = mx.nd.one_hot(mx.nd.array(labels, dtype='int'), self.n_covars) \
            if self.covar_model and labels is not None else None
        if last_batch_size < 1:
            data_to_iter = data_mat
        else:
            data_to_iter = data_mat[:-last_batch_size]
        if x_size > MAX_DESIGN_MATRIX:
            logging.info("Sparse matrix has total size = {}. Using Sparse Matrix data batcher.".format(x_size))
            if covars is None:
                covars = mx.nd.zeros(data_to_iter.shape[0])
            infer_iter = DataIterLoader(SparseMatrixDataIter(data_to_iter, covars, batch_size,
                                                             last_batch_handle='discard', shuffle=False))
        else:
            infer_iter = DataIterLoader(mx.io.NDArrayIter(data_to_iter, covars, batch_size,
                                                          last_batch_handle='discard', shuffle=False))
        return infer_iter, last_batch_size

    def encode_data(self, data_mat, labels, use_probs=False, include_bn=False):
        infer_iter, last_batch_size = self._get_data_iterator(data_mat, labels)
        encodings = []
        for _, (data, labels) in enumerate(infer_iter):
            data = data.as_in_context(self.ctx)
            if self.covar_model and labels is not None:
                labels = labels.as_in_context(self.ctx)
                encs = self.model.encode_data_with_covariates(data, labels, include_bn=include_bn)
            else:
                encs = self.model.encode_data(data, include_bn=include_bn)
            if use_probs:
                e1 = encs - mx.nd.min(encs, axis=1).expand_dims(1)
                encs = mx.nd.softmax(e1 ** 0.5)
            encodings.extend(encs)
        ## handle the last batch explicitly as NDArrayIter doesn't do that for us
        if last_batch_size > 0:
            last_data = mx.nd.sparse.csr_matrix(data_mat[-last_batch_size:], dtype='float32')
            data = last_data.as_in_context(self.ctx)
            if self.covar_model and labels is not None:
                labels = mx.nd.one_hot(mx.nd.array(labels[-last_batch_size:], dtype='int'),
                                       self.n_covars).as_in_context(self.ctx)
                encs = self.model.encode_data_with_covariates(data, labels)
            else:
                encs = self.model.encode_data(data)
            if use_probs:
                e1 = encs - mx.nd.min(encs, axis=1).expand_dims(1)
                encs = mx.nd.softmax(e1 ** 0.5)
            encodings.extend(encs)
        return encodings

    def get_likelihood_stats(self, data_mat, n_samples=50):
        ## Notes:
        ## Following ideas in the paper:
        ## "Bayesian Autoencoders: Analysing and Fixing the Bernoulli likelihood for Out-of-Distribution Detection"
        ## But that analysis was done on images with less sparsity.
        ## Consider using a Gaussian likelihood here as well to avoid the skewness associated
        ## with the Bernoulli likelihood.
        data_iter, last_batch_size = self._get_data_iterator(data_mat, None)
        all_stats = []
        for _, (data, labels) in enumerate(data_iter):
            elbos = []
            for s in range(0, n_samples):
                elbo, _, _, _, _, _, _ = self.model(data)
                elbos.append(list(elbo.asnumpy()))
            wd_cnts = data.sum(axis=1).asnumpy()
            elbos_np = np.array(elbos) / (wd_cnts + 1)
            elbos_means = list(elbos_np.mean(axis=0))
            elbos_var = list(elbos_np.var(axis=0))
            all_stats.extend(list(zip(elbos_means, elbos_var)))
        return all_stats

    def get_top_k_words_per_topic(self, k):
        sorted_ids = self.model.get_ordered_terms()
        topic_terms = []
        for t in range(self.n_latent):
            top_k = [self.vocab.idx_to_token[int(i)] for i in list(sorted_ids[:k, t])]
            topic_terms.append(top_k)
        return topic_terms

    def get_top_k_words_per_topic_per_covariate(self, k):
        n_topics = self.n_latent
        w = self.model.cov_decoder.cov_inter_decoder.collect_params().get('weight').data()
        n_covars = int(w.shape[1] / n_topics)
        topic_terms = []
        for i in range(n_covars):
            cv_i_slice = w[:, (i * n_topics):((i + 1) * n_topics)]
            sorted_ids = cv_i_slice.argsort(axis=0, is_ascend=False)
            cv_i_terms = []
            for t in range(n_topics):
                top_k = [self.vocab.idx_to_token[int(idx)] for idx in list(sorted_ids[:k, t].asnumpy())]
                cv_i_terms.append(top_k)
            topic_terms.append(cv_i_terms)
        return topic_terms

    def get_top_k_words_per_topic_over_scalar_covariate(self, k, min_v=0.0, max_v=1.0, step=0.1):
        raise NotImplementedError
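
# Usage sketch for BowVAEInferencer. The directory '_model_out' is a placeholder for a
# directory containing model.params, vocab.json and model.config as expected by from_saved
# above; the query text is illustrative only.
inferencer = BowVAEInferencer.from_saved(model_dir='_model_out')
topic_terms = inferencer.get_top_k_words_per_topic(10)             # top-10 terms per topic
encodings = inferencer.encode_texts(["a new document to encode"], use_probs=True)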
data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
train_data = data[:2000]
dev_data = data[-2000:]
train_y = y[:2000]
dev_y = y[-2000:]

model_name = 'bert_12_768_12'
dataset = 'book_corpus_wiki_en_uncased'
batch_size = 32
seq_len = 64
pad = True

tr_ds = ArrayDataset(train_data, train_y)
dev_ds = ArrayDataset(dev_data, dev_y)

vectorizer = TMNTVectorizer(vocab_size=2000)
vectorizer.fit_transform(train_data)

ctx = mx.cpu()  ## or mx.gpu(N) if using GPU device=N

tr_dataset, dev_dataset, num_examples, bert_base, _ = \
    get_bert_datasets(None, vectorizer, tr_ds, dev_ds, batch_size, seq_len,
                      bert_model_name=model_name, bert_dataset=dataset, pad=False, ctx=ctx)

num_classes = int(np.max(y) + 1)

estimator = SeqBowEstimator(bert_base,
                            bert_model_name=model_name,
                            bert_data_name=dataset,
                            n_labels=num_classes,
                            bow_vocab=vectorizer.get_vocab(),
                            optimizer='bertadam',
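# The SeqBowEstimator constructor call above is truncated in the original; the remaining
# keyword arguments are not shown. A hedged sketch of how training might then proceed,
# assuming the call is completed: the datasets returned by get_bert_datasets are passed to
# fit_with_validation, mirroring the trainer's train_model method earlier in this section.
# Left commented out because the constructor call is incomplete here.
# estimator.fit_with_validation(tr_dataset, dev_dataset, num_examples)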
from tmnt.estimator import BowEstimator, CovariateBowEstimator
import numpy as np
import gluonnlp as nlp
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

n_samples = 2000
n_features = 1000

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
data_samples = data[:n_samples]

tf_vectorizer = TMNTVectorizer(vocab_size=n_features)
X, _ = tf_vectorizer.fit_transform(data_samples)

num_covar_values = int(np.max(y)) + 1  # get the number of possible labels
m_estimator = CovariateBowEstimator(tf_vectorizer.get_vocab(), num_covar_values)
_ = m_estimator.fit(X, y)  # fit a covariate model using y

m_inferencer = BowVAEInferencer(m_estimator.model)

## the following returns a list of top 5 words per topic per covariate/label
t_terms = m_inferencer.get_top_k_words_per_topic_per_covariate(5)

## top-5 terms for each topic over the label/covariate with a given index
index = 4
cov_4_topics = t_terms[index]
""" from tmnt.estimator import BowEstimator import numpy as np import gluonnlp as nlp import os import umap from sklearn.datasets import fetch_20newsgroups from tmnt.preprocess.vectorizer import TMNTVectorizer from tmnt.inference import BowVAEInferencer data, y = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'), return_X_y=True) tf_vectorizer = TMNTVectorizer(vocab_size=1000) X, _ = tf_vectorizer.fit_transform(data) num_label_values = int(np.max(y)) + 1 # get the number of possible labels gamma = 1.0 ## balanced unsupervised and supservised losses ## total loss = topic_loss + gamma * classification_loss l_estimator = BowEstimator(tf_vectorizer.get_vocab(), n_labels=num_label_values, gamma=gamma) _ = l_estimator.fit(X, y) # fit a joint topic + classification model using y v_results = l_estimator.validate(X, y) l_inferencer = BowVAEInferencer(l_estimator.model) embeddings = l_inferencer.get_umap_embeddings(X) l_inferencer.plot_to(embeddings, y, None)
parser.add_argument('--label_prefix_chars', type=int, help='Use first N characters of label', default=-1)
parser.add_argument('--str_encoding', type=str, help='String/file encoding to use', default='utf-8')
parser.add_argument('--log_dir', type=str, help='Logging directory', default='.')

args = parser.parse_args()

if __name__ == '__main__':
    logging_config(folder=args.log_dir, name='vectorizer', level='info')
    if args.vocab_file is None:
        raise Exception("Vocabulary output file name/path must be provided")
    vectorizer = \
        TMNTVectorizer(text_key=args.json_text_key,
                       custom_stop_word_file=args.custom_stop_words,
                       label_key=args.json_label_key,
                       min_doc_size=args.min_doc_length,
                       label_prefix=args.label_prefix_chars,
                       json_out_dir=args.json_out_dir,
                       vocab_size=args.vocab_size,
                       encoding=args.str_encoding)
    vectorizer.fit_transform_in_place_json(args.tr_input_file)
    vectorizer.write_vocab(args.vocab_file)
    if args.val_input_file:
        vectorizer.transform_in_place_json(args.val_input_file)
    if args.tst_input_file:
        vectorizer.transform_in_place_json(args.tst_input_file)
if (args.tr_vec_file is None) or (args.tr_input is None):
    raise Exception("Training directory and output vector file must be provided")

tok_pattern = args.token_pattern or r'(?u)\b\w\w+\b'
count_vectorizer_kwargs = {'token_pattern': tok_pattern, 'stop_words': 'english',
                           'max_df': 0.95, 'min_df': 2}
vectorizer = \
    TMNTVectorizer(text_key=args.json_text_key,
                   label_key=args.json_label_key,
                   min_doc_size=args.min_doc_length,
                   label_prefix=args.label_prefix_chars,
                   file_pat=args.file_pat,
                   vocab_size=args.vocab_size,
                   json_out_dir=args.json_out_dir,
                   encoding=args.str_encoding,
                   label_min_cnt=args.label_min_cnt,
                   stop_word_file=args.custom_stop_words,
                   count_vectorizer_kwargs=count_vectorizer_kwargs)
tr_X, tr_y = \
    vectorizer.fit_transform_json_dir(args.tr_input) if os.path.isdir(args.tr_input) \
    else vectorizer.fit_transform_json(args.tr_input)
vectorizer.write_to_vec_file(tr_X, tr_y, args.tr_vec_file)
vectorizer.write_vocab(args.vocab_file)
if args.val_input and args.val_vec_file:
    val_X, val_y = \
        vectorizer.transform_json_dir(args.val_input) if os.path.isdir(args.val_input) \
        else vectorizer.transform_json(args.val_input)
    vectorizer.write_to_vec_file(val_X, val_y, args.val_vec_file)
if args.tst_input and args.tst_vec_file:
    tst_X, tst_y = \
        vectorizer.transform_json_dir(args.tst_input) if os.path.isdir(args.tst_input) \
        else vectorizer.transform_json(args.tst_input)
    vectorizer.write_to_vec_file(tst_X, tst_y, args.tst_vec_file)
import gluonnlp as nlp
import os
import umap
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.configuration import TMNTConfigBOW
from tmnt.trainer import BowVAETrainer
from tmnt.selector import BaseSelector

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)

tf_vectorizer = TMNTVectorizer(vocab_size=2000)
X, _ = tf_vectorizer.fit_transform(data)
vocab = tf_vectorizer.get_vocab()

tmnt_config = TMNTConfigBOW('examples/select_model/config.yaml').get_configspace()
selector = BaseSelector(tmnt_config, 8, 'random', 'fifo', 1, 4, False, 1, 1234, '_model_out')
trainer = BowVAETrainer(vocab, X[:8000], X[8000:], log_out_dir='_exps', model_out_dir='_model_out')
selector.select_model(trainer)