def train_model(self, config, reporter):
    """Primary training call used for model training/evaluation by autogluon model selection or for training one-off models.

    Parameters:
        config (:class:`tmnt.configuration.TMNTConfigBOW`): TMNT configuration for bag-of-words models
        reporter (:class:`autogluon.core.scheduler.reporter.Reporter`): object for reporting model evaluations to scheduler

    Returns:
        (tuple): Tuple containing:
            - model (:class:`tmnt.estimator.SeqBowEstimator`): variational BERT encoder-decoder model with trained parameters
            - obj (float): scaled objective of the fit model
            - v_res: validation results (NPMI coherence on the validation set, perplexity on validation data, and topic redundancy of the top 5 terms for each topic)
    """
    ctx_list = self._get_mxnet_visible_gpus() if self.use_gpu else [mx.cpu()]
    ctx = ctx_list[0]
    vectorizer = TMNTVectorizer(vocab_size=4000, text_key="text", label_key="label")
    _, _ = vectorizer.fit_transform_json(self.train_data_or_path)
    classes = list(vectorizer.label_map) if config.use_labels else None
    tr_ds = JsonlDataset(self.train_data_or_path, txt_key="text", label_key="label")
    val_ds = JsonlDataset(self.test_data_or_path, txt_key="text", label_key="label")
    aux_ds = JsonlDataset(self.aux_data_or_path, txt_key="text", label_key="label") if self.aux_data_or_path else None
    bert_model_name = config.bert_model_name
    bert_dataset = config.bert_dataset
    batch_size = config.batch_size
    max_seq_len = config.max_seq_len
    tr_dataset, val_dataset, num_examples, bert_base, bert_vocab = \
        get_bert_datasets(classes, vectorizer, tr_ds, val_ds, batch_size, max_seq_len,
                          aux_ds=aux_ds, bert_model_name=bert_model_name,
                          bert_dataset=bert_dataset, ctx=ctx)
    n_labels = len(classes) if classes else 0
    logging.info('Number of labels: {}'.format(n_labels))
    logging.info('Number of examples: {}'.format(num_examples))
    seq_ved_estimator = SeqBowEstimator.from_config(config, bert_base, vectorizer.get_vocab(),
                                                    n_labels=n_labels,
                                                    log_interval=self.log_interval,
                                                    reporter=reporter, ctx=ctx)
    obj, v_res = \
        seq_ved_estimator.fit_with_validation(tr_dataset, val_dataset, num_examples,
                                              aux_data=(aux_ds is not None))
    return seq_ved_estimator, obj, v_res
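## Usage sketch (hypothetical, not part of the library): `trainer`, `config` and `reporter`
## stand in for an instance of the class defining train_model above and the objects the
## autogluon scheduler would supply, as described in the docstring:
##
##     model, obj, v_res = trainer.train_model(config, reporter)
##     logging.info('scaled objective: {}; validation results: {}'.format(obj, v_res))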
import mxnet as mx
import numpy as np
from mxnet.gluon.data import ArrayDataset
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.data_loading import get_bert_datasets   # module path assumed for this TMNT version
from tmnt.estimator import SeqBowEstimator

## train_data/train_y and dev_data/dev_y are assumed to be pre-loaded lists of texts and integer labels
model_name = 'bert_12_768_12'   # assumed GluonNLP BERT base model name
dataset = 'book_corpus_wiki_en_uncased'
batch_size = 32
seq_len = 64
pad = True
tr_ds = ArrayDataset(train_data, train_y)
dev_ds = ArrayDataset(dev_data, dev_y)
vectorizer = TMNTVectorizer(vocab_size=2000)
vectorizer.fit_transform(train_data)
ctx = mx.cpu()  ## or mx.gpu(N) if using GPU device N
tr_dataset, dev_dataset, num_examples, bert_base, _ = \
    get_bert_datasets(None, vectorizer, tr_ds, dev_ds, batch_size, seq_len,
                      bert_model_name=model_name, bert_dataset=dataset, pad=pad, ctx=ctx)
num_classes = int(np.max(train_y) + 1)
estimator = SeqBowEstimator(bert_base, bert_model_name=model_name, bert_data_name=dataset,
                            n_labels=num_classes, bow_vocab=vectorizer.get_vocab(),
                            optimizer='bertadam', batch_size=batch_size, ctx=ctx,
                            log_interval=1, log_method='print', gamma=1.0, n_latent=20,
                            lr=2e-5, decoder_lr=0.02, epochs=1)
# this will take quite some time without a GPU!
estimator.fit_with_validation(tr_dataset, dev_dataset, num_examples)
from tmnt.estimator import BowEstimator, CovariateBowEstimator
import numpy as np
import gluonnlp as nlp
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

n_samples = 2000
n_features = 1000
data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'), return_X_y=True)
data_samples = data[:n_samples]
y_samples = y[:n_samples]   # keep labels aligned with the sampled documents
tf_vectorizer = TMNTVectorizer(vocab_size=1000)
X, _ = tf_vectorizer.fit_transform(data_samples)
num_covar_values = int(np.max(y)) + 1  # get the number of possible labels
m_estimator = CovariateBowEstimator(tf_vectorizer.get_vocab(), num_covar_values)
_ = m_estimator.fit(X, y_samples)  # fit a covariate model using the labels
m_inferencer = BowVAEInferencer(m_estimator.model)
## the following returns a list of top 5 words per topic per covariate/label
t_terms = m_inferencer.get_top_k_words_per_topic_per_covariate(5)
index = 4
cov_4_topics = t_terms[index]
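## Follow-up sketch (not in the original example): assuming t_terms is nested as
## [covariate][topic] -> list of terms, print the top terms for the selected covariate.
for topic_idx, terms in enumerate(cov_4_topics):
    print('Covariate {} / Topic {}: {}'.format(index, topic_idx, ' '.join(terms)))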
""" from tmnt.estimator import BowEstimator import numpy as np import gluonnlp as nlp import os import umap from sklearn.datasets import fetch_20newsgroups from tmnt.preprocess.vectorizer import TMNTVectorizer from tmnt.inference import BowVAEInferencer data, y = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'), return_X_y=True) tf_vectorizer = TMNTVectorizer(vocab_size=1000) X, _ = tf_vectorizer.fit_transform(data) num_label_values = int(np.max(y)) + 1 # get the number of possible labels gamma = 1.0 ## balanced unsupervised and supservised losses ## total loss = topic_loss + gamma * classification_loss l_estimator = BowEstimator(tf_vectorizer.get_vocab(), n_labels=num_label_values, gamma=gamma) _ = l_estimator.fit(X, y) # fit a joint topic + classification model using y v_results = l_estimator.validate(X, y) l_inferencer = BowVAEInferencer(l_estimator.model) embeddings = l_inferencer.get_umap_embeddings(X) l_inferencer.plot_to(embeddings, y, None)
""" from tmnt.estimator import BowEstimator, LabeledBowEstimator import numpy as np import gluonnlp as nlp import os import umap from sklearn.datasets import fetch_20newsgroups from tmnt.preprocess.vectorizer import TMNTVectorizer from tmnt.inference import BowVAEInferencer data, y = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'), return_X_y=True) tf_vectorizer = TMNTVectorizer(vocab_size=1000) X, _ = tf_vectorizer.fit_transform(data) num_label_values = int(np.max(y)) + 1 # get the number of possible labels gamma = 1.0 ## balanced unsupervised and supservised losses ## total loss = topic_loss + gamma * classification_loss l_estimator = LabeledBowEstimator(tf_vectorizer.get_vocab(), n_labels=num_label_values, gamma=gamma) _ = l_estimator.fit(X, y) # fit a covariate model using y v_results = l_estimator.validate(X, y) l_inferencer = BowVAEInferencer(l_estimator.model) embeddings = l_inferencer.get_umap_embeddings(X) l_inferencer.plot_to(embeddings, y, f)
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.estimator import BowEstimator
from tmnt.inference import BowVAEInferencer

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'), return_X_y=True)
tf_vectorizer = TMNTVectorizer(vocab_size=2000)
X, _ = tf_vectorizer.fit_transform(data)

## NOTE: if loading from the .vec files and vocab.txt file generated by prepare_corpus.py, do the following:
## from sklearn.datasets import load_svmlight_file
## from tmnt.data_loading import load_vocab
## X, y = load_svmlight_file('train.vec')
## val_X, val_y = load_svmlight_file('val.vec')
## vocab = load_vocab('vocab.txt')
## estimator = BowEstimator(vocab).fit(X)

estimator = BowEstimator(tf_vectorizer.get_vocab()).fit(X)
inferencer = BowVAEInferencer(estimator.model)
encodings = inferencer.encode_texts(
    ['Greater Armenia would stretch from Karabakh, to the Black Sea, to the Mediterranean, so if you use the term Greater Armenia use it with care.',
     'I have two pairs of headphones I\'d like to sell. These are excellent, and both in great condition'])

## write out model
os.mkdir('_model_dir')
estimator.write_model('_model_dir')
## reload model
est2 = BowEstimator.from_config('_model_dir/model.config', '_model_dir/vocab.json',
                                pretrained_param_file='_model_dir/model.params')
## instead of fitting with data, initialize with pretrained values
est2.initialize_with_pretrained()
est2.perplexity(X)      # get perplexity
est2.validate(X, None)  # get perplexity, NPMI and redundancy
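## Follow-up sketch (assumption): the reloaded estimator can back a BowVAEInferencer just
## like the freshly fit one above, e.g. to encode new documents with the saved model.
inferencer2 = BowVAEInferencer(est2.model)
new_encodings = inferencer2.encode_texts(['A new document to encode with the reloaded model.'])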
vectorizer = TMNTVectorizer(min_doc_size=args.min_doc_length,   # constructor head assumed; any earlier keyword arguments are elided
                            label_prefix=args.label_prefix_chars,
                            file_pat=args.file_pat,
                            vocab_size=args.vocab_size,
                            json_out_dir=args.json_out_dir,
                            encoding=args.str_encoding,
                            label_min_cnt=args.label_min_cnt,
                            stop_word_file=args.custom_stop_words,
                            count_vectorizer_kwargs=count_vectorizer_kwargs)
tr_X, tr_y = \
    vectorizer.fit_transform_json_dir(args.tr_input) if os.path.isdir(args.tr_input) \
    else vectorizer.fit_transform_json(args.tr_input)
vectorizer.write_to_vec_file(tr_X, tr_y, args.tr_vec_file)
vectorizer.write_vocab(args.vocab_file)
if args.val_input and args.val_vec_file:
    val_X, val_y = \
        vectorizer.transform_json_dir(args.val_input) if os.path.isdir(args.val_input) \
        else vectorizer.transform_json(args.val_input)
    vectorizer.write_to_vec_file(val_X, val_y, args.val_vec_file)
if args.tst_input and args.tst_vec_file:
    tst_X, tst_y = \
        vectorizer.transform_json_dir(args.tst_input) if os.path.isdir(args.tst_input) \
        else vectorizer.transform_json(args.tst_input)
    vectorizer.write_to_vec_file(tst_X, tst_y, args.tst_vec_file)
if args.label_map:
    with io.open(args.label_map, 'w') as fp:
        fp.write(json.dumps(vectorizer.label_map, indent=4))
if args.full_vocab_histogram:
    import numpy as np
    i_to_t = vectorizer.get_vocab().idx_to_token
    with io.open(args.full_vocab_histogram, 'w') as fp:
        cnts = np.array(tr_X.sum(axis=0)).squeeze()
        for i in list(np.argsort(cnts * -1)):
            fp.write(i_to_t[i] + ' ' + str(cnts[i]) + '\n')
import numpy as np
import umap
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.configuration import TMNTConfigBOW
from tmnt.trainer import BowVAETrainer
from tmnt.selector import BaseSelector

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'), return_X_y=True)
tf_vectorizer = TMNTVectorizer(vocab_size=2000)
X, _ = tf_vectorizer.fit_transform(data)
vocab = tf_vectorizer.get_vocab()
tmnt_config = TMNTConfigBOW('examples/select_model/config.yaml').get_configspace()
selector = BaseSelector(tmnt_config, 8, 'random', 'fifo', 1, 4, False, 1, 1234, '_model_out')
trainer = BowVAETrainer(vocab, X[:8000], X[8000:], log_out_dir='_exps', model_out_dir='_model_out')
selector.select_model(trainer)
n_labels = int(np.max(y)) + 1
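## Sketch (assumption): if the best model found by the selector is written to '_model_out'
## in the same layout used by BowEstimator.write_model above (model.config, vocab.json,
## model.params), it could be reloaded with the from_config pattern shown earlier.
from tmnt.estimator import BowEstimator
best_est = BowEstimator.from_config('_model_out/model.config', '_model_out/vocab.json',
                                    pretrained_param_file='_model_out/model.params')
best_est.initialize_with_pretrained()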