"""Example: covariate topic model on 20 Newsgroups.

Fits a CovariateBowEstimator using the newsgroup label as the covariate,
then pulls the top-k words per topic for each covariate value.
NOTE(review): this block was reconstructed from a whitespace-mangled
one-line paste; line breaks restored at the obvious statement/comment
boundaries -- confirm against the original example file.
"""
from tmnt.estimator import BowEstimator, CovariateBowEstimator
import numpy as np
import gluonnlp as nlp
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

n_samples = 2000
n_features = 1000

# Fetch the corpus with boilerplate (headers/footers/quotes) removed;
# y holds the integer newsgroup label for each document.
data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
data_samples = data[:n_samples]

# Use n_features here (was a hard-coded 1000) so the vocabulary size and
# the declared constant cannot drift apart.
tf_vectorizer = TMNTVectorizer(vocab_size=n_features)
X, _ = tf_vectorizer.fit_transform(data_samples)

num_covar_values = int(np.max(y)) + 1  # get the number of possible labels

m_estimator = CovariateBowEstimator(tf_vectorizer.get_vocab(), num_covar_values)
_ = m_estimator.fit(X, y)  # fit a covariate model using y

m_inferencer = BowVAEInferencer(m_estimator.model)

## the following returns a list of top 5 words per topic per covariate/label
t_terms = m_inferencer.get_top_k_words_per_topic_per_covariate(5)

## top-5 terms for each topic over label/covariate index = 4
cov_4_topics = t_terms[4]
# NOTE(review): truncated fragment of a SeqBowEstimator (BERT + bag-of-words)
# example -- it begins mid-call (the head of the fetch_20newsgroups(...) call
# is outside this chunk) and ends mid-call (the SeqBowEstimator(...) argument
# list is cut off after log_interval=1). Left byte-identical; restore the
# missing head/tail from the full file before editing.
# NOTE(review): `pad = True` is assigned but get_bert_datasets is called with
# pad=False -- looks inconsistent; confirm which is intended against the
# original example.
remove=('headers', 'footers', 'quotes'), return_X_y=True) train_data = data[:2000] dev_data = data[-2000:] train_y = y[:2000] dev_y = y[-2000:] model_name = 'bert_12_768_12' dataset = 'book_corpus_wiki_en_uncased' batch_size = 32 seq_len = 64 pad = True tr_ds = ArrayDataset(train_data, train_y) dev_ds = ArrayDataset(dev_data, dev_y) vectorizer = TMNTVectorizer(vocab_size=2000) vectorizer.fit_transform(train_data) ctx = mx.cpu() ## or mx.gpu(N) if using GPU device=N tr_dataset, dev_dataset, num_examples, bert_base, _ = get_bert_datasets(None, vectorizer, tr_ds, dev_ds, batch_size, seq_len, bert_model_name=model_name, bert_dataset=dataset, pad=False, ctx=ctx) num_classes = int(np.max(y) + 1) estimator = SeqBowEstimator(bert_base, bert_model_name = model_name, bert_data_name = dataset, n_labels = num_classes, bow_vocab = vectorizer.get_vocab(), optimizer='bertadam', batch_size=batch_size, ctx=ctx, log_interval=1,
""" from tmnt.estimator import BowEstimator import numpy as np import gluonnlp as nlp import os import umap from sklearn.datasets import fetch_20newsgroups from tmnt.preprocess.vectorizer import TMNTVectorizer from tmnt.inference import BowVAEInferencer data, y = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'), return_X_y=True) tf_vectorizer = TMNTVectorizer(vocab_size=1000) X, _ = tf_vectorizer.fit_transform(data) num_label_values = int(np.max(y)) + 1 # get the number of possible labels gamma = 1.0 ## balanced unsupervised and supservised losses ## total loss = topic_loss + gamma * classification_loss l_estimator = BowEstimator(tf_vectorizer.get_vocab(), n_labels=num_label_values, gamma=gamma) _ = l_estimator.fit(X, y) # fit a joint topic + classification model using y v_results = l_estimator.validate(X, y) l_inferencer = BowVAEInferencer(l_estimator.model) embeddings = l_inferencer.get_umap_embeddings(X) l_inferencer.plot_to(embeddings, y, None)