Example #1
    def train_model(self, config, reporter):
        """Primary training call used for model training/evaluation by autogluon model selection
        or for training one off models.
        
        Parameters:
            config (:class:`tmnt.configuration.TMNTConfigBOW`): TMNT configuration for bag-of-words models
            reporter (:class:`autogluon.core.scheduler.reporter.Reporter`): object for reporting model evaluations to scheduler
        
        Returns:
            (tuple): Tuple containing:
                - model (:class:`tmnt.estimator.SeqBowEstimator`): variational BERT encoder-decoder model with trained parameters
                - obj (float): scaled objective value of the fit model
                - v_res: validation results, including NPMI coherence, perplexity on the validation data, and redundancy of the top 5 terms of each topic
        """
        ctx_list = self._get_mxnet_visible_gpus() if self.use_gpu else [mx.cpu()]
        ctx = ctx_list[0]
        vectorizer = TMNTVectorizer(vocab_size=4000,
                                    text_key="text",
                                    label_key="label")
        _, _ = vectorizer.fit_transform_json(self.train_data_or_path)
        classes = list(vectorizer.label_map) if config.use_labels else None
        tr_ds = JsonlDataset(self.train_data_or_path,
                             txt_key="text",
                             label_key="label")
        val_ds = JsonlDataset(self.test_data_or_path,
                              txt_key="text",
                              label_key="label")
        aux_ds = JsonlDataset(
            self.aux_data_or_path, txt_key="text",
            label_key="label") if self.aux_data_or_path else None

        bert_model_name = config.bert_model_name
        bert_dataset = config.bert_dataset
        batch_size = config.batch_size
        max_seq_len = config.max_seq_len

        tr_dataset, val_dataset, num_examples, bert_base, bert_vocab  = \
            get_bert_datasets(classes, vectorizer, tr_ds, val_ds, batch_size, max_seq_len, aux_ds = aux_ds,
                              bert_model_name=bert_model_name, bert_dataset=bert_dataset, ctx=ctx)
        n_labels = len(classes) if classes else 0
        logging.info('Number of labels: {}'.format(n_labels))
        logging.info('Number of examples: {}'.format(num_examples))
        seq_ved_estimator = SeqBowEstimator.from_config(
            config,
            bert_base,
            vectorizer.get_vocab(),
            n_labels=n_labels,
            log_interval=self.log_interval,
            reporter=reporter,
            ctx=ctx)
        obj, v_res = \
            seq_ved_estimator.fit_with_validation(tr_dataset, val_dataset, num_examples, aux_data=(aux_ds is not None))
        return seq_ved_estimator, obj, v_res
Example #2
import numpy as np
import mxnet as mx
from mxnet.gluon.data import ArrayDataset
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.estimator import SeqBowEstimator
from tmnt.data_loading import get_bert_datasets  ## assumed module path; may differ across TMNT versions

## train_data/train_y and dev_data/dev_y are assumed to be prepared earlier (lists of texts and label arrays)
model_name = 'bert_12_768_12'  ## assumed GluonNLP BERT-base model name paired with the dataset below
dataset = 'book_corpus_wiki_en_uncased'
batch_size = 32
seq_len = 64
pad = True
tr_ds = ArrayDataset(train_data, train_y)
dev_ds = ArrayDataset(dev_data, dev_y)

vectorizer = TMNTVectorizer(vocab_size=2000)
vectorizer.fit_transform(train_data)

ctx = mx.cpu() ## or mx.gpu(N) if using GPU device=N

tr_dataset, dev_dataset, num_examples, bert_base, _ = get_bert_datasets(None, vectorizer,
                                                                        tr_ds, dev_ds, batch_size, seq_len,
                                                                        bert_model_name=model_name,
                                                                        bert_dataset=dataset,
                                                                        pad=pad, ctx=ctx)
num_classes = int(np.max(train_y)) + 1

estimator = SeqBowEstimator(bert_base, bert_model_name = model_name, bert_data_name = dataset,
                            n_labels = num_classes,
                            bow_vocab = vectorizer.get_vocab(),
                            optimizer='bertadam',
                            batch_size=batch_size, ctx=ctx, log_interval=1,
                            log_method='print', gamma=1.0, n_latent=20,
                            lr=2e-5, decoder_lr=0.02, epochs=1)

# this will take quite some time without a GPU!
estimator.fit_with_validation(tr_dataset, dev_dataset, num_examples)

Example #3
from tmnt.estimator import BowEstimator, CovariateBowEstimator
import numpy as np
import gluonnlp as nlp
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

n_samples = 2000
n_features = 1000

data, y = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
data_samples = data[:n_samples]
tf_vectorizer = TMNTVectorizer(vocab_size=1000)
X, _ = tf_vectorizer.fit_transform(data_samples)

num_covar_values = int(np.max(y)) + 1  # get the number of possible labels
m_estimator = CovariateBowEstimator(tf_vectorizer.get_vocab(),
                                    num_covar_values)
_ = m_estimator.fit(X, y)  # fit a covariate model using y
m_inferencer = BowVAEInferencer(m_estimator.model)

## the following returns a list of top 5 words per topic per covariate/label
t_terms = m_inferencer.get_top_k_words_per_topic_per_covariate(5)

## top-5 terms for each topic over label/covariate index = 4
cov_4_topics = t_terms[4]
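
The nested structure returned above is indexed first by covariate/label and then by topic, so a short loop is enough to inspect the per-label topics. A minimal sketch, assuming each entry of t_terms[covariate][topic] is a list of term strings:

## print the top-5 terms of every topic for each covariate/label value
for cov_idx, topics in enumerate(t_terms):
    print('--- covariate/label {} ---'.format(cov_idx))
    for topic_idx, terms in enumerate(topics):
        print('topic {}: {}'.format(topic_idx, ', '.join(map(str, terms))))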
Example #4
"""

from tmnt.estimator import BowEstimator
import numpy as np
import gluonnlp as nlp
import os
import umap
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

data, y = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
tf_vectorizer = TMNTVectorizer(vocab_size=1000)
X, _ = tf_vectorizer.fit_transform(data)

num_label_values = int(np.max(y)) + 1  # get the number of possible labels
gamma = 1.0  ## balance the unsupervised and supervised losses
## total loss = topic_loss + gamma * classification_loss

l_estimator = BowEstimator(tf_vectorizer.get_vocab(),
                           n_labels=num_label_values,
                           gamma=gamma)
_ = l_estimator.fit(X, y)  # fit a joint topic + classification model using y
v_results = l_estimator.validate(X, y)
l_inferencer = BowVAEInferencer(l_estimator.model)
embeddings = l_inferencer.get_umap_embeddings(X)
l_inferencer.plot_to(embeddings, y, None)
Example #5
"""

from tmnt.estimator import BowEstimator, LabeledBowEstimator
import numpy as np
import gluonnlp as nlp
import os
import umap
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.inference import BowVAEInferencer

data, y = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
tf_vectorizer = TMNTVectorizer(vocab_size=1000)
X, _ = tf_vectorizer.fit_transform(data)

num_label_values = int(np.max(y)) + 1  # get the number of possible labels
gamma = 1.0  ## balance the unsupervised and supervised losses
## total loss = topic_loss + gamma * classification_loss

l_estimator = LabeledBowEstimator(tf_vectorizer.get_vocab(),
                                  n_labels=num_label_values,
                                  gamma=gamma)
_ = l_estimator.fit(X, y)  # fit a joint topic + classification model using y
v_results = l_estimator.validate(X, y)
l_inferencer = BowVAEInferencer(l_estimator.model)
embeddings = l_inferencer.get_umap_embeddings(X)
l_inferencer.plot_to(embeddings, y, None)
Example #6
import os
from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.estimator import BowEstimator
from tmnt.inference import BowVAEInferencer

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)

tf_vectorizer = TMNTVectorizer(vocab_size=2000)
X, _ = tf_vectorizer.fit_transform(data)

## NOTE: if loading from the .vec files and vocab.txt file generated by prepare_corpus.py, do the following:
## from sklearn.datasets import load_svmlight_file
## from tmnt.data_loading import load_vocab
## X, y = load_svmlight_file('train.vec')
## val_X, val_y = load_svmlight_file('val.vec')
## vocab = load_vocab('vocab.txt')
## estimator = BowEstimator(vocab).fit(X)

estimator = BowEstimator(tf_vectorizer.get_vocab()).fit(X)

inferencer = BowVAEInferencer(estimator.model)
encodings = inferencer.encode_texts(['Greater Armenia would stretch from Karabakh, to the Black Sea, to the Mediterranean, so if you use the term Greater Armenia use it with care.','I have two pairs of headphones I\'d like to sell.  These are excellent, and both in great condition'])

## write out model
os.mkdir('_model_dir')
estimator.write_model('_model_dir') 

## reload model
est2 = BowEstimator.from_config('_model_dir/model.config', '_model_dir/vocab.json', pretrained_param_file='_model_dir/model.params')

## instead of fitting with data; initialize with pretrained values
est2.initialize_with_pretrained()
est2.perplexity(X) # get perplexity
est2.validate(X, None) # get perplexity, NPMI and redundancy
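
Since encode_texts returns one topic-space encoding per input document, the two encodings above can also be compared directly. A minimal sketch using plain numpy cosine similarity, assuming each encoding is a dense 1-D vector:

import numpy as np

## compare the two document encodings in topic space via cosine similarity
v1, v2 = np.array(encodings[0]), np.array(encodings[1])
cos_sim = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
print('cosine similarity between the two documents: {:.3f}'.format(cos_sim))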
Example #7
    vectorizer = TMNTVectorizer(min_doc_size=args.min_doc_length, label_prefix=args.label_prefix_chars,
                       file_pat=args.file_pat,
                       vocab_size=args.vocab_size,
                       json_out_dir=args.json_out_dir,
                       encoding=args.str_encoding,
                       label_min_cnt=args.label_min_cnt,
                       stop_word_file=args.custom_stop_words,
                       count_vectorizer_kwargs=count_vectorizer_kwargs)
    tr_X, tr_y = \
        vectorizer.fit_transform_json_dir(args.tr_input) if os.path.isdir(args.tr_input) else vectorizer.fit_transform_json(args.tr_input)
    vectorizer.write_to_vec_file(tr_X, tr_y, args.tr_vec_file)
    vectorizer.write_vocab(args.vocab_file)
    if args.val_input and args.val_vec_file:
        val_X, val_y = \
            vectorizer.transform_json_dir(args.val_input) if os.path.isdir(args.val_input) else vectorizer.transform_json(args.val_input)
        vectorizer.write_to_vec_file(val_X, val_y, args.val_vec_file)
    if args.tst_input and args.tst_vec_file:
        tst_X, tst_y = \
            vectorizer.transform_json_dir(args.tst_input) if os.path.isdir(args.tst_input) else vectorizer.transform_json(args.tst_input)
        vectorizer.write_to_vec_file(tst_X, tst_y, args.tst_vec_file)
    if args.label_map:
        with io.open(args.label_map, 'w') as fp:
            fp.write(json.dumps(vectorizer.label_map, indent=4))
    if args.full_vocab_histogram:
        import numpy as np
        i_to_t = vectorizer.get_vocab().idx_to_token
        with io.open(args.full_vocab_histogram, 'w') as fp:
            cnts = np.array(tr_X.sum(axis=0)).squeeze()
            for i in list(np.argsort(cnts * -1)):
                fp.write(i_to_t[i] + ' ' + str(cnts[i]) + '\n')
Example #8
File: select_20news.py  Project: mitre/tmnt
import numpy as np
import umap

from sklearn.datasets import fetch_20newsgroups
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.configuration import TMNTConfigBOW
from tmnt.trainer import BowVAETrainer
from tmnt.selector import BaseSelector

data, y = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)

tf_vectorizer = TMNTVectorizer(vocab_size=2000)
X, _ = tf_vectorizer.fit_transform(data)
vocab = tf_vectorizer.get_vocab()

tmnt_config = TMNTConfigBOW(
    'examples/select_model/config.yaml').get_configspace()
selector = BaseSelector(tmnt_config, 8, 'random', 'fifo', 1, 4, False, 1, 1234,
                        '_model_out')

trainer = BowVAETrainer(vocab,
                        X[:8000],
                        X[8000:],
                        log_out_dir='_exps',
                        model_out_dir='_model_out')
selector.select_model(trainer)

n_labels = int(np.max(y)) + 1