def __init__(self, log_out_dir, model_out_dir, vocabulary, wd_freqs,
             train_data_path, test_data_path, coherence_via_encoder=False,
             pretrained_param_file=None, topic_seed_file=None,
             use_labels_as_covars=False, use_gpu=False, n_covars=None,
             val_each_epoch=True, rng_seed=1234):
    super().__init__(vocabulary, train_data_path, test_data_path,
                     val_each_epoch, rng_seed)
    if not log_utils.CONFIGURED:
        logging_config(folder=log_out_dir, name='tmnt', level='info',
                       console_level='info')
    self.log_out_dir = log_out_dir
    self.model_out_dir = model_out_dir
    self.use_gpu = use_gpu
    self.wd_freqs = wd_freqs
    self.seed_matrix = None
    self.pretrained_param_file = pretrained_param_file
    self.n_covars = n_covars
    self.use_labels_as_covars = use_labels_as_covars
    self.coherence_via_encoder = coherence_via_encoder
    if topic_seed_file:
        # `ctx` was referenced here but never defined in this scope; deriving
        # it from `use_gpu` is an assumption, not confirmed by the source.
        ctx = mx.gpu(0) if use_gpu else mx.cpu()
        self.seed_matrix = get_seed_matrix_from_file(topic_seed_file,
                                                     vocabulary, ctx)
def model_select_bow_vae(c_args):
    logging_config(folder=c_args.save_dir, name='tmnt',
                   level=c_args.log_level, console_level=c_args.log_level)
    ## dask config overrides
    dask.config.config['distributed']['worker']['use-file-locking'] = False
    dask.config.config['distributed']['comm']['timeouts']['connect'] = '90s'
    ##
    tmnt_config = TMNTConfigBOW(c_args.config_space).get_configspace()
    trainer = BowVAETrainer.from_arguments(
        c_args, val_each_epoch=(c_args.searcher != 'random'))
    selector = BaseSelector(tmnt_config,
                            iterations=c_args.iterations,
                            searcher=c_args.searcher,
                            scheduler=c_args.scheduler,
                            brackets=c_args.brackets,
                            cpus_per_task=c_args.cpus_per_task,
                            num_final_evals=c_args.num_final_evals,
                            rng_seed=c_args.seed,
                            log_dir=trainer.log_out_dir)
    sources = [e['source'] for e in tmnt_config.get('embedding').data
               if e['source'] != 'random']
    logging.info('>> Pre-caching pre-trained embeddings/vocabularies: {}'
                 .format(sources))
    trainer.pre_cache_vocabularies(sources)
    selector.select_model(trainer)
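# A minimal usage sketch for model_select_bow_vae; every attribute value below
# is hypothetical (the released CLI builds `c_args` with argparse, and
# BowVAETrainer.from_arguments consumes further fields such as vocab_file and
# tr_vec_file that are omitted here).
def _example_model_select_bow_vae():
    from argparse import Namespace
    c_args = Namespace(save_dir='./_model_select', log_level='info',
                       config_space='config.yaml', iterations=8,
                       searcher='random', scheduler='hyperband', brackets=1,
                       cpus_per_task=2, num_final_evals=1, seed=1234)
    model_select_bow_vae(c_args)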
def get_worker(args, budget, id_str, ns_port):
    i_dt = datetime.datetime.now()
    train_out_dir = \
        os.path.join(args.save_dir,
                     "train_{}_{}_{}_{}_{}_{}_{}".format(i_dt.year, i_dt.month, i_dt.day,
                                                         i_dt.hour, i_dt.minute, i_dt.second,
                                                         i_dt.microsecond))
    logging_config(folder=train_out_dir, name='tmnt', level=logging.INFO)
    logging.info(args)
    seed_rng(args.seed)
    if args.vocab_file and args.tr_vec_file:
        vpath = Path(args.vocab_file)
        tpath = Path(args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception("Vocab file {} and/or training vector file {} do not exist"
                            .format(args.vocab_file, args.tr_vec_file))
    logging.info("Loading data via pre-computed vocabulary and sparse vector format document representation")
    vocab, tr_csr_mat, total_tr_words, tr_labels, label_map = \
        collect_sparse_data(args.tr_vec_file, args.vocab_file,
                            scalar_labels=args.scalar_covars,
                            encoding=args.str_encoding)
    if args.val_vec_file:
        tst_csr_mat, total_tst_words, tst_labels = \
            collect_sparse_test(args.val_vec_file, vocab,
                                scalar_labels=args.scalar_covars,
                                encoding=args.str_encoding)
    else:
        tst_csr_mat, total_tst_words, tst_labels = None, None, None
    ctx = mx.cpu() if args.gpu is None or args.gpu == '' or int(args.gpu) < 0 \
        else mx.gpu(int(args.gpu))
    model_out_dir = args.model_dir if args.model_dir \
        else os.path.join(train_out_dir, 'MODEL')
    if not os.path.exists(model_out_dir):
        os.mkdir(model_out_dir)
    if args.use_labels_as_covars and tr_labels is not None:
        if label_map is not None:
            # Categorical labels: one-hot encode them for use as covariates
            n_covars = len(label_map)
            tr_labels = mx.nd.one_hot(tr_labels, n_covars)
            tst_labels = mx.nd.one_hot(tst_labels, n_covars) if tst_labels is not None else None
        else:
            # Scalar labels: add a trailing unit dimension
            tr_labels = mx.nd.expand_dims(tr_labels, 1)
            tst_labels = mx.nd.expand_dims(tst_labels, 1) if tst_labels is not None else None
    worker = BowVAEWorker(model_out_dir, args, vocab, tr_csr_mat, total_tr_words,
                          tst_csr_mat, total_tst_words, tr_labels, tst_labels,
                          label_map, ctx=ctx, max_budget=budget,
                          nameserver='127.0.0.1', run_id=id_str,
                          nameserver_port=ns_port)
    return worker, train_out_dir
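# Hypothetical invocation sketch for get_worker; the id string, port, and the
# worker.run call assume BowVAEWorker follows the hpbandster Worker API (its
# nameserver/run_id/nameserver_port keywords suggest this, but it is not
# confirmed by this excerpt).
def _example_get_worker(args, budget):
    worker, out_dir = get_worker(args, budget, id_str='tmnt-model-select', ns_port=9090)
    worker.run(background=True)  # poll the local nameserver for jobs
    return worker, out_dir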
def test_ar(args):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(args.save_dir, i_dt.year, i_dt.month,
                                                        i_dt.day, i_dt.hour, i_dt.minute, i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir, name='train_trans_vae',
                   level=logging.INFO, no_console=False)
    logging.info(args)
    context = mx.cpu() if args.gpus is None or args.gpus == '' else mx.gpu(int(args.gpus))
    emb = nlp.embedding.create('glove', source=args.embedding_source) \
        if args.embedding_source else None
    data_train, vocab = load_dataset_basic(args.input_file,
                                           vocab=None,
                                           json_text_key=args.json_text_key,
                                           max_len=args.sent_size,
                                           max_vocab_size=args.max_vocab_size,
                                           ctx=context)
    if emb:
        vocab.set_embedding(emb)
        _, emb_size = vocab.embedding.idx_to_vec.shape
        # Re-initialize out-of-vocabulary items (all-zero embedding rows)
        # with small random values
        oov_items = 0
        for word in vocab.embedding._idx_to_token:
            if (vocab.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
                oov_items += 1
                vocab.embedding[word] = mx.nd.random.normal(0.0, 0.1, emb_size)
        logging.info("** There are {} out-of-vocabulary items **".format(oov_items))
    else:
        logging.info("** No pre-trained embedding provided, learning embedding weights from scratch **")
    # NOTE: vocab.embedding is None when no pre-trained embedding was attached
    # above, so this line assumes args.embedding_source was supplied
    emb_dim = len(vocab.embedding.idx_to_vec[0])
    model = ARTransformerVAE(vocab, emb_dim, args.latent_dist,
                             num_units=args.num_units,
                             hidden_size=args.hidden_size,
                             num_heads=args.num_heads,
                             n_latent=args.latent_dim,
                             max_sent_len=args.sent_size,
                             transformer_layers=args.transformer_layers,
                             kappa=args.kappa,
                             batch_size=args.batch_size,
                             kld=args.kld_wt,
                             ctx=context)
    model.latent_dist.initialize(init=mx.init.Xavier(magnitude=2.34), ctx=context)
    model.encoder.initialize(init=mx.init.Xavier(magnitude=2.34), ctx=context)
    #model.decoder.initialize(init=mx.init.Xavier(magnitude=2.34), ctx=context)
    pad_id = vocab[vocab.padding_token]
@classmethod
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build a BowVAETrainer directly from command-line arguments.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch (optional, default = True).
    """
    i_dt = datetime.datetime.now()
    log_out_dir = \
        os.path.join(c_args.save_dir,
                     "train_{}_{}_{}_{}_{}_{}_{}".format(i_dt.year, i_dt.month, i_dt.day,
                                                         i_dt.hour, i_dt.minute, i_dt.second,
                                                         i_dt.microsecond))
    if not os.path.exists(log_out_dir):
        lpath = Path(log_out_dir)
        lpath.mkdir(parents=True, exist_ok=True)
    if not log_utils.CONFIGURED:
        logging_config(folder=log_out_dir, name='tmnt',
                       level=c_args.log_level, console_level=c_args.log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)
    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception("Vocab file {} and/or training vector file {} do not exist"
                            .format(c_args.vocab_file, c_args.tr_vec_file))
    logging.info("Loading data via pre-computed vocabulary and sparse vector format document representation")
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(log_out_dir, 'MODEL')
    n_covars = int(float(np.max(y)) + 1)
    if not os.path.exists(model_out_dir):
        os.mkdir(model_out_dir)
    return cls(log_out_dir, model_out_dir, vocab, wd_freqs,
               c_args.tr_vec_file, c_args.val_vec_file,
               coherence_via_encoder=c_args.encoder_coherence,
               pretrained_param_file=c_args.pretrained_param_file,
               topic_seed_file=c_args.topic_seed_file,
               use_labels_as_covars=c_args.use_labels_as_covars,
               use_gpu=c_args.use_gpu,
               n_covars=n_covars,
               val_each_epoch=val_each_epoch)
def train(args):
    i_dt = datetime.datetime.now()
    exp_folder = '{}/exp_{}_{}_{}_{}_{}_{}'.format(args.logdir, i_dt.year, i_dt.month,
                                                   i_dt.day, i_dt.hour, i_dt.minute, i_dt.second)
    logging_config(exp_folder, name="Embeddings", level=logging.INFO)
    logging.info(args)
    random.seed(args.seed)
    mx.random.seed(args.seed)
    np.random.seed(args.seed)
    train_embeddings(args, exp_folder)
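# The three calls above seed Python's, MXNet's, and NumPy's RNGs together. A
# small helper like this (hypothetical, not part of the source) captures the
# same intent for reuse across entry points:
def seed_all(seed):
    """Seed Python, MXNet, and NumPy RNGs for reproducible runs."""
    random.seed(seed)
    mx.random.seed(seed)
    np.random.seed(seed)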
def model_select_seq_bow(c_args):
    logging_config(folder=c_args.save_dir, name='tmnt',
                   level=c_args.log_level, console_level=c_args.log_level)
    tmnt_config = TMNTConfigSeqBOW(c_args.config_space).get_configspace()
    trainer = SeqBowVEDTrainer.from_arguments(c_args)
    selector = BaseSelector(tmnt_config, c_args.iterations, c_args.searcher,
                            c_args.scheduler, c_args.brackets,
                            c_args.cpus_per_task, c_args.use_gpu,
                            c_args.num_final_evals, c_args.seed,
                            trainer.model_out_dir)
    selector.select_model(trainer)
@classmethod
def from_arguments(cls, args, config):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(args.save_dir, i_dt.year, i_dt.month,
                                                        i_dt.day, i_dt.hour, i_dt.minute, i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir, name='train_trans_vae',
                   level=args.log_level, console_level=args.log_level,
                   no_console=False)
    logging.info(args)
    trainer = cls(train_out_dir, args.tr_file, args.val_file,
                  aux_data_path=args.aux_file,
                  use_gpu=args.use_gpu,
                  log_interval=args.log_interval)
    return trainer
@classmethod
def from_arguments(cls, args, config):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(args.save_dir, i_dt.year, i_dt.month,
                                                        i_dt.day, i_dt.hour, i_dt.minute, i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir, name='train_trans_vae',
                   level=logging.INFO, no_console=False)
    logging.info(args)
    bow_vocab = load_vocab(args.bow_vocab_file)
    data_train, bert_base, vocab, data_csr = load_dataset_bert(
        args.tr_file, len(bow_vocab), max_len=config.sent_size, ctx=mx.cpu())
    if args.val_file:
        data_val, _, _, val_csr = load_dataset_bert(
            args.val_file, len(bow_vocab), max_len=config.sent_size, ctx=mx.cpu())
        val_wds = val_csr.sum().asscalar()
    else:
        data_val, val_csr, val_wds = None, None, None
    # Estimate corpus-wide term frequencies from (at most) the first 50K documents
    sample_size = min(50000, data_csr.shape[0])
    data = data_csr[:sample_size]
    wd_freqs = mx.nd.sum(data, axis=0)
    trainer = cls(train_out_dir, bow_vocab, wd_freqs, val_wds,
                  (data_train, data_csr), (data_val, val_csr),
                  use_gpu=args.use_gpu, log_interval=args.log_interval)
    return trainer
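# A toy illustration (synthetic data, not from the source) of the term-frequency
# estimate computed above: summing a document-term matrix over the document
# axis yields per-term corpus counts.
def _example_wd_freqs():
    docs = mx.nd.array([[2, 0, 1],
                        [0, 3, 1]])    # 2 documents x 3 vocabulary terms
    return mx.nd.sum(docs, axis=0)     # -> [2. 3. 2.]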
def train_main(args):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(args.save_dir, i_dt.year, i_dt.month,
                                                        i_dt.day, i_dt.hour, i_dt.minute, i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir, name='train_trans_vae',
                   level=logging.INFO, no_console=False)
    logging.info(args)
    context = mx.cpu() if args.gpus is None or args.gpus == '' else mx.gpu(int(args.gpus))
    if args.use_bert:
        data_train, bert_base, vocab = load_dataset_bert(args.input_file,
                                                         max_len=args.sent_size,
                                                         ctx=context)
        model = get_bert_model(args, bert_base, context)
        pad_id = vocab[vocab.padding_token]
        report_fn = get_report_reconstruct_data_fn(vocab, pad_id=pad_id)
        train_trans_vae(args, model, data_train, data_test=None, ctx=context,
                        report_fn=report_fn, use_bert=True)
    else:
        emb = nlp.embedding.create('glove', source=args.embedding_source) \
            if args.embedding_source else None
        data_train, vocab = load_dataset_basic(args.input_file,
                                               vocab=None,
                                               json_text_key=args.json_text_key,
                                               max_len=args.sent_size,
                                               max_vocab_size=args.max_vocab_size,
                                               ctx=context)
        if emb:
            vocab.set_embedding(emb)
            _, emb_size = vocab.embedding.idx_to_vec.shape
            # Re-initialize out-of-vocabulary items (all-zero embedding rows)
            # with small random values
            oov_items = 0
            for word in vocab.embedding._idx_to_token:
                if (vocab.embedding[word] == mx.nd.zeros(emb_size)).sum() == emb_size:
                    oov_items += 1
                    vocab.embedding[word] = mx.nd.random.normal(0.0, 0.1, emb_size)
            logging.info("** There are {} out-of-vocabulary items **".format(oov_items))
        else:
            logging.info("** No pre-trained embedding provided, learning embedding weights from scratch **")
        model = get_basic_model(args, vocab, context)
        pad_id = vocab[vocab.padding_token]
        report_fn = get_report_reconstruct_data_fn(vocab, pad_id=pad_id)
        train_trans_vae(args, model, data_train, data_test=None, ctx=context,
                        report_fn=report_fn, use_bert=False)
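# A self-contained toy sketch (synthetic matrix, not from the source) of the
# OOV handling in the loop above: embedding rows that are entirely zero are
# treated as out-of-vocabulary and re-drawn from N(0, 0.1).
def _example_reinit_oov_rows():
    emb = mx.nd.array([[0.2, -0.1],
                       [0.0,  0.0]])   # second row is an all-zero "OOV" row
    for i in range(emb.shape[0]):
        if (emb[i] == 0).sum().asscalar() == emb.shape[1]:
            emb[i] = mx.nd.random.normal(0.0, 0.1, emb.shape[1])
    return emb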
parser.add_argument('--label_prefix_chars', type=int,
                    help='Use first N characters of label', default=-1)
parser.add_argument('--str_encoding', type=str,
                    help='String/file encoding to use', default='utf-8')
parser.add_argument('--log_dir', type=str,
                    help='Logging directory', default='.')

args = parser.parse_args()

if __name__ == '__main__':
    logging_config(folder=args.log_dir, name='vectorizer', level=logging.INFO)
    if args.vocab_file is None:
        raise Exception("Vocabulary output file name/path must be provided")
    vectorizer = \
        TextVectorizer(min_doc_size=args.min_doc_length,
                       encoding=args.str_encoding,
                       custom_stop_word_file=args.custom_stop_words) \
        if args.txt_mode \
        else JsonVectorizer(text_key=args.json_text_key,
                            custom_stop_word_file=args.custom_stop_words,
                            label_key=args.json_label_key,
                            min_doc_size=args.min_doc_length,
                            label_prefix=args.label_prefix_chars,
                            json_out_dir=args.json_out_dir,
                            encoding=args.str_encoding)
    vocab = vectorizer.get_sparse_vecs(args.tr_vec_file,
                                       args.vocab_file,
                                       args.tr_input_dir,
                                       args.vocab_size,
                                       full_histogram_file=args.full_vocab_histogram)
parser.add_argument('--label_prefix_chars', type=int,
                    help='Use first N characters of label', default=-1)
parser.add_argument('--str_encoding', type=str,
                    help='String/file encoding to use', default='utf-8')
parser.add_argument('--log_dir', type=str,
                    help='Logging directory', default='.')

args = parser.parse_args()

if __name__ == '__main__':
    logging_config(folder=args.log_dir, name='vectorizer', level='info')
    if args.vocab_file is None:
        raise Exception("Vocabulary output file name/path must be provided")
    vectorizer = \
        TMNTVectorizer(text_key=args.json_text_key,
                       custom_stop_word_file=args.custom_stop_words,
                       label_key=args.json_label_key,
                       min_doc_size=args.min_doc_length,
                       label_prefix=args.label_prefix_chars,
                       json_out_dir=args.json_out_dir,
                       vocab_size=args.vocab_size,
                       encoding=args.str_encoding)
    vectorizer.fit_transform_in_place_json(args.tr_input_file)
    vectorizer.write_vocab(args.vocab_file)
    if args.val_input_file:
        vectorizer.transform_in_place_json(args.val_input_file)
    if args.tst_input_file:
        vectorizer.transform_in_place_json(args.tst_input_file)
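# A hypothetical command-line invocation of this vectorizer script (the script
# name and file paths are illustrative only):
#
#   python vectorizer.py --tr_input_file train.json --vocab_file vocab.txt \
#       --val_input_file val.json --json_text_key text --json_label_key label \
#       --vocab_size 2000 --log_dir ./_logs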
    all_labels = []
    for i, (data, label, mask) in enumerate(dataloader):
        out = model(data, mask)
        predictions = mx.nd.argmax(out, axis=1).astype('int32')
        for j in range(out.shape[0]):
            probs = mx.nd.softmax(out[j])
            lab = int(label[j].asscalar())
            all_scores.append(probs[1].asscalar())
            all_labels.append(lab)
            if probs[1] > probs[0] and lab == 1:
                total_correct += 1
            elif probs[1] < probs[0] and lab == 0:
                total_correct += 1
            total += 1
    acc = total_correct / float(total)
    ap = average_precision_score(all_labels, all_scores)
    return ap, acc


if __name__ == '__main__':
    args = get_args()
    logging_config(args.log_dir, 'train', level=logging.INFO, console_level=logging.INFO)
    train_dataset, val_dataset, test_dataset, transform = \
        load_sparse_dataset(args.train_file, args.val_file, args.test_file,
                            voc_size=args.voc_size, max_length=args.max_length)
    ctx = mx.cpu()
    train_classifier(args.voc_size, args.embedding_dim, transform,
                     train_dataset, val_dataset, test_dataset, ctx)
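# A self-contained toy check (synthetic labels/scores, not from the source) of
# the metric computed above: sklearn's average_precision_score summarizes the
# precision-recall curve of the positive-class scores.
def _example_average_precision():
    from sklearn.metrics import average_precision_score
    labels = [1, 0, 1, 1, 0]
    scores = [0.9, 0.4, 0.65, 0.8, 0.3]
    return average_precision_score(labels, scores)  # 1.0: all positives ranked first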
@classmethod
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build a BowVAETrainer directly from command-line arguments.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch (optional, default = True).
    """
    i_dt = datetime.datetime.now()
    log_out_dir = \
        os.path.join(c_args.save_dir,
                     "train_{}_{}_{}_{}_{}_{}_{}".format(i_dt.year, i_dt.month, i_dt.day,
                                                         i_dt.hour, i_dt.minute, i_dt.second,
                                                         i_dt.microsecond))
    # Map the CLI's string log level onto the logging module's constants
    ll = c_args.log_level
    if ll.lower() == 'info':
        log_level = logging.INFO
    elif ll.lower() == 'debug':
        log_level = logging.DEBUG
    elif ll.lower() == 'error':
        log_level = logging.ERROR
    elif ll.lower() == 'warning':
        log_level = logging.WARNING
    else:
        log_level = logging.INFO
    logging_config(folder=log_out_dir, name='tmnt',
                   level=log_level, console_level=log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)
    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception("Vocab file {} and/or training vector file {} do not exist"
                            .format(c_args.vocab_file, c_args.tr_vec_file))
    logging.info("Loading data via pre-computed vocabulary and sparse vector format document representation")
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
    if c_args.val_vec_file:
        val_X, val_y, _, total_test_wds = file_to_data(c_args.val_vec_file, voc_size)
    else:
        val_X, val_y, total_test_wds = None, None, 0
    ctx = mx.cpu() if not c_args.use_gpu else mx.gpu(0)
    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(log_out_dir, 'MODEL')
    if not os.path.exists(model_out_dir):
        os.mkdir(model_out_dir)
    return cls(log_out_dir, model_out_dir, c_args, vocab, wd_freqs, X, val_X,
               total_test_wds, train_labels=y, test_labels=val_y,
               label_map=None, use_gpu=c_args.use_gpu,
               val_each_epoch=val_each_epoch)
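# The if/elif chain above can be collapsed using the logging module's own
# name-to-level attributes; a minimal equivalent sketch (not part of the source):
def _example_log_level(name):
    return getattr(logging, name.upper(), logging.INFO)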