def get_worker(args, budget, id_str, ns_port):
    """Build a `BowVAEWorker` from command-line arguments.

    Configures logging for a fresh timestamped run directory under
    ``args.save_dir``, seeds the RNG, loads the sparse training data (and
    validation data when ``args.val_vec_file`` is set) and constructs the
    worker.

    Parameters
    ----------
    args: parsed command-line arguments. Attributes read here: ``save_dir``,
        ``seed``, ``vocab_file``, ``tr_vec_file``, ``val_vec_file``,
        ``scalar_covars``, ``str_encoding``, ``gpu``, ``model_dir`` and
        ``use_labels_as_covars``.
    budget: forwarded to the worker as ``max_budget``
    id_str: forwarded to the worker as ``run_id``
    ns_port: forwarded to the worker as ``nameserver_port``

    Returns
    -------
    tuple ``(worker, train_out_dir)`` - the constructed worker and the run
    directory path used for logging.

    Raises
    ------
    Exception: if both ``args.vocab_file`` and ``args.tr_vec_file`` are given
        but at least one of them is not an existing file.
    """
    i_dt = datetime.datetime.now()
    # Microsecond resolution keeps run directories distinct across launches.
    train_out_dir = os.path.join(
        args.save_dir,
        "train_{}_{}_{}_{}_{}_{}_{}".format(
            i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
            i_dt.second, i_dt.microsecond))
    logging_config(folder=train_out_dir, name='tmnt', level=logging.INFO)
    logging.info(args)
    seed_rng(args.seed)

    if args.vocab_file and args.tr_vec_file:
        vpath = Path(args.vocab_file)
        tpath = Path(args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist".
                format(args.vocab_file, args.tr_vec_file))

    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab, tr_csr_mat, total_tr_words, tr_labels, label_map = \
        collect_sparse_data(args.tr_vec_file, args.vocab_file,
                            scalar_labels=args.scalar_covars,
                            encoding=args.str_encoding)
    if args.val_vec_file:
        tst_csr_mat, total_tst_words, tst_labels = \
            collect_sparse_test(args.val_vec_file, vocab,
                                scalar_labels=args.scalar_covars,
                                encoding=args.str_encoding)
    else:
        tst_csr_mat, total_tst_words, tst_labels = None, None, None

    # A missing, empty or negative GPU id selects the CPU.
    gpu_id = args.gpu
    if gpu_id is None or gpu_id == '' or int(gpu_id) < 0:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(int(gpu_id))

    model_out_dir = args.model_dir if args.model_dir else os.path.join(
        train_out_dir, 'MODEL')
    # makedirs (rather than mkdir) also creates missing parent directories
    # and tolerates the directory already existing.
    os.makedirs(model_out_dir, exist_ok=True)

    if args.use_labels_as_covars and tr_labels is not None:
        if label_map is not None:
            # A label map is available: one-hot encode over its size.
            n_covars = len(label_map)
            tr_labels = mx.nd.one_hot(tr_labels, n_covars)
            tst_labels = mx.nd.one_hot(
                tst_labels, n_covars) if tst_labels is not None else None
        else:
            # No label map: keep the raw label values and add an axis at
            # position 1.
            tr_labels = mx.nd.expand_dims(tr_labels, 1)
            tst_labels = mx.nd.expand_dims(
                tst_labels, 1) if tst_labels is not None else None

    worker = BowVAEWorker(model_out_dir, args, vocab, tr_csr_mat,
                          total_tr_words, tst_csr_mat, total_tst_words,
                          tr_labels, tst_labels, label_map,
                          ctx=ctx,
                          max_budget=budget,
                          nameserver='127.0.0.1',
                          run_id=id_str,
                          nameserver_port=ns_port)
    return worker, train_out_dir
def retrain_best_config(self, config, budget, rng_seed, ntimes=1):
    """Train a model as per the provided `Configuration` and `budget` and write to file.

    When validation data is configured (``self.c_args.val_vec_file``) the
    model is trained ``ntimes`` times with different seeds, the evaluation
    metrics of every run are logged, and the model with the lowest loss is
    written. Without validation data a single model is trained and written.

    Parameters
    ----------
    config: `Configuration` to use to train/evaluate the model
    budget: int - number of iterations to train
    rng_seed: int - base RNG seed; run ``i`` is seeded with ``rng_seed + i``
    ntimes: int - number of training runs to perform when validation data
        is available (default 1)

    Raises
    ------
    ValueError: if validation data is configured and ``ntimes`` is less than 1
    """
    has_validation = bool(self.c_args.val_vec_file)
    if has_validation and ntimes < 1:
        # Fail before any training or state change instead of hitting an
        # IndexError on the empty metric lists further down.
        raise ValueError(
            "ntimes must be at least 1, got {}".format(ntimes))

    if self.c_args.tst_vec_file:
        self.set_heldout_data_as_test()

    if has_validation:
        best_loss = float('inf')
        best_model = None
        npmis, perplexities, redundancies = [], [], []
        for i in range(ntimes):
            seed_rng(rng_seed + i)
            model, results = self._train_model(config, budget)
            loss = results['loss']
            info = results['info']
            npmis.append(info['test_npmi'])
            perplexities.append(info['test_perplexity'])
            redundancies.append(info['redundancy'])
            # The first run is always accepted so that a model is available
            # to write even when its loss does not compare below the
            # initial bound.
            if best_model is None or loss < best_loss:
                best_loss = loss
                best_model = model
        logging.info("******************************************")
        test_type = "HELDOUT" if self.c_args.tst_vec_file else "VALIDATION"
        metrics = (("NPMI", npmis),
                   ("Perplexity", perplexities),
                   ("Redundancy", redundancies))
        for label, values in metrics:
            if ntimes > 1:
                # stdev needs at least two samples, hence the ntimes guard.
                logging.info(
                    "Final {} {} ==> Mean: {}, StdDev: {}".format(
                        test_type, label, statistics.mean(values),
                        statistics.stdev(values)))
            else:
                logging.info("Final {} {} ==> {}".format(
                    test_type, label, values[0]))
    else:
        # In this case, no validation test data was supplied.
        best_model, _ = self._train_model(config, budget)
    write_model(best_model, self.model_out_dir, config, budget, self.c_args)
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build BowVAETrainer from command-line arguments directly.

    Creates a timestamped log directory under ``c_args.save_dir``, configures
    logging (only if it has not been configured yet), seeds the RNG, loads
    the vocabulary and training data, and creates the model output directory.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

    Returns:
        Instance of ``cls`` built from the loaded vocabulary, word
        frequencies and the data file paths in ``c_args``.

    Raises:
        Exception: if both ``c_args.vocab_file`` and ``c_args.tr_vec_file``
            are given but at least one of them is not an existing file.
    """
    i_dt = datetime.datetime.now()
    log_out_dir = os.path.join(
        c_args.save_dir,
        "train_{}_{}_{}_{}_{}_{}_{}".format(
            i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
            i_dt.second, i_dt.microsecond))
    Path(log_out_dir).mkdir(parents=True, exist_ok=True)

    if not log_utils.CONFIGURED:
        logging_config(folder=log_out_dir, name='tmnt',
                       level=c_args.log_level,
                       console_level=c_args.log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)

    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist"
                .format(c_args.vocab_file, c_args.tr_vec_file))

    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    # Only the labels and word frequencies are needed here; the trainer is
    # handed the data file paths rather than the loaded matrix.
    _, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)

    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
        log_out_dir, 'MODEL')
    # Number of covariates is the largest label value plus one.
    # NOTE(review): assumes labels are non-negative integer-like ids - confirm.
    n_covars = int(float(np.max(y)) + 1)
    # Same creation semantics as the log directory above: create missing
    # parents and tolerate an already-existing directory.
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)

    return cls(log_out_dir, model_out_dir, vocab, wd_freqs,
               c_args.tr_vec_file, c_args.val_vec_file,
               coherence_via_encoder=c_args.encoder_coherence,
               pretrained_param_file=c_args.pretrained_param_file,
               topic_seed_file=c_args.topic_seed_file,
               use_labels_as_covars=c_args.use_labels_as_covars,
               use_gpu=c_args.use_gpu,
               n_covars=n_covars,
               val_each_epoch=val_each_epoch)
def train_with_single_config(self, config, num_evals):
    """Fit models with a single configuration and report the value of the objective function.

    This method trains a model defined by the configuration `num_evals` times. Each time
    the model weights are randomly initialized with a different RNG seed. The results of each
    run are captured and mean and std reported.

    If no test data path is set on the trainer, a single model is trained
    and returned without any metric reporting.

    Args:
        config (dict): Configuration instance with hyperparameter values for model definition.
        num_evals (int): Number of model fits and evaluations to perform (with random initialization)

    Returns:
        (tuple): Tuple containing:
            - model (:class:`tmnt.modeling.BowVAEModel`): VAE Model instance with trained/fit parameters.
            - obj (float): objective value of the objective function with the best model.

    Raises:
        ValueError: if test data is available and `num_evals` is less than 1.
    """
    if self.test_data_path is None:
        model, obj, _ = self.train_model(config, FakeReporter())
        return model, obj

    logging.info("Training with config: {}".format(config))
    ntimes = int(num_evals)
    if ntimes < 1:
        # Without at least one run there is no model to return and no
        # metric to report.
        raise ValueError(
            "num_evals must be at least 1, got {}".format(num_evals))

    rng_seed = self.rng_seed
    best_obj = float('-inf')
    best_model = None
    npmis, perplexities, redundancies, objectives = [], [], [], []
    for _ in range(ntimes):
        seed_rng(rng_seed)
        rng_seed += 1  # a different seed for every run
        model, obj, v_res = self.train_model(config, FakeReporter())
        npmis.append(v_res['npmi'])
        perplexities.append(v_res['ppl'])
        redundancies.append(v_res['redundancy'])
        objectives.append(obj)
        # The first run is always accepted so that a model is returned
        # even when its objective does not compare above the initial bound.
        if best_model is None or obj > best_obj:
            best_obj = obj
            best_model = model

    # NOTE: results are always labeled as validation results here; this
    # method does not switch to held-out test data.
    test_type = "VALIDATION"
    metrics = (("NPMI", npmis),
               ("Perplexity", perplexities),
               ("Redundancy", redundancies),
               ("Objective", objectives))
    for label, values in metrics:
        if ntimes > 1:
            # stdev needs at least two samples, hence the ntimes guard.
            logging.info(
                "Final {} {} ==> Mean: {}, StdDev: {}".format(
                    test_type, label, statistics.mean(values),
                    statistics.stdev(values)))
        else:
            logging.info("Final {} {} ==> {}".format(
                test_type, label, values[0]))
    return best_model, best_obj
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build BowVAETrainer from command-line arguments directly.

    Configures logging for a timestamped log directory under
    ``c_args.save_dir``, seeds the RNG, loads the vocabulary, the training
    data and (when ``c_args.val_vec_file`` is set) the validation data, and
    creates the model output directory.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

    Returns:
        Instance of ``cls`` built from the loaded vocabulary and data.

    Raises:
        Exception: if both ``c_args.vocab_file`` and ``c_args.tr_vec_file``
            are given but at least one of them is not an existing file.
    """
    i_dt = datetime.datetime.now()
    log_out_dir = os.path.join(
        c_args.save_dir,
        "train_{}_{}_{}_{}_{}_{}_{}".format(
            i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
            i_dt.second, i_dt.microsecond))

    # Map the textual level from the command line to a logging constant;
    # anything unrecognized (or not a string) falls back to INFO.
    level_by_name = {
        'info': logging.INFO,
        'debug': logging.DEBUG,
        'error': logging.ERROR,
        'warning': logging.WARNING,
    }
    requested_level = c_args.log_level
    if isinstance(requested_level, str):
        log_level = level_by_name.get(requested_level.lower(), logging.INFO)
    else:
        log_level = logging.INFO
    logging_config(folder=log_out_dir, name='tmnt', level=log_level,
                   console_level=log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)

    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist"
                .format(c_args.vocab_file, c_args.tr_vec_file))

    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)

    if c_args.val_vec_file:
        val_X, val_y, _, total_test_wds = file_to_data(
            c_args.val_vec_file, voc_size)
    else:
        # No validation file: no validation data and a zero word count.
        val_X, val_y, total_test_wds = None, None, 0

    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
        log_out_dir, 'MODEL')
    # makedirs (rather than mkdir) also creates missing parent directories
    # and tolerates the directory already existing.
    os.makedirs(model_out_dir, exist_ok=True)

    return cls(log_out_dir, model_out_dir, c_args, vocab, wd_freqs, X,
               val_X, total_test_wds,
               train_labels=y,
               test_labels=val_y,
               label_map=None,
               use_gpu=c_args.use_gpu,
               val_each_epoch=val_each_epoch)