def main():
    """Count articles per day, and per day count articles mentioning the target word.

    Reads one or more jsonlist files, tallies total articles and target-word
    mentions keyed by ordinal date, and writes both tallies as JSON files
    into the output directory.
    """
    parser = OptionParser(usage="%prog infiles.jsonlist[.gz]")
    parser.add_option('-o', dest='output_dir', default='output', help='Output_dir: default=%default')
    parser.add_option('-w', dest='target_word', default='mass shooting', help='Target word: default=%default')
    options, infiles = parser.parse_args()

    output_dir = options.output_dir
    target_word = options.target_word
    if not os.path.exists(output_dir):
        fh.makedirs(output_dir)

    n_articles_per_day = defaultdict(int)
    target_count_per_day = defaultdict(int)

    for infile in infiles:
        print(infile)
        articles = fh.read_jsonlist(infile)
        print(len(articles))
        for index, article in enumerate(articles):
            # progress marker every 10k articles
            if index > 0 and index % 10000 == 0:
                print(index)
            when = datetime.date(year=int(article['year']),
                                 month=int(article['month']),
                                 day=int(article['day']))
            day_key = when.toordinal()
            n_articles_per_day[day_key] += 1
            # assemble headline + body/text into one searchable string
            pieces = []
            if 'headline' in article:
                pieces.append(article['headline'] + '\n')
            if 'body' in article:
                pieces.append(article['body'])
            if 'text' in article:
                pieces.append(article['text'])
            # pad with spaces so substring matching behaves the same at the edges
            padded = ' ' + clean_text(''.join(pieces), lower=True) + ' '
            # exclude articles that also mention films or games
            if target_word in padded and 'film' not in padded and 'game' not in padded:
                target_count_per_day[day_key] += 1

    fh.write_to_json(n_articles_per_day, os.path.join(output_dir, 'articles_per_day.json'))
    fh.write_to_json(target_count_per_day, os.path.join(output_dir, 'target_counts_per_day.json'))
def download_articles(name, categories, subset):
    """Download a 20-newsgroups subset and save it as a jsonlist under data/20ng/<name>.

    Each record carries the raw document text and its newsgroup name.
    """
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    target_names = newsgroups_data['target_names']
    # pair each document with the human-readable name of its target label
    data = [{'text': text, 'group': target_names[label]}
            for text, label in zip(newsgroups_data['data'], newsgroups_data['target'])]
    print(len(data))
    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist'))
def make_base_dir(project_name=None):
    """Point the module-level ``base_dir`` at <repo_parent>/datasets/<project_name>.

    Must be run from a working directory named 'guac' (asserted). Creates the
    dataset directory via fh.makedirs, then calls _setup_dirs(); exits with an
    error message if the directory still does not exist.

    Fix: `print parts` was a Python-2 print statement — a SyntaxError in
    Python 3 — while the rest of this file uses the print() function.
    """
    global base_dir
    cwd = os.getcwd()
    parts = os.path.split(cwd)
    print(parts)
    assert parts[-1] == 'guac'
    base_dir = parts[0]
    # fh.makedirs presumably creates the directory and returns its path — TODO confirm
    base_dir = fh.makedirs(os.path.join(base_dir, 'datasets', project_name))
    if os.path.exists(base_dir):
        _setup_dirs()
    else:
        sys.exit("Error: base directory " + base_dir + " does not exist!")
def set_project(project_name, splits_filename):
    """Configure module-level paths for a project and its data-splits file.

    Sets ``base_dir`` to <repo_parent>/datasets/<project_name>, records
    ``splits_file_name``, and builds ``data_splits_file`` from
    ``data_subsets_dir``. Must be run from a directory named 'guac'
    (asserted); exits if the base directory does not exist.

    Fix: `print parts` was a Python-2 print statement — a SyntaxError in
    Python 3 — while the rest of this file uses the print() function.
    """
    global base_dir
    cwd = os.getcwd()
    parts = os.path.split(cwd)
    print(parts)
    assert parts[-1] == 'guac'
    base_dir = parts[0]
    # fh.makedirs presumably creates the directory and returns its path — TODO confirm
    base_dir = fh.makedirs(os.path.join(base_dir, 'datasets', project_name))
    if os.path.exists(base_dir):
        _setup_dirs()
        global splits_file_name
        splits_file_name = splits_filename
        global data_splits_file
        data_splits_file = os.path.join(data_subsets_dir, splits_filename + '.csv')
    else:
        sys.exit("Error: base directory " + base_dir + " does not exist!")
def main():
    """Train a 'Student' topic model (TensorFlow 1.x graph API) with optional labels and covariates.

    Parses command-line options, loads train/dev/test data via the project's
    load_data(), builds the network with make_network(), trains with train(),
    then prints/saves topics, evaluates perplexity, predicts covariates and
    labels where applicable, and saves document-topic proportions (theta).
    """
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=1.0, help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('-k', dest='n_topics', default=20, help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b', dest='batch_size', default=200, help='Size of minibatches: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.002, help='Initial learning rate: default=%default')
    parser.add_option('-m', dest='momentum', default=0.99, help='beta1 for Adam: default=%default')
    parser.add_option('-e', dest='epochs', default=250, help='Number of epochs: default=%default')
    parser.add_option('--en_layers', dest='encoder_layers', default=1, help='Number of encoder layers [0|1|2]: default=%default')
    parser.add_option('--emb_dim', dest='embedding_dim', default=300, help='Dimension of input embeddings: default=%default')
    parser.add_option('--en_short', action="store_true", dest="encoder_shortcuts", default=False, help='Use shortcut connections on encoder: default=%default')
    parser.add_option('--labels', dest='label_name', default=None, help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')
    parser.add_option('--covars', dest='covar_names', default=None, help='Read covars from files with these names (comma-separated): default=%default')
    parser.add_option('--label_emb_dim', dest='label_emb_dim', default=0, help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option('--covar_emb_dim', dest='covar_emb_dim', default=0, help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option('--min_covar_count', dest='min_covar_count', default=None, help='Drop binary covariates that occur less than this in training: default=%default')
    parser.add_option('--covar_inter', action="store_true", dest="covar_interactions", default=False, help='Use covariate interactions in model: default=%default')
    parser.add_option('--c_layers', dest='classifier_layers', default=1, help='Number of layers in (generative) classifier [0|1|2]: default=%default')
    parser.add_option('--exclude_covars', action="store_true", dest="exclude_covars", default=False, help='Exclude covariates from the classifier: default=%default')
    parser.add_option('-r', action="store_true", dest="regularize", default=False, help='Apply adaptive regularization for sparsity in topics: default=%default')
    parser.add_option('-t', dest='test_prefix', default=None, help='Prefix of test set: default=%default')
    parser.add_option('-f', dest='final_evaluate', default=None, help='perform final evaluation on test set')
    parser.add_option('-d', dest='dev_prefix', default=None, help='Prefix of dev set: default=%default')
    parser.add_option('-o', dest='output_dir', default='output', help='Output directory: default=%default')
    parser.add_option('--w2v', dest='word2vec_file', default=None, help='Use this word2vec .bin file to initialize and fix embeddings: default=%default')
    parser.add_option('--vocab_size', dest='vocab_size', default=None, help='Filter the vocabulary keeping the most common n words: default=%default')
    parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False, help='Update background parameters: default=%default')
    parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False, help='Do not use background freq: default=%default')
    parser.add_option('--no_bn_anneal', action="store_true", dest="no_bn_anneal", default=False, help='Do not anneal away from batchnorm: default=%default')
    parser.add_option('--dev_folds', dest='dev_folds', default=0, help='Number of dev folds: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0, help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--opt', dest='optimizer', default='adam', help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--threads', dest='threads', default=8, help='Use this to limit the number of CPUs: default=%default')
    parser.add_option('--seed', dest='seed', default=None, help='Random seed: default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]
    # coerce optparse string values into their intended types
    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    encoder_layers = int(options.encoder_layers)
    embedding_dim = int(options.embedding_dim)
    encoder_shortcuts = options.encoder_shortcuts
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    use_covar_interactions = options.covar_interactions
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    covars_in_classifier = not options.exclude_covars
    auto_regularize = options.regularize
    test_prefix = options.test_prefix
    dev_prefix = options.dev_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    vocab_size = options.vocab_size
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    dev_folds = int(options.dev_folds)
    final_evaluate = options.final_evaluate
    dev_fold = int(options.dev_fold)
    optimizer = options.optimizer
    seed = options.seed
    threads = int(options.threads)

    # seed the RNG explicitly if requested, otherwise from a random draw
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data (project helper; returns features, vocab, labels, covariates, metadata)
    train_X, vocab, train_labels, label_names, na_label_index, label_type, train_covariates, covariate_names, covariates_type, col_sel = load_data(input_dir, train_prefix, label_file_name, covar_file_names, vocab_size=vocab_size)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
        # convert binary labels to a single dimensional vector
        #if binary and n_classes == 2 and not generative:
        #    train_labels = np.argmax(train_labels, axis=1)
        #    train_labels = train_labels.reshape((n_train, 1))
        #    n_classes = 1
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        # optionally drop covariates that are too rare in training
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [name for i, name in enumerate(covariate_names) if covariate_selector[i]]
            n_covariates = len(covariate_names)
    else:
        n_covariates = 0

    # load the dev data, reusing the training vocabulary and column selection
    if dev_prefix is not None:
        dev_X, _, dev_labels, _, _, _, dev_covariates, _, _, _ = load_data(input_dir, dev_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
        n_dev, _ = dev_X.shape
        if dev_labels is not None:
            _, n_labels_dev = dev_labels.shape
            assert n_labels_dev == n_labels
            #if binary and n_classes == 2 and not generative:
            #    test_labels = np.argmax(test_labels, axis=1)
            #    test_labels = test_labels.reshape((n_test, 1))
            #    n_classes = 1
        if dev_covariates is not None:
            # apply the same rare-covariate filter chosen on training data
            if min_covar_count is not None and int(min_covar_count) > 0:
                dev_covariates = dev_covariates[:, covariate_selector]
            _, n_covariates_dev = dev_covariates.shape
            assert n_covariates_dev == n_covariates
    else:
        dev_X = None
        n_dev = 0
        dev_labels = None
        dev_covariates = None

    # load the test data only when a final evaluation was requested
    # NOTE(review): if test_prefix is given but final_evaluate is falsy,
    # test_X and n_test are never assigned and the later `if test_X is not None`
    # / `if n_test > 0` checks would raise NameError — confirm intended usage.
    if test_prefix is not None:
        if final_evaluate:
            test_X, _, test_labels, _, _, _, test_covariates, _, _, _ = load_data(input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
            n_test, _ = test_X.shape
            if test_labels is not None:
                _, n_labels_test = test_labels.shape
                assert n_labels_test == n_labels
                #if binary and n_classes == 2 and not generative:
                #    test_labels = np.argmax(test_labels, axis=1)
                #    test_labels = test_labels.reshape((n_test, 1))
                #    n_classes = 1
            if test_covariates is not None:
                if min_covar_count is not None and int(min_covar_count) > 0:
                    test_covariates = test_covariates[:, covariate_selector]
                _, n_covariates_test = test_covariates.shape
                assert n_covariates_test == n_covariates
    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_covariates = None

    # per-document labeled/unlabeled mask for semi-supervised training
    is_labeled = pd.read_csv(os.path.join(input_dir, "train.is_labeled.csv"), names=['labeled']).labeled

    # initialize the background from overall word frequencies
    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    if no_bg:
        # with a single topic and no background, fix beta to the background freqs
        if n_topics == 1:
            init_beta = init_bg.copy()
            init_beta = init_beta.reshape([1, len(vocab)])
            update_beta = False
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(dv, encoder_layers, embedding_dim, n_topics, encoder_shortcuts, label_type, n_labels, label_emb_dim, covariates_type, n_covariates, covar_emb_dim, use_covar_interactions, classifier_layers, covars_in_classifier)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        # random init in [-0.25, 0.25) for words missing from the pretrained vectors
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5, dtype=np.float32)
        count = 0
        print("Loading word vectors")
        # binary=False here: expects a text-format word2vec file — TODO confirm
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False)
        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]
        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    # create the model (fresh TF1 graph)
    tf.reset_default_graph()
    model = Student(network_architecture, alpha=alpha, learning_rate=learning_rate, batch_size=batch_size, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, update_background=update_background, init_beta=init_beta, update_beta=update_beta, threads=threads, regularize=auto_regularize, optimizer=optimizer, adam_beta1=adam_beta1, seed=seed)

    # train full model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, train_covariates, is_labeled=is_labeled, regularize=auto_regularize, training_epochs=n_epochs, batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, C_dev=dev_covariates, bn_anneal=bn_anneal)

    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics and save topic-word weights plus summary statistics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)
    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt'))

    # report covariate deviations and (optionally) interactions
    if n_covariates > 0:
        emb_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(emb_c, vocab, covariate_names, n_top_words=16)
        else:
            maw, sparsity = print_top_words(emb_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if use_covar_interactions:
            print("Covariate interactions")
            emb_ci = model.get_covar_inter_weights()
            print(emb_ci.shape)
            if covariate_names is not None:
                names = [str(k) + ':' + c for k in range(n_topics) for c in covariate_names]
            else:
                names = None
            maw, sparsity = print_top_words(emb_ci, vocab, names)
            print("sparsity in covariate interactions = %0.4f" % sparsity)
            print("Combined covariates and interactions:")
        if covar_emb_dim > 0:
            print_covariate_embeddings(model, covariate_names, output_dir)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_covariates, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt'))
    if test_X is not None:
        if final_evaluate:
            perplexity = evaluate_perplexity(model, test_X, test_labels, test_covariates, eta_bn_prop=0.0)
            print("Test perplexity = %0.4f" % perplexity)
            fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on predicting categorical covariates
    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture, train_X, train_labels)
        accuracy = float(np.sum(predictions == np.argmax(train_covariates, axis=1)) / float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)
        if dev_X is not None:
            predictions = infer_categorical_covariate(model, network_architecture, dev_X, dev_labels)
            accuracy = float(np.sum(predictions == np.argmax(dev_covariates, axis=1)) / float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)
        if test_X is not None:
            if final_evaluate:
                predictions = infer_categorical_covariate(model, network_architecture, test_X, test_labels)
                accuracy = float(np.sum(predictions == np.argmax(test_covariates, axis=1)) / float(len(test_covariates)))
                print("Test accuracy on covariates = %0.4f" % accuracy)

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, network_architecture, train_X, train_labels, train_covariates, output_dir, subset='train')
        if dev_X is not None:
            predict_labels_and_evaluate(model, network_architecture, dev_X, dev_labels, dev_covariates, output_dir, subset='dev')
        if test_X is not None:
            if final_evaluate:
                predict_labels_and_evaluate(model, network_architecture, test_X, test_labels, test_covariates, output_dir, subset='test')

    # Print associations between topics and labels (only for small label sets)
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            # one-hot topic vector: what does topic k alone predict?
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

    # probe each (covariate, topic) pair and save the prediction probabilities
    if n_covariates > 0:
        all_probs = np.zeros([n_covariates, n_topics])
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            Y = None
            for c in range(n_covariates):
                C = np.zeros([1, n_covariates]).astype('float32')
                C[0, c] = 1.0
                probs = model.predict_from_topics(Z, C)
                all_probs[c, k] = probs[0, 0]
        np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'), probs=all_probs)

    # save document representations (topic proportions)
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    np.savez(os.path.join(output_dir, 'train.theta.npz'), theta=theta)
    if dev_X is not None:
        if dev_labels is None:
            dev_Y = None
        else:
            # zeroed labels: infer theta without revealing true dev labels
            dev_Y = np.zeros_like(dev_labels)
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        np.savez(os.path.join(output_dir, 'dev.theta.npz'), theta=theta)
    if n_test > 0:
        if final_evaluate:
            if test_labels is None:
                test_Y = None
            else:
                test_Y = np.zeros_like(test_labels)
            theta = model.compute_theta(test_X, test_Y, test_covariates)
            np.savez(os.path.join(output_dir, 'test.theta.npz'), theta=theta)
def get_oov_count_filename(self):
    """Return the path of the OOV-counts JSON file, ensuring its directory exists."""
    target_dir = fh.makedirs(self.dirname)
    return fh.make_filename(target_dir, 'oov_counts', 'json')
def get_feature_filename(self):
    """Return the path of the feature-counts pickle file, ensuring its directory exists."""
    target_dir = fh.makedirs(self.dirname)
    return fh.make_filename(target_dir, 'counts', 'pkl')
def get_index_filename(self):
    """Return the path of the index JSON file, ensuring its directory exists."""
    target_dir = fh.makedirs(self.get_dirname())
    return fh.make_filename(target_dir, 'index', 'json')
def get_vocab_filename(self):
    """Return the path of the vocabulary JSON file, ensuring its directory exists."""
    target_dir = fh.makedirs(self.get_dirname())
    return fh.make_filename(target_dir, 'vocab', 'json')
def main():
    """Train a Scholar topic model with optional labels and covariates.

    Parses command-line options, loads training data (optionally holding out a
    dev fold) and test data, trains the model, then prints/saves topics,
    covariate deviations, perplexities, prediction accuracies, and
    document-topic proportions (theta).
    """
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-a', dest='alpha', default=1.0,
        help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option(
        '-k', dest='n_topics', default=20,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b', dest='batch_size', default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m', dest='momentum', default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('-e', dest='epochs', default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--emb_dim', dest='embedding_dim', default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option(
        '--labels', dest='label_name', default=None,
        help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')
    parser.add_option(
        '--covars', dest='covar_names', default=None,
        help='Read covars from files with these names (comma-separated): default=%default')
    parser.add_option(
        '--label_emb_dim', dest='label_emb_dim', default=-1,
        help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--covar_emb_dim', dest='covar_emb_dim', default=-1,
        help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--min_covar_count', dest='min_covar_count', default=None,
        help='Drop binary covariates that occur less than this in training: default=%default')
    parser.add_option(
        '--c_layers', dest='classifier_layers', default=1,
        help='Number of layers in (generative) classifier [0|1|2]: default=%default')
    parser.add_option('-t', dest='test_prefix', default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('-o', dest='output_dir', default='output',
                      help='Output directory: default=%default')
    parser.add_option(
        '--w2v', dest='word2vec_file', default=None,
        help='Use this word2vec .bin file to initialize and fix embeddings: default=%default')
    parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False,
                      help='Update background parameters: default=%default')
    parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option(
        '--no_bn_anneal', action="store_true", dest="no_bn_anneal", default=False,
        help='Do not anneal away from batchnorm: default=%default')
    parser.add_option(
        '--test_samples', dest='test_samples', default=20,
        help='Number of samples to use in computing test perplexity: default=%default')
    parser.add_option('--dev_folds', dest='dev_folds', default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev_fold', dest='dev_fold', default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]
    # coerce optparse string values into their intended types
    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    embedding_dim = int(options.embedding_dim)
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    test_prefix = options.test_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    test_samples = int(options.test_samples)
    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)

    rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, train_labels, label_names, label_type, train_covariates, covariate_names, covariates_type = load_data(
        input_dir, train_prefix, label_file_name, covar_file_names)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        # optionally drop covariates that are too rare in training
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [
                name for i, name in enumerate(covariate_names)
                if covariate_selector[i]
            ]
            n_covariates = len(covariate_names)
    else:
        n_covariates = 0

    # split into train and dev (hold out one of dev_folds shuffled folds)
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        # the last fold absorbs any remainder from the integer division
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        if train_labels is not None:
            dev_labels = train_labels[dev_indices, :]
            train_labels = train_labels[train_indices, :]
        else:
            dev_labels = None
        if train_covariates is not None:
            dev_covariates = train_covariates[dev_indices, :]
            train_covariates = train_covariates[train_indices, :]
        else:
            dev_covariates = None
        n_train = len(train_indices)
    else:
        dev_X = None
        dev_labels = None
        dev_covariates = None
        n_dev = 0

    # load the test data, reusing the training vocabulary
    if test_prefix is not None:
        test_X, _, test_labels, _, _, test_covariates, _, _ = load_data(
            input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels
        if test_covariates is not None:
            # apply the same rare-covariate filter chosen on training data
            if min_covar_count is not None and int(min_covar_count) > 0:
                test_covariates = test_covariates[:, covariate_selector]
            _, n_covariates_test = test_covariates.shape
            assert n_covariates_test == n_covariates
    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_covariates = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if no_bg:
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(dv, embedding_dim, n_topics, label_type,
                                        n_labels, label_emb_dim,
                                        covariates_type, n_covariates,
                                        covar_emb_dim, classifier_layers)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        # random init in [-0.25, 0.25) for words missing from the pretrained vectors
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5,
                              dtype=np.float32)
        count = 0
        print("Loading word vectors")
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(
            word2vec_file, binary=True)
        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]
        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    # create the model
    model = Scholar(network_architecture, alpha=alpha, learning_rate=learning_rate,
                    init_embeddings=embeddings, update_embeddings=update_embeddings,
                    init_bg=init_bg, update_background=update_background,
                    adam_beta1=adam_beta1)

    # train the model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels,
                  train_covariates, training_epochs=n_epochs,
                  batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels,
                  C_dev=dev_covariates, bn_anneal=bn_anneal)

    # make output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics and save topic-word weights plus summary statistics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)
    fh.write_list_to_text(['{:.4f}'.format(maw)],
                          os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)],
                          os.path.join(output_dir, 'sparsity.txt'))

    # print and save covariate deviations
    if n_covariates > 0:
        beta_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(beta_c, vocab, covariate_names)
        else:
            maw, sparsity = print_top_words(beta_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if output_dir is not None:
            np.savez(os.path.join(output_dir, 'beta_c.npz'),
                     beta=beta_c, names=covariate_names)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, dev_labels,
                                         dev_covariates, eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.dev.txt'))
    if test_X is not None:
        perplexity = evaluate_perplexity(model, test_X, test_labels,
                                         test_covariates, eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on predicting categorical covariates
    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture,
                                                  train_X, train_labels)
        accuracy = float(
            np.sum(predictions == np.argmax(train_covariates, axis=1)) /
            float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)
        if output_dir is not None:
            fh.write_list_to_text([str(accuracy)],
                                  os.path.join(output_dir, 'accuracy.train.txt'))
        if dev_X is not None:
            predictions = infer_categorical_covariate(model, network_architecture,
                                                      dev_X, dev_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(dev_covariates, axis=1)) /
                float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir, 'accuracy.dev.txt'))
        if test_X is not None:
            predictions = infer_categorical_covariate(model, network_architecture,
                                                      test_X, test_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(test_covariates, axis=1)) /
                float(len(test_covariates)))
            print("Test accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir, 'accuracy.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, train_X, train_labels,
                                    train_covariates, output_dir, subset='train')
        if dev_X is not None:
            predict_labels_and_evaluate(model, dev_X, dev_labels,
                                        dev_covariates, output_dir, subset='dev')
        if test_X is not None:
            predict_labels_and_evaluate(model, test_X, test_labels,
                                        test_covariates, output_dir, subset='test')

    # Print associations between topics and labels (only for small label sets)
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            # one-hot topic vector: what does topic k alone predict?
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            Y = None
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

    # probe each (covariate, topic) pair and save the prediction probabilities
    if n_covariates > 0:
        all_probs = np.zeros([n_covariates, n_topics])
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            Y = None
            for c in range(n_covariates):
                C = np.zeros([1, n_covariates]).astype('float32')
                C[0, c] = 1.0
                probs = model.predict_from_topics(Z, C)
                all_probs[c, k] = probs[0, 0]
        np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'),
                 probs=all_probs)

    # save document representations (topic proportions)
    print("Getting topic proportions")
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    print("Saving topic proportions")
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)

    if dev_X is not None:
        # zeroed labels: infer theta without revealing true dev labels
        # NOTE(review): unlike the other main() above, dev_labels may be None
        # here and np.zeros_like(None) yields a 0-d object array — confirm
        # compute_theta tolerates that.
        dev_Y = np.zeros_like(dev_labels)
        print("Getting topic proportions for dev data")
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.dev.npz'), theta=theta)

    if n_test > 0:
        test_Y = np.zeros_like(test_labels)
        print("Getting topic proportions for test data")
        theta = model.compute_theta(test_X, test_Y, test_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.test.npz'), theta=theta)
def main() -> None:
    """Command-line entry point: train a Scholar topic model on a preprocessed corpus.

    Reads word counts (and optionally labels / prior covariates / topic
    covariates) from ``input_dir``, optionally splits off a dev fold, trains
    the model, then writes weights, perplexities, label-prediction accuracy,
    topic/label associations, and document representations to the output
    directory.
    """
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-k', dest='n_topics', type=int, default=20,
                      help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-l', dest='learning_rate', type=float, default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m', dest='momentum', type=float, default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('--batch-size', dest='batch_size', type=int, default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('--epochs', type=int, default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--train-prefix', type=str, default='train',
                      help='Prefix of train set: default=%default')
    parser.add_option('--test-prefix', type=str, default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('--labels', type=str, default=None,
                      help='Read labels from input_dir/[train|test].labels.csv: default=%default')
    parser.add_option('--prior-covars', type=str, default=None,
                      help='Read prior covariates from files with these names (comma-separated): default=%default')
    parser.add_option('--topic-covars', type=str, default=None,
                      help='Read topic covariates from files with these names (comma-separated): default=%default')
    parser.add_option('--interactions', action="store_true", default=False,
                      help='Use interactions between topics and topic covariates: default=%default')
    # NOTE(review): "dataa" below is a typo in the user-visible help text
    # (appears twice); left untouched here since it is a runtime string.
    parser.add_option('--min-prior-covar-count', type=int, default=None,
                      help='Drop prior covariates with less than this many non-zero values in the training dataa: default=%default')
    parser.add_option('--min-topic-covar-count', type=int, default=None,
                      help='Drop topic covariates with less than this many non-zero values in the training dataa: default=%default')
    parser.add_option('--l1-topics', type=float, default=0.0,
                      help='Regularization strength on topic weights: default=%default')
    parser.add_option('--l1-topic-covars', type=float, default=0.0,
                      help='Regularization strength on topic covariate weights: default=%default')
    parser.add_option('--l1-interactions', type=float, default=0.0,
                      help='Regularization strength on topic covariate interaction weights: default=%default')
    parser.add_option('--l2-prior-covars', type=float, default=0.0,
                      help='Regularization strength on prior covariate weights: default=%default')
    parser.add_option('-o', dest='output_dir', type=str, default='output',
                      help='Output directory: default=%default')
    parser.add_option('--emb-dim', type=int, default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option('--w2v', dest='word2vec_file', type=str, default=None,
                      help='Use this word2vec .bin file to initialize and fix embeddings: default=%default')
    parser.add_option('--alpha', type=float, default=1.0,
                      help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('--no-bg', action="store_true", default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--dev-folds', type=int, default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option('--dev-fold', type=int, default=0,
                      help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--device', type=int, default=None,
                      help='GPU to use: default=%default')
    parser.add_option('--seed', type=int, default=None,
                      help='Random seed: default=%default')

    options, args = parser.parse_args()

    # NOTE(review): no check that a positional arg was supplied; a missing
    # input_dir raises IndexError rather than a usage message.
    input_dir = args[0]

    # Seed the RNG explicitly if requested; otherwise draw a random seed so
    # the run is still driven by a single RandomState instance.
    if options.seed is not None:
        rng = np.random.RandomState(options.seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, row_selector = load_word_counts(input_dir, options.train_prefix)
    train_labels, label_type, label_names, n_labels = load_labels(input_dir, options.train_prefix, row_selector, options)
    train_prior_covars, prior_covar_selector, prior_covar_names, n_prior_covars = load_covariates(input_dir, options.train_prefix, row_selector, options.prior_covars, options.min_prior_covar_count)
    train_topic_covars, topic_covar_selector, topic_covar_names, n_topic_covars = load_covariates(input_dir, options.train_prefix, row_selector, options.topic_covars, options.min_topic_covar_count)
    # Stash derived sizes on the options object so make_network() can see them.
    options.n_train, vocab_size = train_X.shape
    options.n_labels = n_labels

    if n_labels > 0:
        print("Train label proportions:", np.mean(train_labels, axis=0))

    # split into training and dev if desired
    train_indices, dev_indices = train_dev_split(options, rng)
    train_X, dev_X = split_matrix(train_X, train_indices, dev_indices)
    train_labels, dev_labels = split_matrix(train_labels, train_indices, dev_indices)
    train_prior_covars, dev_prior_covars = split_matrix(train_prior_covars, train_indices, dev_indices)
    train_topic_covars, dev_topic_covars = split_matrix(train_topic_covars, train_indices, dev_indices)

    n_train, _ = train_X.shape

    # load the test data
    if options.test_prefix is not None:
        # Note: row_selector is deliberately rebound here to the test set's
        # selector and reused for the test label/covariate loads below.
        test_X, _, row_selector = load_word_counts(input_dir, options.test_prefix, vocab=vocab)
        test_labels, _, _, _ = load_labels(input_dir, options.test_prefix, row_selector, options)
        # Reuse the selectors fitted on the training covariates so test
        # columns line up with the training columns.
        test_prior_covars, _, _, _ = load_covariates(input_dir, options.test_prefix, row_selector, options.prior_covars, covariate_selector=prior_covar_selector)
        test_topic_covars, _, _, _ = load_covariates(input_dir, options.test_prefix, row_selector, options.topic_covars, covariate_selector=topic_covar_selector)
        n_test, _ = test_X.shape
    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_prior_covars = None
        test_topic_covars = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if options.no_bg:
        # --no-bg: zero out the background log-frequency term entirely
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(options, vocab_size, label_type, n_labels, n_prior_covars, n_topic_covars)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load word vectors
    embeddings, update_embeddings = load_word_vectors(options, rng, vocab)

    # create the model
    model = Scholar(network_architecture, alpha=options.alpha, learning_rate=options.learning_rate, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, adam_beta1=options.momentum, device=options.device)

    # train the model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, train_prior_covars, train_topic_covars, training_epochs=options.epochs, batch_size=options.batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, PC_dev=dev_prior_covars, TC_dev=dev_topic_covars)

    # make output directory
    fh.makedirs(options.output_dir)

    # display and save weights
    print_and_save_weights(options, model, vocab, prior_covar_names, topic_covar_names)

    # Evaluate perplexity on dev and test data
    # (eta_bn_prop=0.0: evaluate with batchnorm fully annealed away)
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.batch_size, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(options.output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.batch_size, eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(options.output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, train_X, train_labels, train_prior_covars, train_topic_covars, options.output_dir, subset='train')
        if dev_X is not None:
            predict_labels_and_evaluate(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.output_dir, subset='dev')
        if test_X is not None:
            predict_labels_and_evaluate(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.output_dir, subset='test')

    # print label probabilities for each topic
    print_topic_label_associations(options, label_names, model, n_prior_covars, n_topic_covars)

    # save document representations
    print("Saving document representations")
    save_document_representations(model, train_X, train_labels, train_prior_covars, train_topic_covars, options.output_dir, 'train', batch_size=options.batch_size)

    if dev_X is not None:
        save_document_representations(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.output_dir, 'dev', batch_size=options.batch_size)

    if n_test > 0:
        save_document_representations(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.output_dir, 'test', batch_size=options.batch_size)
def _setup_dirs() -> None:
    """Create the project directory tree under the module-global ``base_dir``
    and publish each path as a module-level global.

    Relies on ``base_dir`` having been set first (see ``make_base_dir``).
    ``fh.makedirs`` appears to join its arguments, create the directory, and
    return the resulting path — TODO confirm against the ``fh`` helper module.
    """
    global base_dir
    global data_dir
    global features_dir
    #global features_dir2
    global data_raw_dir
    global data_raw_labels_dir
    global data_raw_index_dir
    #global data_raw_text_dir
    #global data_raw_text_file
    global data_raw_sentences_dir
    global data_subsets_dir
    # NOTE(review): splits_file_name is declared global but never assigned in
    # this function — presumably set elsewhere, or dead; verify.
    global splits_file_name
    #global data_splits_dir
    global data_processed_dir
    global data_processed_text_dir
    global data_processed_text_file
    global data_processed_brown_dir
    global data_processed_wikilinks_dir
    global data_processed_phrasemachine_dir
    global data_stanford_dir
    global data_semafor_dir
    global data_amalgram_dir
    global data_wordnet_domains_dir
    global lda_dir
    global persona_dir
    global persona_dir_mentions
    global persona_dir_all
    global exp_dir

    # Top-level layout: data/, features/, experiments/
    data_dir = fh.makedirs(base_dir, 'data')
    features_dir = fh.makedirs(base_dir, 'features')
    #features_dir2 = fh.makedirs(base_dir, 'features2')

    # Raw inputs
    data_raw_dir = fh.makedirs(data_dir, 'raw')
    data_raw_labels_dir = fh.makedirs(data_raw_dir, 'labels')
    data_raw_index_dir = fh.makedirs(data_raw_dir, 'index')
    #data_raw_text_dir = fh.makedirs(data_raw_dir, 'text')
    #data_raw_text_file = os.path.join(data_raw_text_dir, 'sentences.json')
    data_raw_sentences_dir = fh.makedirs(data_raw_dir, 'sentences')
    data_subsets_dir = fh.makedirs(data_dir, 'subsets')
    #data_splits_file = fh.make_filename(data_subsets_dir, 'splits', 'csv')

    # Processed outputs, one subdirectory per tool/annotation layer
    data_processed_dir = fh.makedirs(data_dir, 'processed')
    data_processed_text_dir = fh.makedirs(data_processed_dir, 'text')
    data_processed_brown_dir = fh.makedirs(data_processed_dir, 'brown')
    data_processed_wikilinks_dir = fh.makedirs(data_processed_dir, 'wikilinks')
    data_processed_phrasemachine_dir = fh.makedirs(data_processed_dir, 'phrasemachine')
    data_stanford_dir = fh.makedirs(data_processed_dir, 'stanford')
    # Note: the "processed text" file lives under the stanford dir, not text/.
    data_processed_text_file = os.path.join(data_stanford_dir, 'words.json')
    data_semafor_dir = fh.makedirs(data_processed_dir, 'semafor')
    data_amalgram_dir = fh.makedirs(data_processed_dir, 'amalgram')
    data_wordnet_domains_dir = fh.makedirs(data_processed_dir, 'wordnet_domains')
    lda_dir = fh.makedirs(data_processed_dir, 'lda')
    persona_dir = fh.makedirs(data_processed_dir, 'personas')
    persona_dir_mentions = fh.makedirs(data_processed_dir, 'personas_mentions')
    persona_dir_all = fh.makedirs(data_processed_dir, 'personas_all')
    exp_dir = fh.makedirs(base_dir, 'experiments')
def main() -> None:
    """Command-line entry point: train a VaeTm (VAE topic model, TensorFlow)
    on a preprocessed corpus.

    Loads train/test word counts and optional labels, optionally holds out a
    dev fold, initializes word embeddings from a pretrained gensim model,
    trains, then writes topics, background, sparsity/MAW scores, perplexities,
    label-prediction accuracy, document representations, and NPMI to the
    output directory. Most hyperparameters are hard-coded below rather than
    exposed as command-line options.
    """
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-k', dest='n_topics', default=100,
                      help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-r', action="store_true", dest="regularize", default=False,
                      help='Apply adaptive regularization for sparsity in topics: default=%default')
    parser.add_option('-o', dest='output_dir', default='output',
                      help='Output directory: default=%default')
    parser.add_option('--vocab-size', dest='vocab_size', default=None,
                      help='Filter the vocabulary keeping the most common n words: default=%default')
    parser.add_option('--no-bg', action="store_true", dest="no_bg", default=False,
                      help='Do not use background freq: default=%default')
    # NOTE(review): --no-bn-anneal is parsed but options.no_bn_anneal is never
    # read below; bn_anneal is hard-coded True — verify intent.
    parser.add_option('--no-bn-anneal', action="store_true", dest="no_bn_anneal", default=False,
                      help='Do not anneal away from batchnorm: default=%default')
    parser.add_option('--opt', dest='optimizer', default='adam',
                      help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--dev-folds', dest='dev_folds', default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option('--dev-fold', dest='dev_fold', default=0,
                      help='Fold to use as dev (if dev_folds > 0): default=%default')
    # NOTE(review): --test-prefix is parsed but unused; test_prefix is
    # hard-coded to 'test' further down.
    parser.add_option('--test-prefix', dest='test_prefix', default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('--labels', dest='label_name', default=None,
                      help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]

    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    label_file_name = options.label_name

    # Hard-coded training hyperparameters (not exposed on the command line).
    alpha = 1.0
    n_topics = int(options.n_topics)
    batch_size = 200
    # learning_rate = 0.002
    learning_rate = 0.001
    adam_beta1 = 0.99
    n_epochs = 450
    encoder_layers = 1  # Number of encoder layers [0|1|2]
    encoder_shortcuts = False
    classifier_layers = 1  # [0|1|2]
    auto_regularize = options.regularize
    output_dir = options.output_dir
    # word2vec_file = "/home/lcw2/share/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
    word2vec_file = "../embeddings/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.bin"
    # word2vec_file = "C:\\\\Soft\\share\\GoogleNews-vectors-negative300.bin"
    embedding_dim = 200
    vocab_size = options.vocab_size
    update_background = False
    no_bg = options.no_bg
    bn_anneal = True
    optimizer = options.optimizer
    seed = 1
    threads = 4
    # seed is hard-coded to 1 above, so the else branch is currently dead.
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # kb embedding file
    # kb2vec_file = "/home/lcw2/github/my_vaetm/data/kb2vec/WikiData.KB.100d.zh.pickle"
    kb2vec_file = "./data/kb2vec/WikiData.KB.100d.zh.v2.pickle"
    kb_dim = 100
    test_prefix = 'test'

    # load the training data
    train_prefix = 'train'
    train_X, vocab, train_labels, label_names, label_type, col_sel, num = load_data(input_dir, train_prefix, label_file_name, vocab_size=vocab_size)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
        print('n_labels:', n_labels)
    else:
        n_labels = 0

    # test_prefix is hard-coded 'test' above, so this branch always runs;
    # if it did not, test_X below would be undefined (NameError).
    if test_prefix == 'test':
        test_X, _, test_labels, _, _, _, _ = load_data(input_dir, test_prefix, label_file_name, vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels

    # split training data into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        # The last fold absorbs any remainder rows.
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        n_train = len(train_indices)
        # NOTE(review): only the word counts are split — train_labels is NOT
        # re-indexed and dev_labels is never defined, so the label-accuracy
        # branch below would raise NameError on dev_labels when dev_folds > 0
        # and labels are present. Verify/fix upstream of a release.
    else:
        dev_X = None

    # initialize the background using the overall frequency of terms
    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    # if no_bg:
    #     if n_topics == 1:
    #         init_beta = init_bg.copy()
    #         init_beta = init_beta.reshape([1, len(vocab)])
    #         update_beta = False
    #     init_bg = np.zeros_like(init_bg)

    label_emb_dim = -1

    # create the network configuration
    network_architecture = make_network(dv, encoder_layers, embedding_dim, n_topics, encoder_shortcuts, label_type, n_labels, label_emb_dim, classifier_layers)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        # vocab_size is rebound here from the CLI option to the actual
        # vocabulary length.
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        # Random uniform init for words without a pretrained vector.
        embeddings = np.array(rng.rand(vocab_size, embedding_dim) * 0.25 - 0.5, dtype=np.float32)
        count = 0
        print("Loading word vectors")
        # Files ending in 'bin' are loaded as a saved gensim KeyedVectors
        # object; anything else as text word2vec format.
        # NOTE(review): for a true binary word2vec file one would expect
        # load_word2vec_format(..., binary=True); KeyedVectors.load() only
        # works on gensim's own native save format — confirm the file type.
        if word2vec_file[-3:] == 'bin':
            pretrained = gensim.models.KeyedVectors.load(word2vec_file)
        else:
            pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found word embeddings for %d words" % count)
        print('shape of word embeddings:', embeddings.shape)
    else:
        print("No embeddings for words!")
        exit()

    # load pretrained entity vectors
    # if kb2vec_file is not None:
    #     vocab_size = len(vocab)
    #     vocab_dict = dict(zip(vocab, range(vocab_size)))
    #     entity_embeddings = np.array(rng.rand(vocab_size, kb_dim) * 0.25 - 0.5, dtype=np.float32)
    #     count = 0
    #
    #     print("Loading emtity vectors...")
    #     pretrained = None
    #     with open(kb2vec_file, 'rb') as f:
    #         pretrained = pickle.load(f)
    #     print('# of entities:', len(pretrained))
    #     vocab_counter = collections.Counter()
    #     vocab_counter.update(s for s in num if s in pretrained)
    #     print(vocab_counter.most_common(10))
    #     h = open('./output/topics.txt', 'r', encoding='utf-8')
    #     read_data = h.read()
    #     a = read_data.split()
    #     print('#of topic',len(a))
    #     for word, index in vocab_dict.items():
    #         if word in pretrained and word in a:
    #             print(word)
    #         if word in pretrained:
    #
    #         elif word in pretrained and word not in a:
    #             count += 1
    #             entity_embeddings[index, :] = pretrained[word]
    #
    #     print("Found entity embeddings for %d words" % count)
    #     print('shape of entity embeddings:', entity_embeddings.shape)
    # else:
    #     print("No embeddings for knowledge entities!")
    #     exit()

    # Clear any graph left over from a previous run in this process
    # (TF1-style API).
    tf.reset_default_graph()

    # create the model
    # NOTE(review): the word embeddings loaded above are currently NOT passed
    # to the model (init_embeddings is commented out) — confirm this is
    # intentional.
    model = VaeTm(network_architecture, alpha=alpha,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  # init_embeddings=embeddings,
                  # entity_embeddings=entity_embeddings,
                  init_bg=init_bg,
                  update_background=update_background, init_beta=init_beta,
                  update_beta=update_beta, threads=threads,
                  regularize=auto_regularize, optimizer=optimizer,
                  adam_beta1=adam_beta1, seed=seed)

    # train the model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, vocab, regularize=auto_regularize, training_epochs=n_epochs, batch_size=batch_size, rng=rng, bn_anneal=bn_anneal, X_dev=dev_X)

    # create output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity, topics = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt'))

    # print('Predicting training representations...')
    # reps, preds = model.predict(train_X)
    # # print('rep-0:', reps[0])
    # # print('rep-0:', reps[1])
    # fh.write_matrix_to_text(reps, os.path.join(output_dir, 'train_representation.txt'))
    # if test_X is not None:
    #     print('Predicting testing representations...')
    #     reps, preds = model.predict(test_X)
    #     # print('rep-0:', reps[0])
    #     # print('rep-0:', reps[1])
    #     fh.write_matrix_to_text(reps, os.path.join(output_dir, 'test_representation.txt'))

    # Evaluate perplexity on dev and test dataa
    # NOTE(review): the dev and test calls below pass different arguments
    # (the test call adds test_labels as a positional arg) — one of the two
    # is likely wrong; check evaluate_perplexity's signature.
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model, test_X, test_labels, eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, train_X, train_labels, None, output_dir, subset='train')
        if dev_X is not None:
            # NOTE(review): dev_labels is never defined in this function —
            # see the dev-split note above.
            predict_labels_and_evaluate(model, dev_X, dev_labels, None, output_dir, subset='dev')
        if test_X is not None:
            predict_labels_and_evaluate(model, test_X, test_labels, None, output_dir, subset='test')

    # save document representations
    theta = model.compute_theta(train_X, train_labels)
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)

    # Topic-coherence (NPMI) evaluation on the training counts.
    compute_npmi_at_n(topics, vocab, train_X)