Example #1
def main():
    usage = "%prog infiles.jsonlist[.gz]"
    parser = OptionParser(usage=usage)
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option('-w',
                      dest='target_word',
                      default='mass shooting',
                      help='Target word: default=%default')

    (options, args) = parser.parse_args()
    infiles = args
    output_dir = options.output_dir
    target_word = options.target_word

    if not os.path.exists(output_dir):
        fh.makedirs(output_dir)

    n_articles_per_day = defaultdict(int)
    target_count_per_day = defaultdict(int)
    for f in infiles:
        print(f)
        articles = fh.read_jsonlist(f)
        print(len(articles))
        for i, article in enumerate(articles):
            if i % 10000 == 0 and i > 0:
                print(i)
            year = int(article['year'])
            month = int(article['month'])
            day = int(article['day'])
            date = datetime.date(year=year, month=month, day=day)
            ordinal_date = date.toordinal()
            n_articles_per_day[ordinal_date] += 1
            text = ''
            if 'headline' in article:
                text += article['headline'] + '\n'
            if 'body' in article:
                text += article['body']
            if 'text' in article:
                text += article['text']

            text = ' ' + clean_text(text, lower=True) + ' '
            if target_word in text:
                if 'film' not in text and 'game' not in text:
                    target_count_per_day[ordinal_date] += 1

    fh.write_to_json(n_articles_per_day,
                     os.path.join(output_dir, 'articles_per_day.json'))
    fh.write_to_json(target_count_per_day,
                     os.path.join(output_dir, 'target_counts_per_day.json'))
Example #2
def download_articles(name, categories, subset):

    data = []
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())

    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data.append({'text': line, 'group': newsgroups_data['target_names'][newsgroups_data['target'][i]]})

    print(len(data))
    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist'))
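One possible way to invoke the function above to fetch and save both splits of the 20 Newsgroups corpus (the dataset name 'all' and categories=None, meaning all groups, are illustrative choices, not taken from the original project):

# Illustrative driver for download_articles(); name and categories are assumptions.
if __name__ == '__main__':
    download_articles(name='all', categories=None, subset='train')
    download_articles(name='all', categories=None, subset='test')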
Example #3
def make_base_dir(project_name=None):
    global base_dir
    cwd = os.getcwd()
    parts = os.path.split(cwd)
    print(parts)
    assert parts[-1] == 'guac'
    base_dir = parts[0]
    base_dir = fh.makedirs(os.path.join(base_dir, 'datasets', project_name))
    if os.path.exists(base_dir):
        _setup_dirs()
    else:
        sys.exit("Error: base directory " + base_dir + " does not exist!")
Example #4
def set_project(project_name, splits_filename):
    global base_dir
    cwd = os.getcwd()
    parts = os.path.split(cwd)
    print(parts)
    assert parts[-1] == 'guac'
    base_dir = parts[0]
    base_dir = fh.makedirs(os.path.join(base_dir, 'datasets', project_name))
    if os.path.exists(base_dir):
        _setup_dirs()
        global splits_file_name
        splits_file_name = splits_filename
        global data_splits_file
        data_splits_file = os.path.join(data_subsets_dir, splits_filename + '.csv')
    else:
        sys.exit("Error: base directory " + base_dir + " does not exist!")
Example #5
def main():
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=1.0,
                      help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('-k', dest='n_topics', default=20,
                      help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b', dest='batch_size', default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m', dest='momentum', default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('-e', dest='epochs', default=250,
                      help='Number of epochs: default=%default')
    parser.add_option('--en_layers', dest='encoder_layers', default=1,
                      help='Number of encoder layers [0|1|2]: default=%default')
    parser.add_option('--emb_dim', dest='embedding_dim', default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option('--en_short', action="store_true", dest="encoder_shortcuts", default=False,
                      help='Use shortcut connections on encoder: default=%default')
    parser.add_option('--labels', dest='label_name', default=None,
                      help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')
    parser.add_option('--covars', dest='covar_names', default=None,
                      help='Read covars from files with these names (comma-separated): default=%default')
    parser.add_option('--label_emb_dim', dest='label_emb_dim', default=0,
                      help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option('--covar_emb_dim', dest='covar_emb_dim', default=0,
                      help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option('--min_covar_count', dest='min_covar_count', default=None,
                      help='Drop binary covariates that occur less than this in training: default=%default')
    parser.add_option('--covar_inter', action="store_true", dest="covar_interactions", default=False,
                      help='Use covariate interactions in model: default=%default')
    parser.add_option('--c_layers', dest='classifier_layers', default=1,
                      help='Number of layers in (generative) classifier [0|1|2]: default=%default')
    parser.add_option('--exclude_covars', action="store_true", dest="exclude_covars", default=False,
                      help='Exclude covariates from the classifier: default=%default')
    parser.add_option('-r', action="store_true", dest="regularize", default=False,
                      help='Apply adaptive regularization for sparsity in topics: default=%default')
    parser.add_option('-t', dest='test_prefix', default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('-f', dest='final_evaluate', default=None,
                      help='Perform final evaluation on the test set')
    parser.add_option('-d', dest='dev_prefix', default=None,
                      help='Prefix of dev set: default=%default')
    parser.add_option('-o', dest='output_dir', default='output',
                      help='Output directory: default=%default')
    parser.add_option('--w2v', dest='word2vec_file', default=None,
                      help='Use this word2vec .bin file to initialize and fix embeddings: default=%default')
    parser.add_option('--vocab_size', dest='vocab_size', default=None,
                      help='Filter the vocabulary keeping the most common n words: default=%default')
    parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False,
                      help='Update background parameters: default=%default')
    parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--no_bn_anneal', action="store_true", dest="no_bn_anneal", default=False,
                      help='Do not anneal away from batchnorm: default=%default')
    parser.add_option('--dev_folds', dest='dev_folds', default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--opt', dest='optimizer', default='adam',
                      help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--threads', dest='threads', default=8,
                      help='Use this to limit the number of CPUs: default=%default')
    parser.add_option('--seed', dest='seed', default=None,
                      help='Random seed: default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]

    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    encoder_layers = int(options.encoder_layers)
    embedding_dim = int(options.embedding_dim)
    encoder_shortcuts = options.encoder_shortcuts
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    use_covar_interactions = options.covar_interactions
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    covars_in_classifier = not options.exclude_covars
    auto_regularize = options.regularize
    test_prefix = options.test_prefix
    dev_prefix = options.dev_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    vocab_size = options.vocab_size
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    dev_folds = int(options.dev_folds)
    final_evaluate = options.final_evaluate
    dev_fold = int(options.dev_fold)
    optimizer = options.optimizer
    seed = options.seed
    threads = int(options.threads)
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    train_X, vocab, train_labels, label_names, na_label_index, label_type, train_covariates, covariate_names, covariates_type, col_sel = load_data(input_dir, train_prefix, label_file_name, covar_file_names, vocab_size=vocab_size)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
        # convert binary labels to a single dimensional vector
        #if binary and n_classes == 2 and not generative:
        #    train_labels = np.argmax(train_labels, axis=1)
        #    train_labels = train_labels.reshape((n_train, 1))
        #    n_classes = 1
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [name for i, name in enumerate(covariate_names) if covariate_selector[i]]
            n_covariates = len(covariate_names)

    else:
        n_covariates = 0

    if dev_prefix is not None:
        dev_X, _, dev_labels, _, _, _, dev_covariates, _, _, _ = load_data(input_dir, dev_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
        n_dev, _ = dev_X.shape
        if dev_labels is not None:
            _, n_labels_dev = dev_labels.shape
            assert n_labels_dev == n_labels
            #if binary and n_classes == 2 and not generative:
            #    test_labels = np.argmax(test_labels, axis=1)
            #    test_labels = test_labels.reshape((n_test, 1))
            #    n_classes = 1
        if dev_covariates is not None:
            if min_covar_count is not None and int(min_covar_count) > 0:
                dev_covariates = dev_covariates[:, covariate_selector]
            _, n_covariates_dev = dev_covariates.shape
            assert n_covariates_dev == n_covariates

    else:
        dev_X = None
        n_dev = 0
        dev_labels = None
        dev_covariates = None

    # initialize test data to empty defaults so the later checks are safe even
    # when no test prefix is given
    test_X = None
    n_test = 0
    test_labels = None
    test_covariates = None
    if test_prefix is not None:
        if final_evaluate:
            test_X, _, test_labels, _, _, _, test_covariates, _, _, _ = load_data(input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
            n_test, _ = test_X.shape
            if test_labels is not None:
                _, n_labels_test = test_labels.shape
                assert n_labels_test == n_labels
                #if binary and n_classes == 2 and not generative:
                #    test_labels = np.argmax(test_labels, axis=1)
                #    test_labels = test_labels.reshape((n_test, 1))
                #    n_classes = 1
            if test_covariates is not None:
                if min_covar_count is not None and int(min_covar_count) > 0:
                    test_covariates = test_covariates[:, covariate_selector]
                _, n_covariates_test = test_covariates.shape
                assert n_covariates_test == n_covariates

        else:
            test_X = None
            n_test = 0
            test_labels = None
            test_covariates = None

    is_labeled = pd.read_csv(os.path.join(input_dir, "train.is_labeled.csv"), names=['labeled']).labeled

    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    if no_bg:
        if n_topics == 1:
            init_beta = init_bg.copy()
            init_beta = init_beta.reshape([1, len(vocab)])
            update_beta = False
        init_bg = np.zeros_like(init_bg)


    network_architecture = make_network(dv, encoder_layers, embedding_dim,
                                        n_topics, encoder_shortcuts, label_type, n_labels, label_emb_dim,
                                        covariates_type, n_covariates, covar_emb_dim, use_covar_interactions,
                                        classifier_layers, covars_in_classifier)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5, dtype=np.float32)
        count = 0
        print("Loading word vectors")
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    tf.reset_default_graph()

    model = Student(network_architecture, alpha=alpha, learning_rate=learning_rate, batch_size=batch_size, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, update_background=update_background, init_beta=init_beta, update_beta=update_beta, threads=threads, regularize=auto_regularize, optimizer=optimizer, adam_beta1=adam_beta1, seed=seed)

    # train full model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, train_covariates, is_labeled=is_labeled, regularize=auto_regularize, training_epochs=n_epochs, batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, C_dev=dev_covariates, bn_anneal=bn_anneal)

    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt'))

    if n_covariates > 0:
        emb_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(emb_c, vocab, covariate_names, n_top_words=16)
        else:
            maw, sparsity = print_top_words(emb_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if use_covar_interactions:
            print("Covariate interactions")
            emb_ci = model.get_covar_inter_weights()
            print(emb_ci.shape)
            if covariate_names is not None:
                names = [str(k) + ':' + c for k in range(n_topics) for c in covariate_names]
            else:
                names = None
            maw, sparsity = print_top_words(emb_ci, vocab, names)
            print("sparsity in covariate interactions = %0.4f" % sparsity)
            print("Combined covariates and interactions:")

        if covar_emb_dim > 0:
            print_covariate_embeddings(model, covariate_names, output_dir)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_covariates, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        if final_evaluate:
            perplexity = evaluate_perplexity(model, test_X, test_labels, test_covariates, eta_bn_prop=0.0)
            print("Test perplexity = %0.4f" % perplexity)
            fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt'))

    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture, train_X, train_labels)
        accuracy = float(np.sum(predictions == np.argmax(train_covariates, axis=1)) / float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)

        if dev_X is not None:
            predictions = infer_categorical_covariate(model, network_architecture, dev_X, dev_labels)
            accuracy = float(np.sum(predictions == np.argmax(dev_covariates, axis=1)) / float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)

        if test_X is not None:
            if final_evaluate:
                predictions = infer_categorical_covariate(model, network_architecture, test_X, test_labels)
                accuracy = float(np.sum(predictions == np.argmax(test_covariates, axis=1)) / float(len(test_covariates)))
                print("Test accuracy on covariates = %0.4f" % accuracy)

    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, network_architecture, train_X, train_labels, train_covariates, output_dir, subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model, network_architecture, dev_X, dev_labels, dev_covariates, output_dir, subset='dev')

        if test_X is not None:
            if final_evaluate:
                predict_labels_and_evaluate(model, network_architecture, test_X, test_labels, test_covariates, output_dir, subset='test')

    # Print associations between topics and labels
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

        if n_covariates > 0:
            all_probs = np.zeros([n_covariates, n_topics])
            for k in range(n_topics):
                Z = np.zeros([1, n_topics]).astype('float32')
                Z[0, k] = 1.0
                Y = None
                for c in range(n_covariates):
                    C = np.zeros([1, n_covariates]).astype('float32')
                    C[0, c] = 1.0
                    probs = model.predict_from_topics(Z, C)
                    all_probs[c, k] = probs[0, 0]
            np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'), probs=all_probs)

    # save document representations
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    np.savez(os.path.join(output_dir, 'train.theta.npz'), theta=theta)

    if dev_X is not None:
        if dev_labels is None:
            dev_Y = None
        else:
            dev_Y = np.zeros_like(dev_labels)
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        np.savez(os.path.join(output_dir, 'dev.theta.npz'), theta=theta)

    if n_test > 0:
        if final_evaluate:
            if test_labels is None:
                test_Y = None
            else:
                test_Y = np.zeros_like(test_labels)
            theta = model.compute_theta(test_X, test_Y, test_covariates)
            np.savez(os.path.join(output_dir, 'test.theta.npz'), theta=theta)
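The --min_covar_count filter in Example #5 (and again in Example #10 below) keeps only covariate columns whose training-set counts exceed the threshold. A small self-contained illustration of that selector on toy data:

import numpy as np

# Toy stand-ins for train_covariates / covariate_names (assumed shapes).
train_covariates = np.array([[1, 0, 1],
                             [0, 0, 1],
                             [1, 0, 0]], dtype=np.float32)
covariate_names = ['covar_a', 'covar_b', 'covar_c']
min_covar_count = 1

n_covariates = train_covariates.shape[1]
covar_sums = train_covariates.sum(axis=0).reshape((n_covariates,))
covariate_selector = covar_sums > min_covar_count            # [True, False, True]
train_covariates = train_covariates[:, covariate_selector]
covariate_names = [name for i, name in enumerate(covariate_names)
                   if covariate_selector[i]]
print(covariate_names)                                       # ['covar_a', 'covar_c']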
Example #6
def get_oov_count_filename(self):
    return fh.make_filename(fh.makedirs(self.dirname), 'oov_counts', 'json')
Example #7
def get_feature_filename(self):
    return fh.make_filename(fh.makedirs(self.dirname), 'counts', 'pkl')
Example #8
def get_index_filename(self):
    return fh.make_filename(fh.makedirs(self.get_dirname()), 'index', 'json')
Example #9
def get_vocab_filename(self):
    return fh.make_filename(fh.makedirs(self.get_dirname()), 'vocab', 'json')
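Examples #6 through #9 (and the directory setup in Example #12 below) route every path through an fh helper module. A minimal sketch of what fh.makedirs and fh.make_filename plausibly do, inferred from how they are called here (an assumption, not the project's actual code):

import os

def makedirs(*parts):
    """Join the path components, create the directory if missing, and return the path."""
    path = os.path.join(*parts)
    os.makedirs(path, exist_ok=True)
    return path

def make_filename(dirname, basename, extension):
    """Build 'dirname/basename.extension'."""
    return os.path.join(dirname, '%s.%s' % (basename, extension))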
Example #10
def main():
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-a',
        dest='alpha',
        default=1.0,
        help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option(
        '-k',
        dest='n_topics',
        default=20,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b',
                      dest='batch_size',
                      default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('-l',
                      dest='learning_rate',
                      default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m',
                      dest='momentum',
                      default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('-e',
                      dest='epochs',
                      default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--emb_dim',
                      dest='embedding_dim',
                      default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option(
        '--labels',
        dest='label_name',
        default=None,
        help=
        'Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default'
    )
    parser.add_option(
        '--covars',
        dest='covar_names',
        default=None,
        help=
        'Read covars from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--label_emb_dim',
        dest='label_emb_dim',
        default=-1,
        help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--covar_emb_dim',
        dest='covar_emb_dim',
        default=-1,
        help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--min_covar_count',
        dest='min_covar_count',
        default=None,
        help=
        'Drop binary covariates that occur less than this in training: default=%default'
    )
    parser.add_option(
        '--c_layers',
        dest='classifier_layers',
        default=1,
        help=
        'Number of layers in (generative) classifier [0|1|2]: default=%default'
    )
    parser.add_option('-t',
                      dest='test_prefix',
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option(
        '--w2v',
        dest='word2vec_file',
        default=None,
        help=
        'Use this word2vec .bin file to initialize and fix embeddings: default=%default'
    )
    parser.add_option('--update_bg',
                      action="store_true",
                      dest="update_bg",
                      default=False,
                      help='Update background parameters: default=%default')
    parser.add_option('--no_bg',
                      action="store_true",
                      dest="no_bg",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option(
        '--no_bn_anneal',
        action="store_true",
        dest="no_bn_anneal",
        default=False,
        help='Do not anneal away from batchnorm: default=%default')
    parser.add_option(
        '--test_samples',
        dest='test_samples',
        default=20,
        help=
        'Number of samples to use in computing test perplexity: default=%default'
    )
    parser.add_option('--dev_folds',
                      dest='dev_folds',
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev_fold',
        dest='dev_fold',
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]

    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    embedding_dim = int(options.embedding_dim)
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    test_prefix = options.test_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    test_samples = int(options.test_samples)
    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, train_labels, label_names, label_type, train_covariates, covariate_names, covariates_type = load_data(
        input_dir, train_prefix, label_file_name, covar_file_names)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [
                name for i, name in enumerate(covariate_names)
                if covariate_selector[i]
            ]
            n_covariates = len(covariate_names)
    else:
        n_covariates = 0

    # split into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        if train_labels is not None:
            dev_labels = train_labels[dev_indices, :]
            train_labels = train_labels[train_indices, :]
        else:
            dev_labels = None
        if train_covariates is not None:
            dev_covariates = train_covariates[dev_indices, :]
            train_covariates = train_covariates[train_indices, :]
        else:
            dev_covariates = None
        n_train = len(train_indices)
    else:
        dev_X = None
        dev_labels = None
        dev_covariates = None
        n_dev = 0

    # load the test data
    if test_prefix is not None:
        test_X, _, test_labels, _, _, test_covariates, _, _ = load_data(
            input_dir,
            test_prefix,
            label_file_name,
            covar_file_names,
            vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels
        if test_covariates is not None:
            if min_covar_count is not None and int(min_covar_count) > 0:
                test_covariates = test_covariates[:, covariate_selector]
            _, n_covariates_test = test_covariates.shape
            assert n_covariates_test == n_covariates

    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_covariates = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if no_bg:
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(dv, embedding_dim, n_topics,
                                        label_type, n_labels, label_emb_dim,
                                        covariates_type, n_covariates,
                                        covar_emb_dim,
                                        classifier_layers)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5,
                              dtype=np.float32)
        count = 0
        print("Loading word vectors")
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(
            word2vec_file, binary=True)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    # create the model
    model = Scholar(network_architecture,
                    alpha=alpha,
                    learning_rate=learning_rate,
                    init_embeddings=embeddings,
                    update_embeddings=update_embeddings,
                    init_bg=init_bg,
                    update_background=update_background,
                    adam_beta1=adam_beta1)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  train_covariates,
                  training_epochs=n_epochs,
                  batch_size=batch_size,
                  rng=rng,
                  X_dev=dev_X,
                  Y_dev=dev_labels,
                  C_dev=dev_covariates,
                  bn_anneal=bn_anneal)

    # make output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)],
                          os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)],
                          os.path.join(output_dir, 'sparsity.txt'))

    if n_covariates > 0:
        beta_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(beta_c, vocab, covariate_names)
        else:
            maw, sparsity = print_top_words(beta_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if output_dir is not None:
            np.savez(os.path.join(output_dir, 'beta_c.npz'),
                     beta=beta_c,
                     names=covariate_names)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model,
                                         dev_X,
                                         dev_labels,
                                         dev_covariates,
                                         eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         test_covariates,
                                         eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on predicting categorical covariates
    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture,
                                                  train_X, train_labels)
        accuracy = float(
            np.sum(predictions == np.argmax(train_covariates, axis=1)) /
            float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)
        if output_dir is not None:
            fh.write_list_to_text([str(accuracy)],
                                  os.path.join(output_dir,
                                               'accuracy.train.txt'))

        if dev_X is not None:
            predictions = infer_categorical_covariate(model,
                                                      network_architecture,
                                                      dev_X, dev_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(dev_covariates, axis=1)) /
                float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir,
                                                   'accuracy.dev.txt'))

        if test_X is not None:
            predictions = infer_categorical_covariate(model,
                                                      network_architecture,
                                                      test_X, test_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(test_covariates, axis=1)) /
                float(len(test_covariates)))
            print("Test accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir,
                                                   'accuracy.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    train_covariates,
                                    output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        dev_covariates,
                                        output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        test_covariates,
                                        output_dir,
                                        subset='test')

    # Print associations between topics and labels
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            Y = None
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

        if n_covariates > 0:
            all_probs = np.zeros([n_covariates, n_topics])
            for k in range(n_topics):
                Z = np.zeros([1, n_topics]).astype('float32')
                Z[0, k] = 1.0
                Y = None
                for c in range(n_covariates):
                    C = np.zeros([1, n_covariates]).astype('float32')
                    C[0, c] = 1.0
                    probs = model.predict_from_topics(Z, C)
                    all_probs[c, k] = probs[0, 0]
            np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'),
                     probs=all_probs)

    # save document representations
    print("Getting topic proportions")
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    print("Saving topic proportions")
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)

    if dev_X is not None:
        dev_Y = np.zeros_like(dev_labels)
        print("Getting topic proportions for dev data")
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.dev.npz'), theta=theta)

    if n_test > 0:
        test_Y = np.zeros_like(test_labels)
        print("Getting topic proportions for test data")
        theta = model.compute_theta(test_X, test_Y, test_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.test.npz'), theta=theta)
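The --dev_folds / --dev_fold options in Example #10 carve one fold of the training documents out as a dev set. A toy version of that split, using a small n so the indices are easy to follow:

import numpy as np

n_train, dev_folds, dev_fold = 10, 3, 2       # toy sizes (assumptions)
rng = np.random.RandomState(0)

n_dev = int(n_train / dev_folds)              # 3 documents per fold
indices = np.arange(n_train)
rng.shuffle(indices)
if dev_fold < dev_folds - 1:
    dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
else:
    dev_indices = indices[n_dev * dev_fold:]  # last fold absorbs the remainder
train_indices = sorted(set(indices) - set(dev_indices))
print(sorted(dev_indices), train_indices)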
Example #11
def main():
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-k',
        dest='n_topics',
        type=int,
        default=20,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-l',
                      dest='learning_rate',
                      type=float,
                      default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m',
                      dest='momentum',
                      type=float,
                      default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('--batch-size',
                      dest='batch_size',
                      type=int,
                      default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('--epochs',
                      type=int,
                      default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--train-prefix',
                      type=str,
                      default='train',
                      help='Prefix of train set: default=%default')
    parser.add_option('--test-prefix',
                      type=str,
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option(
        '--labels',
        type=str,
        default=None,
        help=
        'Read labels from input_dir/[train|test].labels.csv: default=%default')
    parser.add_option(
        '--prior-covars',
        type=str,
        default=None,
        help=
        'Read prior covariates from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--topic-covars',
        type=str,
        default=None,
        help=
        'Read topic covariates from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--interactions',
        action="store_true",
        default=False,
        help=
        'Use interactions between topics and topic covariates: default=%default'
    )
    parser.add_option(
        '--min-prior-covar-count',
        type=int,
        default=None,
        help=
        'Drop prior covariates with fewer than this many non-zero values in the training data: default=%default'
    )
    parser.add_option(
        '--min-topic-covar-count',
        type=int,
        default=None,
        help=
        'Drop topic covariates with fewer than this many non-zero values in the training data: default=%default'
    )
    parser.add_option(
        '--l1-topics',
        type=float,
        default=0.0,
        help='Regularization strength on topic weights: default=%default')
    parser.add_option(
        '--l1-topic-covars',
        type=float,
        default=0.0,
        help=
        'Regularization strength on topic covariate weights: default=%default')
    parser.add_option(
        '--l1-interactions',
        type=float,
        default=0.0,
        help=
        'Regularization strength on topic covariate interaction weights: default=%default'
    )
    parser.add_option(
        '--l2-prior-covars',
        type=float,
        default=0.0,
        help=
        'Regularization strength on prior covariate weights: default=%default')
    parser.add_option('-o',
                      dest='output_dir',
                      type=str,
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option('--emb-dim',
                      type=int,
                      default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option(
        '--w2v',
        dest='word2vec_file',
        type=str,
        default=None,
        help=
        'Use this word2vec .bin file to initialize and fix embeddings: default=%default'
    )
    parser.add_option(
        '--alpha',
        type=float,
        default=1.0,
        help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('--no-bg',
                      action="store_true",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--dev-folds',
                      type=int,
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev-fold',
        type=int,
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--device',
                      type=int,
                      default=None,
                      help='GPU to use: default=%default')
    parser.add_option('--seed',
                      type=int,
                      default=None,
                      help='Random seed: default=%default')

    options, args = parser.parse_args()

    input_dir = args[0]

    if options.seed is not None:
        rng = np.random.RandomState(options.seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, row_selector = load_word_counts(input_dir,
                                                    options.train_prefix)
    train_labels, label_type, label_names, n_labels = load_labels(
        input_dir, options.train_prefix, row_selector, options)
    train_prior_covars, prior_covar_selector, prior_covar_names, n_prior_covars = load_covariates(
        input_dir, options.train_prefix, row_selector, options.prior_covars,
        options.min_prior_covar_count)
    train_topic_covars, topic_covar_selector, topic_covar_names, n_topic_covars = load_covariates(
        input_dir, options.train_prefix, row_selector, options.topic_covars,
        options.min_topic_covar_count)
    options.n_train, vocab_size = train_X.shape
    options.n_labels = n_labels

    if n_labels > 0:
        print("Train label proportions:", np.mean(train_labels, axis=0))

    # split into training and dev if desired
    train_indices, dev_indices = train_dev_split(options, rng)
    train_X, dev_X = split_matrix(train_X, train_indices, dev_indices)
    train_labels, dev_labels = split_matrix(train_labels, train_indices,
                                            dev_indices)
    train_prior_covars, dev_prior_covars = split_matrix(
        train_prior_covars, train_indices, dev_indices)
    train_topic_covars, dev_topic_covars = split_matrix(
        train_topic_covars, train_indices, dev_indices)

    n_train, _ = train_X.shape

    # load the test data
    if options.test_prefix is not None:
        test_X, _, row_selector = load_word_counts(input_dir,
                                                   options.test_prefix,
                                                   vocab=vocab)
        test_labels, _, _, _ = load_labels(input_dir, options.test_prefix,
                                           row_selector, options)
        test_prior_covars, _, _, _ = load_covariates(
            input_dir,
            options.test_prefix,
            row_selector,
            options.prior_covars,
            covariate_selector=prior_covar_selector)
        test_topic_covars, _, _, _ = load_covariates(
            input_dir,
            options.test_prefix,
            row_selector,
            options.topic_covars,
            covariate_selector=topic_covar_selector)
        n_test, _ = test_X.shape

    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_prior_covars = None
        test_topic_covars = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if options.no_bg:
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(options, vocab_size, label_type,
                                        n_labels, n_prior_covars,
                                        n_topic_covars)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load word vectors
    embeddings, update_embeddings = load_word_vectors(options, rng, vocab)

    # create the model
    model = Scholar(network_architecture,
                    alpha=options.alpha,
                    learning_rate=options.learning_rate,
                    init_embeddings=embeddings,
                    update_embeddings=update_embeddings,
                    init_bg=init_bg,
                    adam_beta1=options.momentum,
                    device=options.device)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  train_prior_covars,
                  train_topic_covars,
                  training_epochs=options.epochs,
                  batch_size=options.batch_size,
                  rng=rng,
                  X_dev=dev_X,
                  Y_dev=dev_labels,
                  PC_dev=dev_prior_covars,
                  TC_dev=dev_topic_covars)

    # make output directory
    fh.makedirs(options.output_dir)

    # display and save weights
    print_and_save_weights(options, model, vocab, prior_covar_names,
                           topic_covar_names)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model,
                                         dev_X,
                                         dev_labels,
                                         dev_prior_covars,
                                         dev_topic_covars,
                                         options.batch_size,
                                         eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(options.output_dir,
                                           'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         test_prior_covars,
                                         test_topic_covars,
                                         options.batch_size,
                                         eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(options.output_dir,
                                           'perplexity.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    train_prior_covars,
                                    train_topic_covars,
                                    options.output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        dev_prior_covars,
                                        dev_topic_covars,
                                        options.output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        test_prior_covars,
                                        test_topic_covars,
                                        options.output_dir,
                                        subset='test')

    # print label probabilities for each topic
    print_topic_label_associations(options, label_names, model, n_prior_covars,
                                   n_topic_covars)

    # save document representations
    print("Saving document representations")
    save_document_representations(model,
                                  train_X,
                                  train_labels,
                                  train_prior_covars,
                                  train_topic_covars,
                                  options.output_dir,
                                  'train',
                                  batch_size=options.batch_size)

    if dev_X is not None:
        save_document_representations(model,
                                      dev_X,
                                      dev_labels,
                                      dev_prior_covars,
                                      dev_topic_covars,
                                      options.output_dir,
                                      'dev',
                                      batch_size=options.batch_size)

    if n_test > 0:
        save_document_representations(model,
                                      test_X,
                                      test_labels,
                                      test_prior_covars,
                                      test_topic_covars,
                                      options.output_dir,
                                      'test',
                                      batch_size=options.batch_size)
Example #12
def _setup_dirs():
    global base_dir
    global data_dir
    global features_dir
    #global features_dir2
    global data_raw_dir
    global data_raw_labels_dir
    global data_raw_index_dir
    #global data_raw_text_dir
    #global data_raw_text_file
    global data_raw_sentences_dir
    global data_subsets_dir
    global splits_file_name
    #global data_splits_dir
    global data_processed_dir
    global data_processed_text_dir
    global data_processed_text_file
    global data_processed_brown_dir
    global data_processed_wikilinks_dir
    global data_processed_phrasemachine_dir
    global data_stanford_dir
    global data_semafor_dir
    global data_amalgram_dir
    global data_wordnet_domains_dir
    global lda_dir
    global persona_dir
    global persona_dir_mentions
    global persona_dir_all

    global exp_dir

    data_dir = fh.makedirs(base_dir, 'data')
    features_dir = fh.makedirs(base_dir, 'features')
    #features_dir2 = fh.makedirs(base_dir, 'features2')
    data_raw_dir = fh.makedirs(data_dir, 'raw')
    data_raw_labels_dir = fh.makedirs(data_raw_dir, 'labels')
    data_raw_index_dir = fh.makedirs(data_raw_dir, 'index')
    #data_raw_text_dir = fh.makedirs(data_raw_dir, 'text')
    #data_raw_text_file = os.path.join(data_raw_text_dir, 'sentences.json')
    data_raw_sentences_dir = fh.makedirs(data_raw_dir, 'sentences')
    data_subsets_dir = fh.makedirs(data_dir, 'subsets')
    #data_splits_file = fh.make_filename(data_subsets_dir, 'splits', 'csv')
    data_processed_dir = fh.makedirs(data_dir, 'processed')
    data_processed_text_dir = fh.makedirs(data_processed_dir, 'text')
    data_processed_brown_dir = fh.makedirs(data_processed_dir, 'brown')
    data_processed_wikilinks_dir = fh.makedirs(data_processed_dir, 'wikilinks')
    data_processed_phrasemachine_dir = fh.makedirs(data_processed_dir, 'phrasemachine')
    data_stanford_dir = fh.makedirs(data_processed_dir, 'stanford')
    data_processed_text_file = os.path.join(data_stanford_dir, 'words.json')
    data_semafor_dir = fh.makedirs(data_processed_dir, 'semafor')
    data_amalgram_dir = fh.makedirs(data_processed_dir, 'amalgram')
    data_wordnet_domains_dir = fh.makedirs(data_processed_dir, 'wordnet_domains')

    lda_dir = fh.makedirs(data_processed_dir, 'lda')
    persona_dir = fh.makedirs(data_processed_dir, 'personas')
    persona_dir_mentions = fh.makedirs(data_processed_dir, 'personas_mentions')
    persona_dir_all = fh.makedirs(data_processed_dir, 'personas_all')

    exp_dir = fh.makedirs(base_dir, 'experiments')
Example #13
def main():
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-k',
        dest='n_topics',
        default=100,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option(
        '-r',
        action="store_true",
        dest="regularize",
        default=False,
        help=
        'Apply adaptive regularization for sparsity in topics: default=%default'
    )
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option(
        '--vocab-size',
        dest='vocab_size',
        default=None,
        help=
        'Filter the vocabulary keeping the most common n words: default=%default'
    )
    parser.add_option('--no-bg',
                      action="store_true",
                      dest="no_bg",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option(
        '--no-bn-anneal',
        action="store_true",
        dest="no_bn_anneal",
        default=False,
        help='Do not anneal away from batchnorm: default=%default')
    parser.add_option(
        '--opt',
        dest='optimizer',
        default='adam',
        help=
        'Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--dev-folds',
                      dest='dev_folds',
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev-fold',
        dest='dev_fold',
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--test-prefix',
                      dest='test_prefix',
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option(
        '--labels',
        dest='label_name',
        default=None,
        help=
        'Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default'
    )

    (options, args) = parser.parse_args()

    input_dir = args[0]  # directory containing the preprocessed input files

    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    label_file_name = options.label_name

    alpha = 1.0
    n_topics = int(options.n_topics)
    batch_size = 200
    # learning_rate = 0.002
    learning_rate = 0.001
    adam_beta1 = 0.99
    n_epochs = 450
    encoder_layers = 1  # number of encoder layers [0|1|2]
    encoder_shortcuts = False
    classifier_layers = 1  # number of classifier layers [0|1|2]
    auto_regularize = options.regularize
    output_dir = options.output_dir
    # word2vec_file = "/home/lcw2/share/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
    word2vec_file = "../embeddings/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.bin"
    # word2vec_file = "C:\\\\Soft\\share\\GoogleNews-vectors-negative300.bin"
    embedding_dim = 200
    vocab_size = options.vocab_size
    update_background = False
    no_bg = options.no_bg
    bn_anneal = True
    optimizer = options.optimizer
    seed = 1
    threads = 4
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))
    # kb embedding file
    # kb2vec_file = "/home/lcw2/github/my_vaetm/data/kb2vec/WikiData.KB.100d.zh.pickle"
    kb2vec_file = "./data/kb2vec/WikiData.KB.100d.zh.v2.pickle"
    kb_dim = 100
    test_prefix = 'test'

    # load the training data
    train_prefix = 'train'
    train_X, vocab, train_labels, label_names, label_type, col_sel, num = load_data(
        input_dir, train_prefix, label_file_name, vocab_size=vocab_size)
    n_train, dv = train_X.shape
    if train_labels is not None:
        _, n_labels = train_labels.shape
        print('n_labels:', n_labels)
    else:
        n_labels = 0

    if test_prefix is not None:
        test_X, _, test_labels, _, _, _, _ = load_data(input_dir,
                                                       test_prefix,
                                                       label_file_name,
                                                       vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels
    else:
        test_X = None
        test_labels = None
        n_test = 0

    # split training data into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        if train_labels is not None:
            # keep labels aligned with the train/dev split
            dev_labels = train_labels[dev_indices, :]
            train_labels = train_labels[train_indices, :]
        else:
            dev_labels = None
        n_train = len(train_indices)
    else:
        dev_X = None
        dev_labels = None

    # initialize the background using the overall frequency of terms
    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    # if no_bg:
    #     if n_topics == 1:
    #         init_beta = init_bg.copy()
    #         init_beta = init_beta.reshape([1, len(vocab)])
    #         update_beta = False
    #     init_bg = np.zeros_like(init_bg)

    label_emb_dim = -1
    # create the network configuration
    network_architecture = make_network(dv, encoder_layers, embedding_dim,
                                        n_topics, encoder_shortcuts,
                                        label_type, n_labels, label_emb_dim,
                                        classifier_layers)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, embedding_dim) * 0.25 - 0.5,
                              dtype=np.float32)
        count = 0
        print("Loading word vectors")
        if word2vec_file[-3:] == 'bin':
            pretrained = gensim.models.KeyedVectors.load(word2vec_file)
        else:
            pretrained = gensim.models.KeyedVectors.load_word2vec_format(
                word2vec_file, binary=False)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found word embeddings for %d words" % count)
        print('shape of word embeddings:', embeddings.shape)
    else:
        print("No embeddings for words!")
        exit()

    # load pretrained entity vectors
    # if kb2vec_file is not None:
    #     vocab_size = len(vocab)
    #     vocab_dict = dict(zip(vocab, range(vocab_size)))
    #     entity_embeddings = np.array(rng.rand(vocab_size, kb_dim) * 0.25 - 0.5, dtype=np.float32)
    #     count = 0
    #
    #     print("Loading emtity vectors...")
    #     pretrained = None
    #     with open(kb2vec_file, 'rb') as f:
    #         pretrained = pickle.load(f)
    #     print('# of entities:', len(pretrained))
    #     vocab_counter = collections.Counter()
    #     vocab_counter.update(s for s in num if s in pretrained)
    #     print(vocab_counter.most_common(10))
    #     h = open('./output/topics.txt', 'r', encoding='utf-8')
    #     read_data = h.read()
    #     a = read_data.split()
    #     print('#of topic',len(a))
    #     for word, index in vocab_dict.items():
    #         if word in pretrained and word in a:
    #             print(word)
    #         if word in pretrained:
    #         # elif word in pretrained and word not in a:
    #             count += 1
    #             entity_embeddings[index, :] = pretrained[word]
    #
    #     print("Found entity embeddings for %d words" % count)
    #     print('shape of entity embeddings:', entity_embeddings.shape)
    # else:
    #     print("No embeddings for knowledge entities!")
    #     exit()

    tf.reset_default_graph()

    # create the model
    model = VaeTm(network_architecture,
                  alpha=alpha,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  # init_embeddings=embeddings,
                  # entity_embeddings=entity_embeddings,
                  init_bg=init_bg,
                  update_background=update_background,
                  init_beta=init_beta,
                  update_beta=update_beta,
                  threads=threads,
                  regularize=auto_regularize,
                  optimizer=optimizer,
                  adam_beta1=adam_beta1,
                  seed=seed)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  vocab,
                  regularize=auto_regularize,
                  training_epochs=n_epochs,
                  batch_size=batch_size,
                  rng=rng,
                  bn_anneal=bn_anneal,
                  X_dev=dev_X)

    # create output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity, topics = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)],
                          os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)],
                          os.path.join(output_dir, 'sparsity.txt'))

    # print('Predicting training representations...')
    # reps, preds = model.predict(train_X)
    # # print('rep-0:', reps[0])
    # # print('rep-0:', reps[1])
    # fh.write_matrix_to_text(reps, os.path.join(output_dir, 'train_representation.txt'))

    # if test_X is not None:
    #     print('Predicting testing representations...')
    #     reps, preds = model.predict(test_X)
    #     # print('rep-0:', reps[0])
    #     # print('rep-0:', reps[1])
    #     fh.write_matrix_to_text(reps, os.path.join(output_dir, 'test_representation.txt'))

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    None,
                                    output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        None,
                                        output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        None,
                                        output_dir,
                                        subset='test')

    # save document representations
    theta = model.compute_theta(train_X, train_labels)
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)
    compute_npmi_at_n(topics, vocab, train_X)