示例#1
0
def perform_experiment(n_dw_matrix, optimizer, T, samples):
    """Sample ``samples`` seeded launches and print timer totals after each.

    A callback built by ``experiments.default_callback`` (with
    ``measure_time=True``) is installed on ``optimizer`` before sampling.

    :param n_dw_matrix: matrix passed to the callback and to the sampler.
    :param optimizer: object whose ``iteration_callback`` gets replaced.
    :param T: forwarded unchanged to ``experiments.default_sample``.
    :param samples: number of launches; seeds are ``0 .. samples - 1``.
    """
    callback_options = dict(
        train_n_dw_matrix=n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200],
        measure_time=True,
    )
    optimizer.iteration_callback = experiments.default_callback(
        **callback_options
    )

    for random_seed in range(samples):
        print(random_seed)
        experiments.default_sample(n_dw_matrix, T, random_seed, optimizer)
        # Printed after every launch, not only once at the end.
        print(timed_default.SimpleTimer.total_times)
示例#2
0
def perform_experiment(args):
    """Sample seeded launches on the train matrix and save callback results.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113); a tuple of the wrong length
    still raises ``ValueError``.

    :param args: tuple ``(train_n_dw_matrix, test_n_dw_matrix, optimizer,
        T, samples, output_path)``.
    """
    (train_n_dw_matrix, test_n_dw_matrix, optimizer, T,
     samples, output_path) = args

    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200])
    for seed in range(samples):
        print(seed)
        experiments.default_sample(train_n_dw_matrix, T, seed, optimizer)
    # Saved once, after all launches have been sampled.
    optimizer.iteration_callback.save_results(output_path)
示例#3
0
def perform_iteration_dependency_experiment(args):
    """Sample seeded launches with a callback using ``iter_eval_step=5``.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113).

    :param args: tuple ``(train_n_dw_matrix, test_n_dw_matrix, optimizer,
        T, samples, output_path)``.
    """
    (train_n_dw_matrix, test_n_dw_matrix, optimizer,
     T, samples, output_path) = args

    optimizer.iteration_callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        uniqueness_measures=True,
        iter_eval_step=5
    )
    for seed in range(samples):
        print(seed)
        experiments.default_sample(train_n_dw_matrix, T, seed, optimizer)
    # Saved once, after all launches have been sampled.
    optimizer.iteration_callback.save_results(output_path)
示例#4
0
def perform_experiment(args):
    """Initialise with Trivial-regularizer sampling, then run ``optimizer``.

    For every seed the matrices are first sampled with an initial
    ``default.Optimizer`` holding ``init_iters`` Trivial regularizers
    (``finish_launch=False``), then handed to ``optimizer.run``.  Both
    optimizers share one callback, whose results are saved at the end.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113).

    :param args: tuple ``(train_n_dw_matrix, test_n_dw_matrix, optimizer,
        T, samples, init_iters, output_path)``.
    """
    (train_n_dw_matrix, test_n_dw_matrix, optimizer,
     T, samples, init_iters, output_path) = args

    init_optimizer = default.Optimizer([regularizers.Trivial()] * init_iters)
    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix
    )
    # The same callback object is attached to both optimizers.
    init_optimizer.iteration_callback = callback
    optimizer.iteration_callback = callback
    for seed in range(samples):
        print(seed)
        plsa_phi, plsa_theta = experiments.default_sample(
            train_n_dw_matrix=train_n_dw_matrix,
            T=T,
            seed=seed,
            optimizer=init_optimizer,
            finish_launch=False,
        )
        optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)
        # The launch is finished here, after the second stage, because
        # default_sample was called with finish_launch=False.
        if optimizer.iteration_callback:
            optimizer.iteration_callback.finish_launch()

    optimizer.iteration_callback.save_results(output_path)
示例#5
0
def perform_plots(args):
    """Collect matrix snapshots from two ``obd`` optimizers and save them.

    Both optimizers receive the same ``gamma_callback``.  On every call
    where ``it % 20 == 0`` it stores copies of the four matrices it is
    given.  The first optimizer is used for sampling with seed 42, and
    the second is then run on the sampled matrices.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113).

    :param args: tuple ``(train_n_dw_matrix, T, output_path)``.
    """
    train_n_dw_matrix, T, output_path = args

    matrices = []

    def gamma_callback(it, n_tw, n_dt, gamma_tw, gamma_dt):
        # Copies are stored rather than references to the callback's
        # arguments, so each snapshot is an independent array.
        if it % 20 == 0:
            matrices.append((
                it,
                np.copy(n_tw),
                np.copy(n_dt),
                np.copy(gamma_tw),
                np.copy(gamma_dt),
            ))

    init_optimizer = obd.Optimizer(
        [regularizers.Trivial()] * 100,
        gamma_tw_min_delta=-100000,
        gamma_callback=gamma_callback
    )
    post_optimizer = obd.Optimizer(
        [regularizers.Trivial()] * 121,
        gamma_tw_min_delta=0.1,
        gamma_tw_max_delta=40,
        gamma_tw_delta_percentile=0.5,
        gamma_callback=gamma_callback
    )

    plsa_phi, plsa_theta = experiments.default_sample(
        train_n_dw_matrix=train_n_dw_matrix,
        T=T,
        seed=42,
        optimizer=init_optimizer,
    )
    post_optimizer.run(train_n_dw_matrix, plsa_phi, plsa_theta)

    save_results(matrices, output_path)
示例#6
0
def perform_alpha_dependency_experiment(args):
    """Evaluate the final matrices of each seeded sample in one launch.

    The callback is NOT attached to ``optimizer``.  It is called by hand
    with iteration number 0 on whatever ``experiments.default_sample``
    returns for each seed, all inside a single ``start_launch`` /
    ``finish_launch`` pair, and its results are then saved.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113).

    :param args: tuple ``(train_n_dw_matrix, optimizer, T, samples,
        output_path)``.
    """
    train_n_dw_matrix, optimizer, T, samples, output_path = args

    callback = experiments.default_callback(
        train_n_dw_matrix=train_n_dw_matrix,
        uniqueness_measures=True
    )
    callback.start_launch()
    for seed in range(samples):
        print(seed)
        sampled = experiments.default_sample(
            train_n_dw_matrix, T, seed, optimizer
        )
        callback(0, *sampled)
    callback.finish_launch()
    callback.save_results(output_path)
def perform_experiment(train_n_dw_matrix, test_n_dw_matrix, optimizer, T,
                       samples, output_path, tau, path_phi_output):
    """Sample seeded launches, save callback results and pickle the last phi.

    Only the pair returned for the final seed is kept: its first element
    is pickled to ``path_phi_output`` and the pair is returned.
    ``samples`` must be at least 1, because the result names are bound
    only inside the loop.

    :param train_n_dw_matrix: matrix passed to the callback and the sampler.
    :param test_n_dw_matrix: matrix passed to the callback only.
    :param optimizer: object whose ``iteration_callback`` gets replaced.
    :param T: forwarded unchanged to ``experiments.default_sample``.
    :param samples: number of launches; seeds are ``0 .. samples - 1``.
    :param output_path: destination given to the callback's ``save_results``.
    :param tau: fifth positional argument of ``experiments.default_sample``.
    :param path_phi_output: file that receives the pickled first matrix.
    :return: tuple of the two values sampled for the last seed.
    """
    callback_options = dict(
        train_n_dw_matrix=train_n_dw_matrix,
        test_n_dw_matrix=test_n_dw_matrix,
        top_pmi_sizes=[5, 10, 20, 30],
        top_avg_jaccard_sizes=[10, 50, 100, 200],
        measure_time=False,
    )
    optimizer.iteration_callback = experiments.default_callback(
        **callback_options
    )

    for random_seed in range(samples):
        # Each iteration overwrites the previous result on purpose.
        expphi, exptheta = experiments.default_sample(
            train_n_dw_matrix, T, random_seed, optimizer, tau
        )

    optimizer.iteration_callback.save_results(output_path)

    with open(path_phi_output, 'wb') as phi_file:
        pickle.dump(expphi, phi_file)

    return (expphi, exptheta)
示例#8
0
def perform_lda(
    n_dw_matrix, optimizer, T, samples,
    output_path, init_phi=None, init_theta=None
):
    """Sample seeded launches and pickle flattened phis with perplexities.

    For every seed the sampled ``phi`` is flattened and stored, together
    with the perplexity computed from ``(phi, theta)``.  The collected
    lists and the initial matrices are pickled to ``output_path`` as a
    dict with keys ``init_phi``, ``init_theta``, ``perplexities``, ``phis``.

    :param n_dw_matrix: matrix used for sampling and for the perplexity.
    :param optimizer: forwarded to ``experiments.default_sample``.
    :param T: forwarded unchanged to ``experiments.default_sample``.
    :param samples: number of launches; seeds are ``0 .. samples - 1``.
    :param output_path: pickle destination; a missing parent directory is
        created first.
    :param init_phi: passed to the sampler as ``init_phi_zeros``.
    :param init_theta: passed to the sampler as ``init_theta_zeros``.
    """
    calc_perplexity = metrics.calc_perplexity_function(n_dw_matrix)
    phis = []
    perplexities = []
    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(
            n_dw_matrix, T, seed, optimizer,
            init_phi_zeros=init_phi, init_theta_zeros=init_theta
        )
        phis.append(phi.flatten())
        perplexities.append(calc_perplexity(phi, theta))

    # dirname() is '' for a bare file name, and os.makedirs('') raises,
    # so a directory is only created when the path actually names one.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Pickle streams are bytes: binary mode is required on Python 3 and
    # is also safe on Python 2.
    with open(output_path, 'wb') as output_file:
        pickle.dump({
            'init_phi': init_phi,
            'init_theta': init_theta,
            'perplexities': perplexities,
            'phis': phis
        }, output_file)
    for index, target in enumerate(doc_targets):
        if target == 0:
            topic_0_indices.append(index)
        elif target == 1:
            topic_1_indices.append(index)

    thetaless_rels = []
    lda_rels = []
    for balance in range(10, 201, 10):
        print(balance)
        n_dw_matrix = _n_dw_matrix[topic_0_indices +
                                   topic_1_indices * balance, :]
        regularization_list = [regularizers.Additive(-0.1, 0.)] * 100
        lda_phi, lda_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=default.Optimizer(regularization_list, verbose=False))
        thetaless_phi, thetaless_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=thetaless.Optimizer(regularization_list, verbose=False))
        # print(np.argmax(thetaless_theta[:len(topic_0_indices), :2], axis=1).mean())
        # print(np.argmax(thetaless_theta[len(topic_0_indices):, :2], axis=1).mean())
        # print('!')
        # for topic_set in metrics.get_top_words(thetaless_phi, 10):
        #     print('\n\t'.join(map(num_2_token.get, topic_set)))
        #     print()
        # for topic_set in metrics.get_top_words(thetaless_phi, 5):
        #     print('\n\t'.join(map(num_2_token.get, topic_set)))
示例#10
0
    n_dw_matrix, _, _, _ = main_cases.get_20newsgroups([
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space'
    ])

    print('Original PLSA')
    perform_lda(
        n_dw_matrix, optimizer=get_optimizer(0., 100), T=10,
        samples=300, output_path='stability_exp/plsa.pkl'
    )

    print('Full initialized PLSA')
    phi, theta = experiments.default_sample(
        n_dw_matrix, T=10, seed=42, optimizer=get_optimizer(-0.1, 100)
    )
    init_phi, init_theta = experiments.default_sample(
        n_dw_matrix, T=10, seed=42, optimizer=get_optimizer(0., 100),
        init_phi_zeros=phi, init_theta_zeros=theta
    )
    perform_lda(
        n_dw_matrix, optimizer=get_optimizer(0., 100), T=10,
        samples=300, output_path='stability_exp/full_initialized_plsa.pkl',
        init_phi=init_phi, init_theta=init_theta
    )

    print('Synthetic PLSA')
    matrix = np.dot(init_theta, init_phi)
    matrix[np.isnan(matrix)] = 0.
    synthetic_n_dw_matrix = scipy.sparse.csr_matrix(matrix)
示例#11
0
def perform_doc_experiment(args):
    """Track per-iteration test accuracy of an SVC for three optimizers.

    For every seed:

    1. ``phi``/``theta`` are sampled on the train matrix with ``optimizer``.
    2. ``svm_train_score(theta)`` yields ``best_C``, ``best_gamma`` and two
       scores; an ``SVC`` with those parameters is fitted on ``theta``.
    3. Three optimizers built from the first 10 entries of
       ``optimizer.regularization_list`` are run on the test matrix,
       starting from the sampled ``phi`` and an ``init_theta`` built from
       an all-ones ``(D, T)`` array.  After each iteration the accuracy
       of the SVC on the current ``theta`` is recorded.

    All recorded series and scores are saved via ``callbacks.save_results``.

    The function takes ONE positional tuple, exactly as before.  It is
    unpacked in the body because tuple parameters in a signature are a
    syntax error on Python 3 (PEP 3113).

    :param args: tuple ``(n_dw_matrix_doc_train, doc_targets_doc_train,
        n_dw_matrix_doc_test, doc_targets_doc_test, optimizer, T, samples,
        output_path)``.
    """
    (n_dw_matrix_doc_train, doc_targets_doc_train, n_dw_matrix_doc_test,
     doc_targets_doc_test, optimizer, T, samples, output_path) = args

    D, _ = n_dw_matrix_doc_test.shape
    svm_train_score = metrics.create_svm_score_function(doc_targets_doc_train,
                                                        verbose=False)
    opt_plsa_not_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10],
        const_phi=False)
    opt_plsa_const_phi = default.Optimizer(
        regularization_list=optimizer.regularization_list[:10], const_phi=True)
    opt_artm_thetaless = thetaless.Optimizer(
        regularization_list=optimizer.regularization_list[:10])

    res_plsa_not_const_phi = []
    res_plsa_const_phi = []
    res_artm_thetaless = []
    cv_fold_scores = []
    cv_test_scores = []

    # Each test-side optimizer is paired with the list collecting its
    # per-seed accuracy series; the order is the order in which they run.
    test_runs = [
        (opt_plsa_not_const_phi, res_plsa_not_const_phi),
        (opt_plsa_const_phi, res_plsa_const_phi),
        (opt_artm_thetaless, res_artm_thetaless),
    ]

    def make_accuracy_recorder(history, classifier):
        """Return a callback appending the classifier's accuracy on theta."""
        def record(it, phi, theta):
            history.append(
                accuracy_score(classifier.predict(theta),
                               doc_targets_doc_test))
        return record

    for seed in range(samples):
        print(seed)
        phi, theta = experiments.default_sample(n_dw_matrix_doc_train, T, seed,
                                                optimizer)
        (best_C, best_gamma, cv_fold_score,
         cv_test_score) = svm_train_score(theta)
        cv_fold_scores.append(cv_fold_score)
        cv_test_scores.append(cv_test_score)

        print('Fold score: {}\tTest score: {}'.format(cv_fold_score,
                                                      cv_test_score))
        algo = SVC(C=best_C, gamma=best_gamma).fit(theta,
                                                   doc_targets_doc_train)
        init_theta = common.get_prob_matrix_by_counters(
            np.ones(shape=(D, T), dtype=np.float64))

        for opt, results in test_runs:
            history = []
            # Binding history and algo through the factory gives every
            # callback its own list and classifier for this seed.
            opt.iteration_callback = make_accuracy_recorder(history, algo)
            opt.run(n_dw_matrix_doc_test, phi, init_theta)
            results.append(history)

    callbacks.save_results(
        {
            'res_plsa_not_const_phi': res_plsa_not_const_phi,
            'res_plsa_const_phi': res_plsa_const_phi,
            'res_artm_thetaless': res_artm_thetaless,
            'cv_fold_scores': cv_fold_scores,
            'cv_test_scores': cv_test_scores
        }, output_path)
示例#12
0
        'talk.politics.guns',
    ])
    topic_0_indices, topic_1_indices = [], []
    for index, target in enumerate(doc_targets):
        if target == 0:
            topic_0_indices.append(index)
        elif target == 1:
            topic_1_indices.append(index)

    for balance in [1, 2, 5, 10, 20, 50, 100, 200, 300, 500]:
        n_dw_matrix = _n_dw_matrix[topic_0_indices +
                                   topic_1_indices * balance, :]
        regularization_list = [regularizers.Additive(-0.1, 0.)] * 100
        lda_phi, lda_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=default.Optimizer(regularization_list, verbose=False))
        balanced_phi, balanced_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=balanced.Optimizer(regularization_list, verbose=False))
        balanced_ptdw_phi, balanced_ptdw_theta = experiments.default_sample(
            n_dw_matrix,
            T=2,
            seed=42,
            optimizer=balanced_ptdw.Optimizer(regularization_list,
                                              verbose=False))
        for topic_set in metrics.get_top_words(lda_phi, 10):
            print('\n\t'.join(map(num_2_token.get, topic_set)))