Example #1
def test_spn_eval_opt():

    logging.basicConfig(level=logging.INFO)
    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    learn_start_t = perf_counter()
    #
    # start learning
    spn = learnSPN.fit_structure(train,
                                 train_feature_vals)
    learn_end_t = perf_counter()
    print('Network learned in', (learn_end_t - learn_start_t), 'secs')

    # now checking performance
    infer_start_t = perf_counter()
    train_ll = 0.0
    print('Starting inference')
    for instance in train:
        (pred_ll, ) = spn.eval(instance)
        train_ll += pred_ll
    train_avg_ll = train_ll / train.shape[0]
    infer_end_t = perf_counter()
    # n avg ll -6.0180987340354 done in 43.947853350000514 secs
    print('train avg ll', train_avg_ll, 'done in',
          infer_end_t - infer_start_t, 'secs')
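Most of the LearnSPN tests in this listing reference module-level names (rand_gen, dataset, algo.learnspn, perf_counter, logging) that the snippets themselves do not define. The following is a minimal sketch of the setup they appear to assume; the seed comes from the GMM examples later in the listing, and the import layout is an assumption about the surrounding test module rather than a verbatim copy of it.

import logging
from time import perf_counter

import numpy

import dataset        # the project's dataset loading helpers
import algo.learnspn  # the LearnSPN structure learner

# module-level random generator shared by the tests
# (seed 1337 is the value used in the GMM/DPGMM examples)
rand_gen = numpy.random.RandomState(1337)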
Example #2
def test_random_spn_em():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 2
    n_max_children = 4
    n_scope_children = 5
    max_scope_split = 3
    merge_prob = 0.5
    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features, n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split, merge_prob)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    spn.fit_em(train, valid, test, hard=False, n_epochs=10)
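fit_em updates the SPN weights in place, so the test above does not print a final score itself. A short sketch of how one could check the result, reusing the per-instance eval pattern from Example #1 (assuming the random SPN exposes the same eval interface):

# sketch: average test log-likelihood after EM, mirroring Example #1
test_ll = 0.0
for instance in test:
    (pred_ll, ) = spn.eval(instance)
    test_ll += pred_ll
print('test avg ll', test_ll / test.shape[0])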
Example #3
def test_learnspn_oneshot():

    logging.basicConfig(level=logging.INFO)
    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    spn = learnSPN.fit_structure(train,
                                 train_feature_vals)

    # print(spn)

    #
    # testing on-the-fly
    ll = 0.0
    for instance in test:
        ll += spn.single_eval(instance)[0]

    print('avg ll', ll / test.shape[0])
Example #4
def test_linked_nltcs_kernel_spn_perf():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('ninst', n_instances, 'feats', features)
    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)
    print(spn.stats())
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    eval_start_t = perf_counter()
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    eval_end_t = perf_counter()
    print('AVG LL {0} in {1} secs'.format(avg_ll, eval_end_t - eval_start_t))
Example #5
def atest_theano_nltcs_kernel_spn():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.theano_kernel_density_estimation(
        n_instances,
        features,
        batch_size=n_test_instances,
        sparse=True)
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    print(avg_ll)
Example #6
def test_DPGMM():

    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)

    verbose = True

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

    #
    # this is the max number of clusters for a truncated DP
    n_components = 100

    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    concentration = 1.0
    # a higher alpha means more clusters
    # as the expected number of clusters is alpha*log(N).
    dpgmm_c = mixture.DPGMM(n_components=n_components,
                            covariance_type=cov_type,
                            random_state=rand_gen,
                            n_iter=n_iters,
                            alpha=concentration,
                            verbose=verbose)

    #
    # fitting to training set
    fit_start_t = perf_counter()
    dpgmm_c.fit(train)
    fit_end_t = perf_counter()

    #
    # getting the cluster assignment
    pred_start_t = perf_counter()
    clustering = dpgmm_c.predict(train)
    pred_end_t = perf_counter()

    print('Clustering')
    print('for instances: ', clustering.shape[0])
    print(clustering)
    print('smallest cluster', numpy.min(clustering))
    print('biggest cluster', numpy.max(clustering))
    print('clustering done in', (fit_end_t - fit_start_t), 'secs')
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')

    #
    # predicting probabilities
    pred_start_t = perf_counter()
    clustering_p = dpgmm_c.predict_proba(train)
    pred_end_t = perf_counter()
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')
    print(clustering_p.shape[0], clustering_p.shape[1])
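Note that mixture.DPGMM has since been deprecated and removed from scikit-learn, so this test only runs against old releases. A hedged sketch of the closest current replacement, BayesianGaussianMixture with a Dirichlet-process prior, reusing the variables defined above; the parameter mapping is an approximation, not a drop-in substitute.

# sketch: rough modern equivalent of the DPGMM call above
dpgmm_c = mixture.BayesianGaussianMixture(
    n_components=n_components,
    covariance_type=cov_type,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=concentration,
    max_iter=n_iters,
    random_state=rand_gen,
    verbose=verbose)
dpgmm_c.fit(train)
clustering = dpgmm_c.predict(train)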
Example #7
def test_spectral_clustering():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    learner = SpectralStructureLearner(sigma=4.0)
    k = 5
    ids = [i for i in range(train.shape[0])]
    labels, clusters, valid = learner.spectral_clustering(train, ids, k)
    print('labels:{0}\nclusters:{1}'.format(labels, clusters))
Example #8
def test_spnsvd_eval_nltcs():

    import dataset

    logging.basicConfig(level=logging.INFO)
    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = numpy.array([2 for i in range(train.shape[1])])
    print('Loaded dataset', dataset_name)

    gamma = 2.01
    alpha = 0.1

    cltl = True
    #
    # initing the algo
    learner = SpnSVD(gamma=gamma,
                     alpha=alpha,
                     cltree_leaves=cltl)

    learn_start_t = perf_counter()
    #
    # start learning
    spn = learner.fit_structure(train,
                                train_feature_vals)
    learn_end_t = perf_counter()
    print('Network learned in', (learn_end_t - learn_start_t), 'secs')

    # print(spn)

    #
    # now checking performance

    # infer_start_t = perf_counter()
    # train_ll = 0.0
    # print('Starting inference')
    # for instance in train:
    #     (pred_ll, ) = spn.eval(instance)
    #     train_ll += pred_ll
    # train_avg_ll = train_ll / train.shape[0]
    # infer_end_t = perf_counter()
    # # n avg ll -6.0180987340354 done in 43.947853350000514 secs
    # print('train avg ll', train_avg_ll, 'done in',
    #       infer_end_t - infer_start_t, 'secs')

    infer_start_t = perf_counter()
    test_ll = 0.0
    print('Starting inference')
    for instance in test:
        (pred_ll, ) = spn.eval(instance)
        test_ll += pred_ll
    test_avg_ll = test_ll / test.shape[0]
    infer_end_t = perf_counter()
    # n avg ll -6.0180987340354 done in 43.947853350000514 secs
    print('test avg ll', test_avg_ll, 'done in',
          infer_end_t - infer_start_t, 'secs')
Example #9
def test_compare_spectral_performance():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    learner = SpectralStructureLearner()
    k = 5
    ids = [i for i in range(train.shape[1])]
    labels, clusters, valid = \
        learner.spectral_clustering(train.T, ids, k,
                                    affinity_metric='gtest',
                                    pair=True)
Example #10
def test_spectral_cluster_learner():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    print('features', synth_feats)
    learner = SpectralStructureLearner()
    k = 5
    spn = learner.fit_structure(synth_data,
                                synth_feats,
                                k_row_clusters=k,
                                min_instances_slice=2,
                                pairwise=True)
    print(spn.stats())
Example #11
def atest_nltcs_em_fit():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)
    print('EM training')

    spn.fit_em(train, valid, test, hard=True, epochs=2)
Example #12
def test_GMM():

    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)

    verbose = True

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

    #
    # creating the classifier object
    n_components = 10
    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    n_restarts = 10

    gmm_c = mixture.GMM(n_components=n_components,
                        covariance_type=cov_type,
                        random_state=rand_gen,
                        n_iter=n_iters,
                        n_init=n_restarts)

    #
    # fitting to training set
    fit_start_t = perf_counter()
    gmm_c.fit(train)
    fit_end_t = perf_counter()

    #
    # getting the cluster assignment
    pred_start_t = perf_counter()
    clustering = gmm_c.predict(train)
    pred_end_t = perf_counter()

    print('Clustering')
    print('for instances: ', clustering.shape[0])
    print(clustering)
    print('smallest cluster', numpy.min(clustering))
    print('biggest cluster', numpy.max(clustering))
    print('clustering done in', (fit_end_t - fit_start_t), 'secs')
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')
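As with the DPGMM example above, mixture.GMM has been removed from recent scikit-learn releases in favour of GaussianMixture. A sketch of the same clustering step against the current API, reusing the variables above (n_iter becomes max_iter; the remaining arguments map directly, as far as I can tell):

# sketch: GaussianMixture equivalent of the GMM call above
gmm_c = mixture.GaussianMixture(n_components=n_components,
                                covariance_type=cov_type,
                                max_iter=n_iters,
                                n_init=n_restarts,
                                random_state=rand_gen)
gmm_c.fit(train)
clustering = gmm_c.predict(train)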
Example #13
def test_learnspn_mixture_oneshot():

    logging.basicConfig(level=logging.INFO)
    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    n_mixtures = 10
    spns, (train_m_lls, valid_m_lls, test_m_lls) = \
        learnSPN.fit_mixture_bootstrap(train,
                                       n_mix_components=n_mixtures,
                                       valid=valid,
                                       test=test,
                                       feature_sizes=train_feature_vals)

    assert len(spns) == n_mixtures

    #
    # printing some stats
    print('TRAIN', train_m_lls.shape[0], train_m_lls.shape[1])
    print('VALID', valid_m_lls.shape[0], valid_m_lls.shape[1])
    print('TEST', test_m_lls.shape[0], test_m_lls.shape[1])

    assert train_m_lls.shape[0] == train.shape[0]
    assert valid_m_lls.shape[0] == valid.shape[0]
    assert test_m_lls.shape[0] == test.shape[0]

    train_m_file = 'train.m.lls.csv'
    valid_m_file = 'valid.m.lls.csv'
    test_m_file = 'test.m.lls.csv'

    #
    # writing the lls to csv
    numpy.savetxt(train_m_file, train_m_lls, delimiter=',', fmt='%.8e')
    numpy.savetxt(valid_m_file, valid_m_lls, delimiter=',', fmt='%.8e')
    numpy.savetxt(test_m_file, test_m_lls, delimiter=',', fmt='%.8e')
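One way to turn these per-component scores into a single mixture score, assuming each column of the lls matrices holds one component's per-instance log-likelihood (which the shapes above suggest, though the snippet does not state it), is a logsumexp across components under uniform weights:

import numpy
from scipy.special import logsumexp

# sketch: uniform-mixture average test log-likelihood from the bootstrap components
mixture_lls = logsumexp(test_m_lls, axis=1) - numpy.log(n_mixtures)
print('mixture test avg ll', numpy.mean(mixture_lls))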
Example #14
def atest_theano_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.theano_naive_factorization(features,
                                                freqs,
                                                alpha=0,
                                                batch_size=n_test_instances)
    print('Evaluating on test')
    ll = spn.eval(test.T)
    avg_ll = ll.mean()
    print(avg_ll)
Example #15
def test_learnspn_oneshot():

    logging.basicConfig(level=logging.INFO)
    #
    # loading a very simple dataset
    dataset_name = 'dna'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    spn = learnSPN.fit_structure(train, train_feature_vals)
    return spn
Example #16
def test_cluster_rows_GMM():

    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    print('Loaded dataset', dataset_name)

    #
    # specifying parameters
    n_components = 10
    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    n_restarts = 10

    kwargs = {}
    kwargs['covariance_type'] = cov_type

    print('Clustering with GMM')
    clustering = algo.learnspn.cluster_rows(train,
                                            n_clusters=n_components,
                                            cluster_method='GMM',
                                            n_iters=n_iters,
                                            n_restarts=n_restarts,
                                            rand_gen=rand_gen,
                                            sklearn_args=kwargs)
    print('Clustering')
    print('number of clusters: ', len(clustering))

    assert len(clustering) == n_components

    tot_instances = 0
    for cluster in clustering:
        tot_instances += len(cluster)
        print('cluster length:', len(cluster))

    assert tot_instances == train.shape[0]
Example #17
def aatest_linked_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.linked_naive_factorization(features, freqs, alpha=0)
    print('Evaluating on test')
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    avg_ll = numpy.mean(lls)
    print(avg_ll)
Example #18
def test_random_spn_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 1
    n_max_children = 2000
    n_scope_children = 2000
    max_scope_split = -1
    merge_prob = 0.5
    seed = 1337
    rand_gen = random.Random(seed)

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob,
                                                rand_gen=rand_gen)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    np_rand_gen = numpy.random.RandomState(seed)

    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.2,
                n_epochs=10,
                batch_size=1,
                grad_method=1,
                validation_frequency=100,
                rand_gen=np_rand_gen,
                hard=False)
Example #19
def test_g_test_on_dataset():
    #
    # loading the precomputed g-tests from Gens' code (see the spn++ repo)
    dataset_name = 'book'  # msnbc

    g_factor = 1.0  # locked value for this kind of test

    i_vals = read_ivals_from_file(dataset_name, verbose=False)

    #
    # loading the dataset (using only the training portion)
    print('Loading dataset', dataset_name)
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    n_features = train.shape[1]
    n_instances = train.shape[0]
    instance_ids = [i for i in range(n_instances)]
    feature_vals = [2 for i in range(n_features)]  # they are all binary

    g_test_start_t = perf_counter()
    #
    # computing the g_tests
    for i in range(n_features):
        for j in range(i, n_features):
            single_test_start_t = perf_counter()
            independent = algo.learnspn.g_test(i, j,
                                               instance_ids,
                                               train,
                                               feature_vals,
                                               g_factor)
            single_test_end_t = perf_counter()

            real_independent = i_vals[i][j - i]

            # print((independent, real_independent))
            #
            # checking for correspondence
            assert int(real_independent) == independent
            print('processed features', i, j, 'in',
                  (single_test_end_t - single_test_start_t), 'secs')
        # print('')
    g_test_end_t = perf_counter()
    print('Elapsed time', (g_test_end_t - g_test_start_t), 'secs')
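For reference, the quantity being thresholded here is the textbook G statistic of independence, G = 2 * sum_{x,y} O_xy * ln(O_xy / E_xy), where O_xy are the observed joint counts and E_xy the counts expected from the marginals. Below is a self-contained sketch for a pair of binary columns; it only illustrates the statistic and is not the library's g_test, which also folds the g_factor into its cut-off.

import numpy

def g_statistic(data, i, j):
    # observed counts of the four joint outcomes of binary features i and j
    n = data.shape[0]
    counts = numpy.zeros((2, 2))
    for x in range(2):
        for y in range(2):
            counts[x, y] = numpy.sum((data[:, i] == x) & (data[:, j] == y))
    # expected counts under independence, from the marginals
    expected = numpy.outer(counts.sum(axis=1), counts.sum(axis=0)) / n
    observed = counts > 0  # empty cells contribute nothing
    return 2.0 * numpy.sum(counts[observed] *
                           numpy.log(counts[observed] / expected[observed]))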
Example #20
def test_greedy_feature_split():

    # on synthetic data first
    g_factor = 2
    s_instance_ids = numpy.array([0, 2, 8, 4, 3])
    s_feature_ids = numpy.array([2, 1, 4, 0, 3])

    data_slice = DataSlice(s_instance_ids, s_feature_ids)

    feat_comp_1, feat_comp_2 = algo.learnspn.greedy_feature_split(
        data, data_slice, feature_vals, g_factor, rand_gen)
    print(feat_comp_1, feat_comp_2)
    assert set(
        list(s_feature_ids)) == set(list(feat_comp_1) + list(feat_comp_2))

    #
    # loading the dataset (using only the training portion)
    dataset_name = 'nltcs'
    print('Loading dataset', dataset_name)
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
Example #21
def test_sampling():
    # loading nltcs
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    # checking for their shape
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    n_valid_instances = valid.shape[0]

    nltcs_train = 16181
    nltcs_valid = 2157
    nltcs_test = 3236

    print('Training set with {0} instances\n'.format(n_instances) +
          'Validation set with {0} instances\n'.format(n_valid_instances) +
          'Test set with {0} instances'.format(n_test_instances))

    assert n_instances == nltcs_train
    assert n_valid_instances == nltcs_valid
    assert n_test_instances == nltcs_test

    # random sampling
    perc = 0.1
    sample_train, sample_valid, sample_test = \
        dataset.sample_sets((train, valid, test), perc)

    n_s_instances = sample_train.shape[0]
    n_s_valid_instances = sample_valid.shape[0]
    n_s_test_instances = sample_test.shape[0]

    print('Sampled training set with {0} instances\n'
          .format(n_s_instances) +
          'Sampled validation set with {0} instances\n'
          .format(n_s_valid_instances) +
          'Sampled test set with {0} instances'
          .format(n_s_test_instances))

    assert n_s_instances == int(nltcs_train * perc)
    assert n_s_valid_instances == int(nltcs_valid * perc)
    assert n_s_test_instances == int(nltcs_test * perc)
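The assertions above only pin down the sampled sizes, so dataset.sample_sets itself is not shown. A minimal sketch of a helper consistent with those assertions, subsampling rows without replacement (an illustration, not the library's implementation):

import numpy

def sample_sets_sketch(splits, perc, rand_gen=None):
    # keep int(n_rows * perc) rows per split, drawn without replacement
    if rand_gen is None:
        rand_gen = numpy.random.RandomState(1337)
    sampled = []
    for split in splits:
        n_rows = split.shape[0]
        ids = rand_gen.choice(n_rows, size=int(n_rows * perc), replace=False)
        sampled.append(split[ids, :])
    return tuple(sampled)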
Example #22
def test_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)

    print('Created SPN with\n' + spn.stats())

    print('Starting SGD')
    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.1,
                n_epochs=20,
                batch_size=1,
                hard=False)
Example #23
def test_greedy_split_features():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    learner = SpectralStructureLearner()
    k = 2
    ids = [i for i in range(train.shape[1])]
    g_factor = 9
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)
    data_slice = train[:100, :]
    # splitting on the features
    clustering = learner.greedy_split_features(data_slice.T, ids, g_factor,
                                               rand_gen)
    print(clustering)

    labels, clustering, valid = \
        learner.spectral_clustering(data_slice.T,
                                    ids,
                                    k,
                                    affinity_metric='gtest',
                                    validity_check=True,
                                    threshold=0.8,
                                    rand_gen=rand_gen)
    print(clustering)
Example #24
    dataset_splits = None
    if dataset_name == 'bmnist':
        logging.info('Loading bmnist from pickle')
        dataset_splits = load_mnist_pickle(BMNIST_PATH)
    elif dataset_name == 'caltech101':
        logging.info('Loading caltech101-silhouettes from pickle')
        dataset_splits = load_mnist_pickle(CALTECH101_PATH)
    elif dataset_name == '20newsgroups':
        logging.info('Loading 20newsgroups from pickle')
        dataset_splits = load_20newsgroups_pickle(NEWSGROUPS_PATH)
    elif dataset_name == 'ocr_letters':
        logging.info('Loading ocr letters from pickle')
        dataset_splits = load_20newsgroups_pickle(OCR_LETTERS_PATH)
    else:
        dataset_splits = dataset.load_train_val_test_csvs(dataset_name,
                                                          type=args.dtype,
                                                          suffixes=args.splits)
    for i, split in enumerate(dataset_splits):
        logging.info('\tsplit {}, shape {}, labels {}'.format(
            i, split[0].shape, split[1].shape))

    #
    # loading the learned representations
    #
    logging.info('Loading repr splits from {}'.format(args.repr_data))
    repr_splits = None
    pickle_split_path = os.path.join(
        args.repr_dir, '{}.{}'.format(args.repr_data, PICKLE_SPLIT_EXT))

    #
    # Opening the file for test prediction
Example #25
    os.makedirs(args.output, exist_ok=True)
    #
    # setting verbosity level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.DEBUG)

    logging.info("Starting with arguments:\n%s", args)

    #
    # loading dataset splits
    logging.info('Loading datasets: %s', args.dataset)
    dataset_name = args.dataset
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    logging.info('train shape: {}\nvalid shape: {}\ntest shape: {}'.format(
        train.shape, valid.shape, test.shape))

    n_instances = train.shape[0]
    n_features = train.shape[1]
    assert valid.shape[1] == n_features
    assert test.shape[1] == n_features

    feature_file_path = '{}.{}.{}'.format(args.suffix, dataset_name,
                                          FEATURE_FILE_EXT)
    feature_file_path = os.path.join(args.output, feature_file_path)
    logging.info('Saving features to {}'.format(feature_file_path))

    if args.rand_marg:
        logging.info('Rand mask feature generation')
Example #26
model_path = args.model

logging.info('\nLoading spn model from: {}'.format(model_path))
spn = None
with open(model_path, 'rb') as model_file:
    load_start_t = perf_counter()
    spn = pickle.load(model_file)
    load_end_t = perf_counter()
    logging.info('done in {}'.format(load_end_t - load_start_t))

#
# loading dataset
dataset_name = args.dataset
logging.info('Loading dataset {}'.format(dataset_name))
train, valid, test = dataset.load_train_val_test_csvs(dataset_name,
                                                      path='data/')

logging.info('\nEvaluating on training set')
eval_s_t = perf_counter()
train_preds = evaluate_on_dataset(spn, train)
eval_e_t = perf_counter()
train_avg_ll = numpy.mean(train_preds)
logging.info('\t{}'.format(train_avg_ll))
logging.info('\tdone in {}'.format(eval_e_t - eval_s_t))

logging.info('Evaluating on validation set')
eval_s_t = perf_counter()
valid_preds = evaluate_on_dataset(spn, valid)
eval_e_t = perf_counter()
valid_avg_ll = numpy.mean(valid_preds)
logging.info('\t{}'.format(valid_avg_ll))
Example #27
else:
    sklearn_args = {}
logging.info(sklearn_args)

# initing the random generators
seed = args.seed
MAX_RAND_SEED = 99999999  # sys.maxsize
rand_gen = random.Random(seed)
numpy_rand_gen = numpy.random.RandomState(seed)

#
# elaborating the dataset
#
logging.info('Loading datasets: %s', args.dataset)
(dataset_name,) = args.dataset
train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
n_instances = train.shape[0]
n_test_instances = test.shape[0]
#
# estimating the frequencies for the features
logging.info('Estimating features on training set...')
freqs, features = dataset.data_2_freqs(train)


#
# Opening the file for test prediction
#
logging.info('Opening log file...')
date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
out_path = args.output + dataset_name + '_' + date_string
out_log_path = out_path + '/exp.log'
Example #28
import dataset

train, valid, test = dataset.load_train_val_test_csvs("bnetflix")
freqs, features = dataset.data_2_freqs(train)
print(train.shape)
print(len(features))