Example No. 1
    def __init__(self, var, var_values, alpha=0.1,
                 freqs=None, data=None, instances=None):
        """
        WRITEME
        """

        Node.__init__(self, frozenset({var}))

        self.var = var
        self.var_val = var_values

        # building / storing the frequencies
        if data is None:
            if freqs is None:
                self._var_freqs = [1 for i in range(var_values)]
            else:
                self._var_freqs = freqs[:]
        else:
            # better checking for the numpy array shape
            assert data.shape[1] == 1
            (freqs_dict,), _features = dataset.data_2_freqs(data)
            self._var_freqs = freqs_dict['freqs']

        # computing the smoothed ll
        self._var_probs = CategoricalSmoothedNode.smooth_ll(self._var_freqs[:],
                                                            alpha)

        # storing instance ids (it is a list)
        self._instances = instances
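
The smoothing itself is delegated to CategoricalSmoothedNode.smooth_ll, whose body is not shown here. As a rough, self-contained sketch of what additive (Laplace) smoothing of value counts into log-probabilities typically looks like (the function name smooth_log_probs and the exact formula are illustrative assumptions, not the library's implementation):

import numpy

def smooth_log_probs(freqs, alpha):
    # additive (Laplace) smoothing: add the pseudo-count alpha to every
    # value count, renormalise, and return log-probabilities
    # NOTE: illustrative sketch only, not CategoricalSmoothedNode.smooth_ll
    freqs = numpy.asarray(freqs, dtype=float)
    probs = (freqs + alpha) / (freqs.sum() + alpha * len(freqs))
    return numpy.log(probs)

# e.g. a binary variable observed 3 times as 0 and once as 1, alpha = 0.1:
# smooth_log_probs([3, 1], 0.1) -> log([~0.738, ~0.262])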
Example No. 2
def atest_theano_nltcs_kernel_spn():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.theano_kernel_density_estimation(
        n_instances,
        features,
        batch_size=n_test_instances,
        sparse=True)
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    print(avg_ll)
Example No. 3
def test_linked_nltcs_kernel_spn_perf():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('ninst', n_instances, 'feats', features)
    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)
    print(spn.stats())
    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    eval_start_t = perf_counter()
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    eval_end_t = perf_counter()
    print('AVG LL {0} in {1} secs'.format(avg_ll, eval_end_t - eval_start_t))
Example No. 4
def test_random_spn_em():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 2
    n_max_children = 4
    n_scope_children = 5
    max_scope_split = 3
    merge_prob = 0.5
    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features, n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split, merge_prob)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    spn.fit_em(train, valid, test, hard=False, n_epochs=10)
Example No. 5
    def smooth_freq_from_data(data, alpha):
        """
        WRITEME
        """
        # data here shall have only one feature
        assert data.shape[1] == 1
        (freqs_dict,), _features = dataset.data_2_freqs(data)

        return CategoricalSmoothedNode.smooth_ll(freqs_dict['freqs'], alpha)
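
The frequency estimation itself is done by dataset.data_2_freqs, which is part of the library and not shown. For a single categorical column, a minimal numpy sketch of the counting step (the helper name count_value_freqs is hypothetical) could look like:

import numpy

def count_value_freqs(column, n_values):
    # counts how many times each value 0..n_values-1 occurs in a 1-d column
    # NOTE: illustrative sketch, not the library's dataset.data_2_freqs
    return numpy.bincount(column.astype(int), minlength=n_values)

# count_value_freqs(numpy.array([0, 1, 1, 0, 1]), 2) -> array([2, 3])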
Example No. 6
def atest_nltcs_em_fit():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)
    print('EM training')

    spn.fit_em(train, valid, test, hard=True, epochs=2)
Example No. 7
def atest_theano_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.theano_naive_factorization(features,
                                                freqs,
                                                alpha=0,
                                                batch_size=n_test_instances)
    print('Evaluating on test')
    ll = spn.eval(test.T)
    avg_ll = ll.mean()
    print(avg_ll)
Example No. 8
def aatest_linked_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.linked_naive_factorization(features, freqs, alpha=0)
    print('Evaluating on test')
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    avg_ll = numpy.mean(lls)
    print(avg_ll)
Example No. 9
def test_random_spn_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 1
    n_max_children = 2000
    n_scope_children = 2000
    max_scope_split = -1
    merge_prob = 0.5
    seed = 1337
    rand_gen = random.Random(seed)

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob,
                                                rand_gen=rand_gen)

    assert spn.is_valid()
    print('Stats\n')
    print(spn.stats())

    np_rand_gen = numpy.random.RandomState(seed)

    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.2,
                n_epochs=10,
                batch_size=1,
                grad_method=1,
                validation_frequency=100,
                rand_gen=np_rand_gen,
                hard=False)
Example No. 10
def test_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)

    print('Created SPN with\n' + spn.stats())

    print('Starting SGD')
    spn.fit_sgd(train,
                valid,
                test,
                learning_rate=0.1,
                n_epochs=20,
                batch_size=1,
                hard=False)
Example No. 11
    logging.info("Starting with arguments:\n%s", args)

    #
    # loading dataset splits
    logging.info('Loading datasets: %s', args.dataset)
    dataset_path = args.dataset
    train, valid, test = dataset.load_dataset_splits(
        dataset_path,
        filter_regex=[args.train_ext, args.valid_ext, args.test_ext])
    dataset_name = args.train_ext.split('.')[0]

    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    logging.info('\ttrain: {}\n\tvalid: {}\n\ttest: {}'.format(
        train.shape, valid.shape, test.shape))
    freqs, feature_vals = dataset.data_2_freqs(train)

    logging.info('Opening log file...')
    date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    out_path = args.output + dataset_name + '_' + date_string
    out_log_path = out_path + '/exp.log'
    test_lls_path = out_path + '/test.lls'
    os.makedirs(out_path, exist_ok=True)

    repr_train = None
    repr_valid = None
    repr_test = None

    #
    #
    # performing a grid search along the hyperparameter space
Example No. 12
    def learn_model(self, cltree_leaves, args, comp, bgg):
        # set parameters for learning AC (cltree_leaves=True) and AL (cltree_leaves=False)
        print('-------MODELS CONSTRUCTION-----------')
        verbose = 1
        n_row_clusters = 2
        cluster_method = 'GMM'
        seed = 1337
        n_iters = 100
        n_restarts = 4
        cluster_penalties = [1.0]
        sklearn_Args = None
        if not args:
            g_factors = [5, 10, 15]
            min_inst_slices = [10, 50, 100]
            alphas = [0.1, 0.5, 1.0, 2.0]
        else:
            g_factors = [args[0]]
            min_inst_slices = [args[1]]
            alphas = [args[2]]
        # setting verbosity level
        if verbose == 1:
            logging.basicConfig(level=logging.INFO)
        elif verbose == 2:
            logging.basicConfig(level=logging.DEBUG)

        # logging.info("Starting with arguments:\n")

        if sklearn_Args is not None:
            sklearn_key_value_pairs = sklearn_Args.translate({
                ord('['): '',
                ord(']'): ''
            }).split(',')
            sklearn_args = {
                key.strip(): value.strip()
                for key, value in
                [pair.strip().split('=') for pair in sklearn_key_value_pairs]
            }
        else:
            sklearn_args = {}
        # logging.info(sklearn_args)

        # initing the random generators
        MAX_RAND_SEED = 99999999  # sys.maxsize
        rand_gen = random.Random(seed)
        numpy_rand_gen = numpy.random.RandomState(seed)

        #
        # elaborating the dataset
        #

        dataset_name = self.dataset
        # logging.info('Loading datasets: %s', dataset_name)
        train = self.train
        n_instances = train.shape[0]

        #
        # estimating the frequencies for the features
        # logging.info('')
        freqs, features = dataset.data_2_freqs(train)
        best_train_avg_ll = NEG_INF
        best_state = {}
        best_test_lls = None
        index = 0
        spns = []
        for g_factor in g_factors:
            for cluster_penalty in cluster_penalties:
                for min_inst_slice in min_inst_slices:
                    print('model')
                    # Creating the structure learner
                    learner = LearnSPN(
                        g_factor=g_factor,
                        min_instances_slice=min_inst_slice,
                        # alpha=alpha,
                        row_cluster_method=cluster_method,
                        cluster_penalty=cluster_penalty,
                        n_cluster_splits=n_row_clusters,
                        n_iters=n_iters,
                        n_restarts=n_restarts,
                        sklearn_args=sklearn_args,
                        cltree_leaves=cltree_leaves,
                        rand_gen=numpy_rand_gen)

                    learn_start_t = perf_counter()

                    # build an spn on the training set
                    if (bgg):
                        spn = learner.fit_structure_bagging(
                            data=train,
                            feature_sizes=features,
                            n_components=comp)
                    else:
                        spn = learner.fit_structure(data=train,
                                                    feature_sizes=features)

                    learn_end_t = perf_counter()
                    n_edges = spn.n_edges()
                    n_levels = spn.n_layers()
                    n_weights = spn.n_weights()
                    n_leaves = spn.n_leaves()

                    #
                    # smoothing can be done after the spn has been built
                    for alpha in alphas:
                        # logging.info('Smoothing leaves with alpha = %f', alpha)
                        spn.smooth_leaves(alpha)
                        spns.append(spn)

                        # Compute LL on training set
                        # logging.info('Evaluating on training set')
                        train_ll = 0.0

                        for instance in train:
                            (pred_ll, ) = spn.eval(instance)
                            train_ll += pred_ll
                        train_avg_ll = train_ll / train.shape[0]

                        # updating best stats according to train ll
                        if train_avg_ll > best_train_avg_ll:
                            best_train_avg_ll = train_avg_ll
                            best_state['alpha'] = alpha
                            best_state['min_inst_slice'] = min_inst_slice
                            best_state['g_factor'] = g_factor
                            best_state['cluster_penalty'] = cluster_penalty
                            best_state['train_ll'] = train_avg_ll
                            best_state['index'] = index
                            best_state['name'] = self.dataset

                        # writing to file a line for the grid
                        # stats = stats_format([g_factor,
                        #                       cluster_penalty,
                        #                       min_inst_slice,
                        #                       alpha,
                        #                       n_edges, n_levels,
                        #                       n_weights, n_leaves,
                        #                       train_avg_ll],
                        #                      '\t',
                        #                      digits=5)
                        index += 1  # keeps best_state['index'] aligned with spns

        best_spn = spns[best_state['index']]
        # logging.info('Grid search ended.')
        # logging.info('Best params:\n\t%s', best_state)

        return (best_spn, best_state['g_factor'],
                best_state['min_inst_slice'], best_state['alpha'])
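
The nested loops above enumerate every combination of g_factor, cluster_penalty, min_inst_slice and alpha and keep the model with the best average training log-likelihood. A stripped-down, self-contained sketch of the same grid-search pattern (build_and_score is a placeholder callable, not part of the library):

from itertools import product

def grid_search(param_grid, build_and_score):
    # param_grid: dict mapping a parameter name to a list of candidate values
    # build_and_score: callable taking a params dict, returning (model, avg_ll)
    best_ll, best_model, best_params = float('-inf'), None, None
    names = list(param_grid)
    for values in product(*(param_grid[name] for name in names)):
        params = dict(zip(names, values))
        model, avg_ll = build_and_score(params)
        if avg_ll > best_ll:
            best_ll, best_model, best_params = avg_ll, model, params
    return best_model, best_params, best_ll

# e.g. grid_search({'g_factor': [5, 10, 15], 'alpha': [0.1, 0.5, 1.0, 2.0]},
#                  some_build_and_score_fn)  # some_build_and_score_fn is hypothetical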
Example No. 13
MAX_RAND_SEED = 99999999  # sys.maxsize
rand_gen = random.Random(seed)
numpy_rand_gen = numpy.random.RandomState(seed)

#
# elaborating the dataset
#
logging.info('Loading datasets: %s', args.dataset)
(dataset_name,) = args.dataset
train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
n_instances = train.shape[0]
n_test_instances = test.shape[0]
#
# estimating the frequencies for the features
logging.info('Estimating features on training set...')
freqs, features = dataset.data_2_freqs(train)


#
# Opening the file for test prediction
#
logging.info('Opening log file...')
date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
out_path = args.output + dataset_name + '_' + date_string
out_log_path = out_path + '/exp.log'
test_lls_path = out_path + '/test.lls'

#
# creating dir if non-existant
if not os.path.exists(os.path.dirname(out_log_path)):
    os.makedirs(os.path.dirname(out_log_path))
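
On Python 3.2 and later, the exists-check followed by os.makedirs can be collapsed into a single call (as an earlier snippet above already does with os.makedirs(out_path, exist_ok=True)); this form is also safe if the directory appears between the check and the creation:

# equivalent to the check-then-create pattern above
os.makedirs(os.path.dirname(out_log_path), exist_ok=True)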