def __init__(self, var, var_values, alpha=0.1, freqs=None, data=None, instances=None):
    """
    Create a smoothed categorical leaf node for variable `var`
    taking `var_values` distinct values, smoothed with pseudo-count `alpha`.
    """
    Node.__init__(self, frozenset({var}))

    self.var = var
    self.var_val = var_values

    # building and storing the frequencies
    if data is None:
        if freqs is None:
            self._var_freqs = [1 for i in range(var_values)]
        else:
            self._var_freqs = freqs[:]
    else:
        # TODO: better checking for numpy array shapes
        assert data.shape[1] == 1
        (freqs_dict,), _features = dataset.data_2_freqs(data)
        self._var_freqs = freqs_dict['freqs']

    # computing the smoothed log-likelihoods
    self._var_probs = CategoricalSmoothedNode.smooth_ll(self._var_freqs[:], alpha)

    # storing instance ids (it is a list)
    self._instances = instances
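# Standalone sketch of what smooth_ll is assumed to compute above: Laplace
# (additive) smoothing of the raw counts with pseudo-count alpha, returned in
# log space. The helper name below is hypothetical and only illustrates the
# computation; it is not part of the node API.
import math


def _laplace_smoothed_log_probs(freqs, alpha):
    """Turn raw counts into additively smoothed log-probabilities."""
    total = sum(freqs)
    n_values = len(freqs)
    return [math.log((freq + alpha) / (total + alpha * n_values))
            for freq in freqs]

# e.g. _laplace_smoothed_log_probs([3, 1], alpha=0.1)
# -> [log(3.1 / 4.2), log(1.1 / 4.2)]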
def atest_theano_nltcs_kernel_spn():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.theano_kernel_density_estimation(
        n_instances,
        features,
        batch_size=n_test_instances,
        sparse=True)

    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))

    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    print(avg_ll)
def test_linked_nltcs_kernel_spn_perf():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)
    print('ninst', n_instances, 'feats', features)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)
    print(spn.stats())

    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    eval_start_t = perf_counter()
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))

    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    eval_end_t = perf_counter()
    print('AVG LL {0} in {1} secs'.format(avg_ll, eval_end_t - eval_start_t))
def test_random_spn_em():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 2
    n_max_children = 4
    n_scope_children = 5
    max_scope_split = 3
    merge_prob = 0.5

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob)

    assert spn.is_valid()

    print('Stats\n')
    print(spn.stats())

    spn.fit_em(train, valid, test,
               hard=False,
               n_epochs=10)
def smooth_freq_from_data(data, alpha):
    """
    Estimate the smoothed log-likelihoods for a single-feature data slice.
    """
    # data here shall have only one feature
    assert data.shape[1] == 1
    (freqs_dict,), _features = dataset.data_2_freqs(data)
    return CategoricalSmoothedNode.smooth_ll(freqs_dict['freqs'], alpha)
def atest_nltcs_em_fit():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)

    print('EM training')
    spn.fit_em(train, valid, test, hard=True, n_epochs=2)
def atest_theano_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.theano_naive_factorization(features,
                                                freqs,
                                                alpha=0,
                                                batch_size=n_test_instances)

    print('Evaluating on test')
    ll = spn.eval(test.T)
    avg_ll = ll.mean()
    print(avg_ll)
def aatest_linked_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.linked_naive_factorization(features, freqs, alpha=0)

    print('Evaluating on test')
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))

    print('Mean lls')
    avg_ll = numpy.mean(lls)
    print(avg_ll)
def test_random_spn_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 1
    n_max_children = 2000
    n_scope_children = 2000
    max_scope_split = -1
    merge_prob = 0.5

    seed = 1337
    rand_gen = random.Random(seed)

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob,
                                                rand_gen=rand_gen)

    assert spn.is_valid()

    print('Stats\n')
    print(spn.stats())

    np_rand_gen = numpy.random.RandomState(seed)

    spn.fit_sgd(train, valid, test,
                learning_rate=0.2,
                n_epochs=10,
                batch_size=1,
                grad_method=1,
                validation_frequency=100,
                rand_gen=np_rand_gen,
                hard=False)
def test_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances, features)
    print('Created SPN with\n' + spn.stats())

    print('Starting SGD')
    spn.fit_sgd(train, valid, test,
                learning_rate=0.1,
                n_epochs=20,
                batch_size=1,
                hard=False)
logging.info("Starting with arguments:\n%s", args) # # loading dataset splits logging.info('Loading datasets: %s', args.dataset) dataset_path = args.dataset train, valid, test = dataset.load_dataset_splits( dataset_path, filter_regex=[args.train_ext, args.valid_ext, args.test_ext]) dataset_name = args.train_ext.split('.')[0] n_instances = train.shape[0] n_test_instances = test.shape[0] logging.info('\ttrain: {}\n\tvalid: {}\n\ttest: {}'.format( train.shape, valid.shape, test.shape)) freqs, feature_vals = dataset.data_2_freqs(train) logging.info('Opening log file...') date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") out_path = args.output + dataset_name + '_' + date_string out_log_path = out_path + '/exp.log' test_lls_path = out_path + '/test.lls' os.makedirs(out_path, exist_ok=True) repr_train = None repr_valid = None repr_test = None # # # performing a grid search along the hyperparameter space
def learn_model(self, cltree_leaves, args, comp, bgg):
    # set parameters for learning AC (cltree_leaves=True) and AL (cltree_leaves=False)
    print('-------MODELS CONSTRUCTION-----------')

    verbose = 1
    n_row_clusters = 2
    cluster_method = 'GMM'
    seed = 1337
    n_iters = 100
    n_restarts = 4
    cluster_penalties = [1.0]
    sklearn_Args = None

    if not args:
        g_factors = [5, 10, 15]
        min_inst_slices = [10, 50, 100]
        alphas = [0.1, 0.5, 1.0, 2.0]
    else:
        g_factors = [args[0]]
        min_inst_slices = [args[1]]
        alphas = [args[2]]

    # setting verbosity level
    if verbose == 1:
        logging.basicConfig(level=logging.INFO)
    elif verbose == 2:
        logging.basicConfig(level=logging.DEBUG)

    # logging.info("Starting with arguments:\n")

    if sklearn_Args is not None:
        sklearn_key_value_pairs = sklearn_Args.translate(
            {ord('['): '', ord(']'): ''}).split(',')
        sklearn_args = {key.strip(): value.strip()
                        for key, value in
                        [pair.strip().split('=')
                         for pair in sklearn_key_value_pairs]}
    else:
        sklearn_args = {}
    # logging.info(sklearn_args)

    # initing the random generators
    MAX_RAND_SEED = 99999999  # sys.maxsize
    rand_gen = random.Random(seed)
    numpy_rand_gen = numpy.random.RandomState(seed)

    #
    # elaborating the dataset
    #
    dataset_name = self.dataset
    # logging.info('Loading datasets: %s', dataset_name)
    train = self.train
    n_instances = train.shape[0]

    #
    # estimating the frequencies for the features
    # logging.info('')
    freqs, features = dataset.data_2_freqs(train)

    best_train_avg_ll = NEG_INF
    best_state = {}
    best_test_lls = None

    index = 0
    spns = []

    for g_factor in g_factors:
        for cluster_penalty in cluster_penalties:
            for min_inst_slice in min_inst_slices:

                print('model')
                # Creating the structure learner
                learner = LearnSPN(g_factor=g_factor,
                                   min_instances_slice=min_inst_slice,
                                   # alpha=alpha,
                                   row_cluster_method=cluster_method,
                                   cluster_penalty=cluster_penalty,
                                   n_cluster_splits=n_row_clusters,
                                   n_iters=n_iters,
                                   n_restarts=n_restarts,
                                   sklearn_args=sklearn_args,
                                   cltree_leaves=cltree_leaves,
                                   rand_gen=numpy_rand_gen)

                learn_start_t = perf_counter()
                # build an spn on the training set
                if bgg:
                    spn = learner.fit_structure_bagging(data=train,
                                                        feature_sizes=features,
                                                        n_components=comp)
                else:
                    spn = learner.fit_structure(data=train,
                                                feature_sizes=features)
                learn_end_t = perf_counter()

                n_edges = spn.n_edges()
                n_levels = spn.n_layers()
                n_weights = spn.n_weights()
                n_leaves = spn.n_leaves()

                #
                # smoothing can be done after the spn has been built
                for alpha in alphas:
                    # logging.info('Smoothing leaves with alpha = %f', alpha)
                    spn.smooth_leaves(alpha)
                    spns.append(spn)

                    # Compute LL on training set
                    # logging.info('Evaluating on training set')
                    train_ll = 0.0
                    for instance in train:
                        (pred_ll, ) = spn.eval(instance)
                        train_ll += pred_ll
                    train_avg_ll = train_ll / train.shape[0]

                    # updating best stats according to train ll
                    if train_avg_ll > best_train_avg_ll:
                        best_train_avg_ll = train_avg_ll
                        best_state['alpha'] = alpha
                        best_state['min_inst_slice'] = min_inst_slice
                        best_state['g_factor'] = g_factor
                        best_state['cluster_penalty'] = cluster_penalty
                        best_state['train_ll'] = train_avg_ll
                        best_state['index'] = index
                        best_state['name'] = self.dataset

                    # writing to file a line for the grid
                    # stats = stats_format([g_factor,
                    #                       cluster_penalty,
                    #                       min_inst_slice,
                    #                       alpha,
                    #                       n_edges, n_levels,
                    #                       n_weights, n_leaves,
                    #                       train_avg_ll],
                    #                      '\t',
                    #                      digits=5)

                    index = index + 1

    best_spn = spns[best_state['index']]
    # logging.info('Grid search ended.')
    # logging.info('Best params:\n\t%s', best_state)

    return (best_spn,
            best_state['g_factor'],
            best_state['min_inst_slice'],
            best_state['alpha'])
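# Hedged usage sketch (the wrapper object name `experiment` and its attributes
# are assumptions based on the signature above: something exposing .dataset
# and .train):
#
#   best_spn, g_factor, min_inst_slice, alpha = experiment.learn_model(
#       cltree_leaves=True,   # True -> CLTree leaves (AC), False -> AL
#       args=None,            # None -> search the default grid
#       comp=5,               # number of bagging components
#       bgg=False)            # fit a single structure, no bagging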
MAX_RAND_SEED = 99999999  # sys.maxsize

rand_gen = random.Random(seed)
numpy_rand_gen = numpy.random.RandomState(seed)

#
# elaborating the dataset
#
logging.info('Loading datasets: %s', args.dataset)
(dataset_name,) = args.dataset
train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
n_instances = train.shape[0]
n_test_instances = test.shape[0]

#
# estimating the frequencies for the features
logging.info('Estimating features on training set...')
freqs, features = dataset.data_2_freqs(train)

#
# Opening the file for test prediction
#
logging.info('Opening log file...')
date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
out_path = args.output + dataset_name + '_' + date_string
out_log_path = out_path + '/exp.log'
test_lls_path = out_path + '/test.lls'

#
# creating the dir if non-existent
os.makedirs(os.path.dirname(out_log_path), exist_ok=True)