def test_spn_eval_opt():
    logging.basicConfig(level=logging.INFO)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo (seeding a numpy RandomState, as in the other tests)
    rand_gen = numpy.random.RandomState(1337)
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    learn_start_t = perf_counter()
    #
    # start learning
    spn = learnSPN.fit_structure(train, train_feature_vals)
    learn_end_t = perf_counter()
    print('Network learned in', (learn_end_t - learn_start_t), 'secs')

    #
    # now checking performances
    infer_start_t = perf_counter()
    train_ll = 0.0
    print('Starting inference')
    for instance in train:
        (pred_ll, ) = spn.eval(instance)
        train_ll += pred_ll
    train_avg_ll = train_ll / train.shape[0]
    infer_end_t = perf_counter()
    # train avg ll -6.0180987340354 done in 43.947853350000514 secs
    print('train avg ll', train_avg_ll, 'done in',
          infer_end_t - infer_start_t, 'secs')

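#
# a small helper sketch to avoid repeating the evaluation loop above; it
# assumes spn.eval(instance) returns a tuple whose first element is the
# instance log-likelihood, as in test_spn_eval_opt (avg_log_likelihood
# is a hypothetical helper, not part of the library)
def avg_log_likelihood(spn, data):
    ll = 0.0
    for instance in data:
        (pred_ll, ) = spn.eval(instance)
        ll += pred_ll
    return ll / data.shape[0]
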
def test_random_spn_em():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 2
    n_max_children = 4
    n_scope_children = 5
    max_scope_split = 3
    merge_prob = 0.5

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob)
    assert spn.is_valid()

    print('Stats\n')
    print(spn.stats())

    spn.fit_em(train, valid, test, hard=False, n_epochs=10)

def test_learnspn_oneshot():
    logging.basicConfig(level=logging.INFO)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo (seeding a numpy RandomState, as in the other tests)
    rand_gen = numpy.random.RandomState(1337)
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    spn = learnSPN.fit_structure(train, train_feature_vals)
    # print(spn)

    #
    # testing on-the-fly
    ll = 0.0
    for instance in test:
        ll += spn.single_eval(instance)[0]
    print('avg ll', ll / test.shape[0])

def test_linked_nltcs_kernel_spn_perf():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)
    print('ninst', n_instances, 'feats', features)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)
    print(spn.stats())

    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    eval_start_t = perf_counter()
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    eval_end_t = perf_counter()
    print('AVG LL {0} in {1} secs'.format(avg_ll,
                                          eval_end_t - eval_start_t))

def atest_theano_nltcs_kernel_spn():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.theano_kernel_density_estimation(
        n_instances,
        features,
        batch_size=n_test_instances,
        sparse=True)

    print('Evaluating on test')
    # evaluating one at a time since we are using a sparse representation
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    # avg_ll = sum(lls) / float(len(lls))
    avg_ll = numpy.mean(lls)
    print(avg_ll)

def test_DPGMM():
    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)
    verbose = True

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

    #
    # this is the max number of clusters for a truncated DP
    n_components = 100
    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    # a higher alpha means more clusters,
    # as the expected number of clusters is alpha*log(N)
    concentration = 1.0

    dpgmm_c = mixture.DPGMM(n_components=n_components,
                            covariance_type=cov_type,
                            random_state=rand_gen,
                            n_iter=n_iters,
                            alpha=concentration,
                            verbose=verbose)

    #
    # fitting to training set
    fit_start_t = perf_counter()
    dpgmm_c.fit(train)
    fit_end_t = perf_counter()

    #
    # getting the cluster assignment
    pred_start_t = perf_counter()
    clustering = dpgmm_c.predict(train)
    pred_end_t = perf_counter()

    print('Clustering')
    print('for instances: ', clustering.shape[0])
    print(clustering)
    print('smallest cluster', numpy.min(clustering))
    print('biggest cluster', numpy.max(clustering))
    print('clustering done in', (fit_end_t - fit_start_t), 'secs')
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')

    #
    # predicting probabilities
    pred_start_t = perf_counter()
    clustering_p = dpgmm_c.predict_proba(train)
    pred_end_t = perf_counter()
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')
    print(clustering_p.shape[0], clustering_p.shape[1])

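#
# a minimal sketch, assuming a recent scikit-learn: mixture.DPGMM was
# deprecated (and later removed) in favor of BayesianGaussianMixture,
# where a dirichlet_process weight prior plays the role of the truncated
# DP above (make_dpgmm_bayesian is a hypothetical helper)
def make_dpgmm_bayesian(n_components=100, cov_type='diag',
                        concentration=1.0, n_iters=1000, rand_gen=None):
    from sklearn.mixture import BayesianGaussianMixture
    return BayesianGaussianMixture(
        n_components=n_components,
        covariance_type=cov_type,
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=concentration,
        max_iter=n_iters,
        random_state=rand_gen)
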
def test_spectral_clustering():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    learner = SpectralStructureLearner(sigma=4.0)
    k = 5
    ids = [i for i in range(train.shape[0])]
    labels, clusters, valid = learner.spectral_clustering(train, ids, k)
    print('labels:{0}\nclusters:{1}'.format(labels, clusters))

def test_spnsvd_eval_nltcs():
    import dataset

    logging.basicConfig(level=logging.INFO)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = numpy.array([2 for i in range(train.shape[1])])
    print('Loaded dataset', dataset_name)

    gamma = 2.01
    alpha = 0.1
    cltl = True

    #
    # initing the algo
    learner = SpnSVD(gamma=gamma, alpha=alpha, cltree_leaves=cltl)

    learn_start_t = perf_counter()
    #
    # start learning
    spn = learner.fit_structure(train, train_feature_vals)
    learn_end_t = perf_counter()
    print('Network learned in', (learn_end_t - learn_start_t), 'secs')
    # print(spn)

    # #
    # # checking performances on the training set
    # # (train avg ll -6.0180987340354 done in 43.947853350000514 secs)
    # infer_start_t = perf_counter()
    # train_ll = 0.0
    # print('Starting inference')
    # for instance in train:
    #     (pred_ll, ) = spn.eval(instance)
    #     train_ll += pred_ll
    # train_avg_ll = train_ll / train.shape[0]
    # infer_end_t = perf_counter()
    # print('train avg ll', train_avg_ll, 'done in',
    #       infer_end_t - infer_start_t, 'secs')

    infer_start_t = perf_counter()
    test_ll = 0.0
    print('Starting inference')
    for instance in test:
        (pred_ll, ) = spn.eval(instance)
        test_ll += pred_ll
    test_avg_ll = test_ll / test.shape[0]
    infer_end_t = perf_counter()
    print('test avg ll', test_avg_ll, 'done in',
          infer_end_t - infer_start_t, 'secs')

def test_compare_spectral_performance():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    learner = SpectralStructureLearner()
    k = 5
    ids = [i for i in range(train.shape[1])]
    labels, clusters, valid = \
        learner.spectral_clustering(train.T, ids, k,
                                    affinity_metric='gtest',
                                    pair=True)

def test_spectral_cluster_learner():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    # the original snippet referenced undefined synth_data/synth_feats;
    # the loaded training split and its estimated feature sizes are
    # assumed here instead
    freqs, features = dataset.data_2_freqs(train)
    print('features', features)

    learner = SpectralStructureLearner()
    k = 5
    spn = learner.fit_structure(train, features,
                                k_row_clusters=k,
                                min_instances_slice=2,
                                pairwise=True)
    print(spn.stats())

def atest_nltcs_em_fit():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)

    print('EM training')
    # n_epochs, matching the keyword used by the other fit_em calls
    spn.fit_em(train, valid, test, hard=True, n_epochs=2)

def test_GMM():
    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)
    verbose = True

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

    #
    # creating the classifier object
    n_components = 10
    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    n_restarts = 10
    gmm_c = mixture.GMM(n_components=n_components,
                        covariance_type=cov_type,
                        random_state=rand_gen,
                        n_iter=n_iters,
                        n_init=n_restarts)

    #
    # fitting to training set
    fit_start_t = perf_counter()
    gmm_c.fit(train)
    fit_end_t = perf_counter()

    #
    # getting the cluster assignment
    pred_start_t = perf_counter()
    clustering = gmm_c.predict(train)
    pred_end_t = perf_counter()

    print('Clustering')
    print('for instances: ', clustering.shape[0])
    print(clustering)
    print('smallest cluster', numpy.min(clustering))
    print('biggest cluster', numpy.max(clustering))
    print('clustering done in', (fit_end_t - fit_start_t), 'secs')
    print('prediction done in', (pred_end_t - pred_start_t), 'secs')

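#
# a minimal sketch, assuming a recent scikit-learn: mixture.GMM was
# deprecated (and later removed) in favor of GaussianMixture; make_gmm
# is a hypothetical helper mirroring the parameters used above
def make_gmm(n_components=10, cov_type='diag', n_iters=1000,
             n_restarts=10, rand_gen=None):
    from sklearn.mixture import GaussianMixture
    return GaussianMixture(n_components=n_components,
                           covariance_type=cov_type,
                           max_iter=n_iters,
                           n_init=n_restarts,
                           random_state=rand_gen)
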
def test_learnspn_mixture_oneshot():
    logging.basicConfig(level=logging.INFO)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo (seeding a numpy RandomState, as in the other tests)
    rand_gen = numpy.random.RandomState(1337)
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    n_mixtures = 10
    spns, (train_m_lls, valid_m_lls, test_m_lls) = \
        learnSPN.fit_mixture_bootstrap(train,
                                       n_mix_components=n_mixtures,
                                       valid=valid,
                                       test=test,
                                       feature_sizes=train_feature_vals)
    assert len(spns) == n_mixtures

    #
    # printing some stats
    print('TRAIN', train_m_lls.shape[0], train_m_lls.shape[1])
    print('VALID', valid_m_lls.shape[0], valid_m_lls.shape[1])
    print('TEST', test_m_lls.shape[0], test_m_lls.shape[1])

    assert train_m_lls.shape[0] == train.shape[0]
    assert valid_m_lls.shape[0] == valid.shape[0]
    assert test_m_lls.shape[0] == test.shape[0]

    train_m_file = 'train.m.lls.csv'
    valid_m_file = 'valid.m.lls.csv'
    test_m_file = 'test.m.lls.csv'

    #
    # saving to csv
    numpy.savetxt(train_m_file, train_m_lls, delimiter=',', fmt='%.8e')
    numpy.savetxt(valid_m_file, valid_m_lls, delimiter=',', fmt='%.8e')
    numpy.savetxt(test_m_file, test_m_lls, delimiter=',', fmt='%.8e')

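#
# a sketch of how the saved per-component lls could be recombined into a
# single mixture log-likelihood; it assumes each csv column holds one
# component's instance lls, uniform mixture weights, and a scipy recent
# enough to provide scipy.special.logsumexp (mixture_avg_ll is a
# hypothetical helper)
def mixture_avg_ll(m_lls):
    from scipy.special import logsumexp
    # log(1/K * sum_k exp(ll_k)) = logsumexp(ll) - log K, row by row
    mixture_lls = logsumexp(m_lls, axis=1) - numpy.log(m_lls.shape[1])
    return numpy.mean(mixture_lls)
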
def atest_theano_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.theano_naive_factorization(features,
                                                freqs,
                                                alpha=0,
                                                batch_size=n_test_instances)

    print('Evaluating on test')
    ll = spn.eval(test.T)
    avg_ll = ll.mean()
    print(avg_ll)

def test_learnspn_oneshot_dna():
    # renamed from test_learnspn_oneshot to avoid clashing with the
    # nltcs version above
    logging.basicConfig(level=logging.INFO)

    #
    # loading a very simple dataset
    dataset_name = 'dna'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    train_feature_vals = [2 for i in range(train.shape[1])]
    print('Loaded dataset', dataset_name)

    #
    # initing the algo (seeding a numpy RandomState, as in the other tests)
    rand_gen = numpy.random.RandomState(1337)
    learnSPN = algo.learnspn.LearnSPN(rand_gen=rand_gen)

    #
    # start learning
    spn = learnSPN.fit_structure(train, train_feature_vals)

    return spn

def test_cluster_rows_GMM():
    #
    # random generator
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)

    #
    # loading a very simple dataset
    dataset_name = 'nltcs'
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    print('Loaded dataset', dataset_name)

    #
    # specifying parameters
    n_components = 10
    # 'spherical', 'tied', 'diag', 'full'. Defaults to 'diag'.
    cov_type = 'diag'
    n_iters = 1000
    n_restarts = 10
    kwargs = {}
    kwargs['covariance_type'] = cov_type

    print('Clustering with GMM')
    clustering = algo.learnspn.cluster_rows(train,
                                            n_clusters=n_components,
                                            cluster_method='GMM',
                                            n_iters=n_iters,
                                            n_restarts=n_restarts,
                                            rand_gen=rand_gen,
                                            sklearn_args=kwargs)

    print('Clustering')
    print('number of clusters: ', len(clustering))
    assert len(clustering) == n_components

    tot_instances = 0
    for cluster in clustering:
        tot_instances += len(cluster)
        print('cluster length:', len(cluster))
    assert tot_instances == train.shape[0]

def aatest_linked_nltcs_naive_spn():
    # load the dataset
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('tmovie')

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Building naive factorization')
    spn = SpnFactory.linked_naive_factorization(features,
                                                freqs,
                                                alpha=0)

    print('Evaluating on test')
    lls = []
    for i in range(test.shape[0]):
        print('instance', i)
        lls.append(spn.eval(test[i, :]))
    print('Mean lls')
    avg_ll = numpy.mean(lls)
    print(avg_ll)

def test_random_spn_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    n_layers = 1
    n_max_children = 2000
    n_scope_children = 2000
    max_scope_split = -1
    merge_prob = 0.5

    seed = 1337
    rand_gen = random.Random(seed)

    print('Build random spn')
    spn = SpnFactory.linked_random_spn_top_down(features,
                                                n_layers,
                                                n_max_children,
                                                n_scope_children,
                                                max_scope_split,
                                                merge_prob,
                                                rand_gen=rand_gen)
    assert spn.is_valid()

    print('Stats\n')
    print(spn.stats())

    np_rand_gen = numpy.random.RandomState(seed)
    spn.fit_sgd(train, valid, test,
                learning_rate=0.2,
                n_epochs=10,
                batch_size=1,
                grad_method=1,
                validation_frequency=100,
                rand_gen=np_rand_gen,
                hard=False)

def test_g_test_on_dataset():
    #
    # loading the precomputed g tests from gens code (see the spn++ repo)
    dataset_name = 'book'  # msnbc
    g_factor = 1.0  # locked value for this kind of tests
    i_vals = read_ivals_from_file(dataset_name, verbose=False)

    #
    # loading the dataset (using only the training portion)
    print('Loading dataset', dataset_name)
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
    n_features = train.shape[1]
    n_instances = train.shape[0]
    instance_ids = [i for i in range(n_instances)]
    feature_vals = [2 for i in range(n_features)]  # they are all binary

    g_test_start_t = perf_counter()
    #
    # computing the g_tests
    for i in range(n_features):
        for j in range(i, n_features):
            single_test_start_t = perf_counter()
            independent = algo.learnspn.g_test(i, j,
                                               instance_ids,
                                               train,
                                               feature_vals,
                                               g_factor)
            single_test_end_t = perf_counter()
            real_independent = i_vals[i][j - i]
            # print((independent, real_independent))
            #
            # checking for correspondence
            assert int(real_independent) == independent
            print('processed features', i, j, 'in',
                  (single_test_end_t - single_test_start_t), 'secs')
        # print('')
    g_test_end_t = perf_counter()
    print('Elapsed time', (g_test_end_t - g_test_start_t), 'secs')

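#
# a sketch of the statistic behind the test above, assuming binary
# features: G = 2 * sum_xy O_xy * ln(O_xy / E_xy) over the 2x2
# contingency table of features i and j, with empty cells contributing
# nothing (g_statistic is a hypothetical helper, not the library's
# g_test)
def g_statistic(data, i, j):
    n = data.shape[0]
    g_val = 0.0
    for x in (0, 1):
        for y in (0, 1):
            observed = numpy.sum((data[:, i] == x) & (data[:, j] == y))
            expected = (numpy.sum(data[:, i] == x) *
                        numpy.sum(data[:, j] == y)) / n
            if observed > 0:
                g_val += 2.0 * observed * numpy.log(observed / expected)
    return g_val
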
def test_greedy_feature_split():
    # on synthetic data first
    g_factor = 2
    # the original snippet referenced undefined data/feature_vals/
    # rand_gen; a minimal synthetic binary matrix is assumed here
    rand_gen = numpy.random.RandomState(1337)
    data = rand_gen.binomial(1, 0.5, size=(10, 5))
    feature_vals = [2 for i in range(data.shape[1])]

    s_instance_ids = numpy.array([0, 2, 8, 4, 3])
    s_feature_ids = numpy.array([2, 1, 4, 0, 3])
    data_slice = DataSlice(s_instance_ids, s_feature_ids)

    feat_comp_1, feat_comp_2 = algo.learnspn.greedy_feature_split(
        data, data_slice, feature_vals, g_factor, rand_gen)
    print(feat_comp_1, feat_comp_2)
    assert set(
        list(s_feature_ids)) == set(list(feat_comp_1) + list(feat_comp_2))

    #
    # loading the dataset (using only the training portion)
    dataset_name = 'nltcs'
    print('Loading dataset', dataset_name)
    train, valid, test = dataset.load_train_val_test_csvs(dataset_name)

def test_sampling():
    # loading nltcs
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    # checking for their shape
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]
    n_valid_instances = valid.shape[0]
    nltcs_train = 16181
    nltcs_valid = 2157
    nltcs_test = 3236
    print('Training set with {0} instances\n'.format(n_instances) +
          'Validation set with {0} instances\n'.format(n_valid_instances) +
          'Test set with {0} instances'.format(n_test_instances))
    assert n_instances == nltcs_train
    assert n_valid_instances == nltcs_valid
    assert n_test_instances == nltcs_test

    # random sampling
    perc = 0.1
    sample_train, sample_valid, sample_test = \
        dataset.sample_sets((train, valid, test), perc)
    n_s_instances = sample_train.shape[0]
    n_s_valid_instances = sample_valid.shape[0]
    n_s_test_instances = sample_test.shape[0]
    print('Sampled training set with {0} instances\n'
          .format(n_s_instances) +
          'Sampled validation set with {0} instances\n'
          .format(n_s_valid_instances) +
          'Sampled test set with {0} instances'
          .format(n_s_test_instances))
    assert n_s_instances == int(nltcs_train * perc)
    assert n_s_valid_instances == int(nltcs_valid * perc)
    assert n_s_test_instances == int(nltcs_test * perc)

def test_sgd():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')
    n_instances = train.shape[0]
    n_test_instances = test.shape[0]

    # estimating the frequencies for the features
    print('Estimating features')
    freqs, features = dataset.data_2_freqs(train)

    print('Build kernel density estimation')
    spn = SpnFactory.linked_kernel_density_estimation(n_instances,
                                                      features)
    print('Created SPN with\n' + spn.stats())

    print('Starting SGD')
    spn.fit_sgd(train, valid, test,
                learning_rate=0.1,
                n_epochs=20,
                batch_size=1,
                hard=False)

def test_greedy_split_features():
    print('Loading datasets')
    train, valid, test = dataset.load_train_val_test_csvs('nltcs')

    learner = SpectralStructureLearner()
    k = 2
    ids = [i for i in range(train.shape[1])]
    g_factor = 9
    seed = 1337
    rand_gen = numpy.random.RandomState(seed)
    data_slice = train[:100, :]

    # splitting on the features
    clustering = learner.greedy_split_features(data_slice.T,
                                               ids,
                                               g_factor,
                                               rand_gen)
    print(clustering)

    labels, clustering, valid = \
        learner.spectral_clustering(data_slice.T, ids, k,
                                    affinity_metric='gtest',
                                    validity_check=True,
                                    threshold=0.8,
                                    rand_gen=rand_gen)
    print(clustering)

dataset_splits = None
if dataset_name == 'bmnist':
    logging.info('Loading bmnist from pickle')
    dataset_splits = load_mnist_pickle(BMNIST_PATH)
elif dataset_name == 'caltech101':
    logging.info('Loading caltech101-silhouettes from pickle')
    dataset_splits = load_mnist_pickle(CALTECH101_PATH)
elif dataset_name == '20newsgroups':
    logging.info('Loading 20newsgroups from pickle')
    dataset_splits = load_20newsgroups_pickle(NEWSGROUPS_PATH)
elif dataset_name == 'ocr_letters':
    logging.info('Loading ocr letters from pickle')
    dataset_splits = load_20newsgroups_pickle(OCR_LETTERS_PATH)
else:
    dataset_splits = dataset.load_train_val_test_csvs(dataset_name,
                                                      type=args.dtype,
                                                      suffixes=args.splits)

for i, split in enumerate(dataset_splits):
    logging.info('\tsplit {}, shape {}, labels {}'.format(
        i, split[0].shape, split[1].shape))

#
# loading the learned representations
#
logging.info('Loading repr splits from {}'.format(args.repr_data))
repr_splits = None
pickle_split_path = os.path.join(
    args.repr_dir, '{}.{}'.format(args.repr_data, PICKLE_SPLIT_EXT))

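#
# a minimal sketch of loading the repr splits, assuming the learned
# representations were serialized with pickle to pickle_split_path as a
# (train, valid, test) tuple, mirroring the model pickle loading used in
# the evaluation script below
with open(pickle_split_path, 'rb') as split_file:
    repr_splits = pickle.load(split_file)

#
# Opening the file for test prediction
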
os.makedirs(args.output, exist_ok=True)

#
# setting verbosity level
if args.verbose == 1:
    logging.basicConfig(level=logging.INFO)
elif args.verbose == 2:
    logging.basicConfig(level=logging.DEBUG)

logging.info("Starting with arguments:\n%s", args)

#
# loading dataset splits
logging.info('Loading datasets: %s', args.dataset)
dataset_name = args.dataset
train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
logging.info('train shape: {}\nvalid shape: {}\ntest shape: {}'.format(
    train.shape, valid.shape, test.shape))

n_instances = train.shape[0]
n_features = train.shape[1]
assert valid.shape[1] == n_features
assert test.shape[1] == n_features

feature_file_path = '{}.{}.{}'.format(args.suffix,
                                      dataset_name,
                                      FEATURE_FILE_EXT)
feature_file_path = os.path.join(args.output, feature_file_path)
logging.info('Saving features to {}'.format(feature_file_path))

if args.rand_marg:
    logging.info('Rand mask feature generation')

model_path = args.model
logging.info('\nLoading spn model from: {}'.format(model_path))

spn = None
with open(model_path, 'rb') as model_file:
    load_start_t = perf_counter()
    spn = pickle.load(model_file)
    load_end_t = perf_counter()
    logging.info('done in {}'.format(load_end_t - load_start_t))

#
# loading dataset
dataset_name = args.dataset
logging.info('Loading dataset {}'.format(dataset_name))
train, valid, test = dataset.load_train_val_test_csvs(dataset_name,
                                                      path='data/')

logging.info('\nEvaluating on training set')
eval_s_t = perf_counter()
train_preds = evaluate_on_dataset(spn, train)
eval_e_t = perf_counter()
train_avg_ll = numpy.mean(train_preds)
logging.info('\t{}'.format(train_avg_ll))
logging.info('\tdone in {}'.format(eval_e_t - eval_s_t))

logging.info('Evaluating on validation set')
eval_s_t = perf_counter()
valid_preds = evaluate_on_dataset(spn, valid)
eval_e_t = perf_counter()
valid_avg_ll = numpy.mean(valid_preds)
logging.info('\t{}'.format(valid_avg_ll))

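#
# evaluate_on_dataset is used by the script above but not defined in
# this fragment; a minimal sketch, assuming spn.eval(instance) returns a
# tuple whose first element is the instance log-likelihood, as in the
# tests above
def evaluate_on_dataset(spn, data):
    return numpy.array([spn.eval(instance)[0] for instance in data])
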
else:
    sklearn_args = {}

logging.info(sklearn_args)

# initing the random generators
seed = args.seed
MAX_RAND_SEED = 99999999  # sys.maxsize
rand_gen = random.Random(seed)
numpy_rand_gen = numpy.random.RandomState(seed)

#
# elaborating the dataset
#
logging.info('Loading datasets: %s', args.dataset)
(dataset_name,) = args.dataset
train, valid, test = dataset.load_train_val_test_csvs(dataset_name)
n_instances = train.shape[0]
n_test_instances = test.shape[0]

#
# estimating the frequencies for the features
logging.info('Estimating features on training set...')
freqs, features = dataset.data_2_freqs(train)

#
# Opening the file for test prediction
#
logging.info('Opening log file...')
date_string = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
out_path = args.output + dataset_name + '_' + date_string
out_log_path = out_path + '/exp.log'

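#
# a minimal sketch of how the paths built above could be used, mirroring
# the os.makedirs(..., exist_ok=True) pattern from the other scripts;
# the 'w' mode and immediate open are assumptions
os.makedirs(out_path, exist_ok=True)
out_log = open(out_log_path, 'w')
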
import dataset

train, valid, test = dataset.load_train_val_test_csvs("bnetflix")
freqs, features = dataset.data_2_freqs(train)
print(train.shape)
print(len(features))