plt.show()

#%% Fit new model (optimal K is set either directly or from cross-validation)
# Pick the K with the lowest validation error.
idx = np.argmin(error_val)
K_opt = Ks[idx]
# K_opt = 30  # manual override of the cross-validated choice
model = (m.CPGaussian if model_name == 'CP' else m.TensorTrainGaussian)(K_opt, M)
epochs = 1000

# Batched datasets: full and "small" variants of the training/validation sets
ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)
ds_train_small = d.to_tf_dataset(X_train_small, batch_size=batch_size)
ds_val = d.to_tf_dataset(X_val, batch_size=batch_size)
ds_val_small = d.to_tf_dataset(X_val_small, batch_size=batch_size)

# Fit on the full training set while tracking the validation loss,
# then draw both learning curves on the same axes.
losses_train, losses_val = model.fit_val(
    ds_train, ds_val, epochs, optimizer, N_init=N_init
)
f, ax = plt.subplots(figsize=(12, 5))
ax.plot(losses_train)
ax.plot(losses_val)
ax.set_title('Training loss')
K_CP = K**2 target = int(K + M * K * K + 6 * 2 * K * K + (K * K * dif_cats) + K * K) curr = int(K_CP + 6 * 2 * K_CP + K_CP * dif_cats + K_CP) while curr < target: K_CP += 1 curr = int(K_CP + 6 * 2 * K_CP + K_CP * dif_cats + K_CP) Ks_CP.append(K_CP) N_CP = [0] * len(Ks) N_repeats = 3 EPOCHS = 500 optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3) subset_size = 1000 #train_data.shape[0] train_subset = train_data[ np.random.choice(train_data.shape[0], size=subset_size, replace=False), :] train_batched = d.to_tf_dataset(train_subset, batch_size=500) val_batched = d.to_tf_dataset(val_data, batch_size=500) dists = [ tfd.Normal, #Age tfd.Categorical, #Workclass tfd.Normal, #Final Weight tfd.Categorical, #Education tfd.Normal, #Education-Num tfd.Categorical, #Marital-Status tfd.Categorical, #Occupation tfd.Categorical, #Relationship tfd.Categorical, #Race tfd.Bernoulli, #Sex tfd.Normal, #Capital-Gain tfd.Normal, #Capital-Loss
# Finish the second panel of the K-selection figure
ax[1].set_xlabel('K')
ax[1].set_title('Selecting K for ' + model_name + ' model')
ax[1].legend(['Train', 'Validation'])
ax[1].grid('on')
plt.show()

#%% Train for optimal K
K_opt = Ks[idx]
# 'CP' selects the CP model; any other name falls back to the tensor-train model
model = (m.CPGaussian if model_name == 'CP' else m.TensorTrainGaussian)(K_opt, M)

# Batched versions of the full and the small training set
ds = d.to_tf_dataset(X_train, batch_size=batch_size)
ds_small = d.to_tf_dataset(X_train_small, batch_size=batch_size)

# (disabled) actual fit on the small training set:
# losses = model.fit(ds_small,epochs,optimizer,N_init=N_init)

#%% Get test error
# (disabled) batch-wise evaluation of the fitted model on the test set:
# ds_test = d.to_tf_dataset(X_test, batch_size=batch_size)
# errors_test = np.zeros(X_test.shape[0],dtype=np.float32)
# for j,x in enumerate(ds_test):
#     errors_test[j*batch_size:j*batch_size+x.shape[0]] = model(x).numpy()
# test_loss = -tf.reduce_mean(errors_test).numpy()
# print(f'Test error : {test_loss}')
def CV_holdout(X_train, X_val, Ks=np.arange(4, 8, 2), model_name='TT',
               epochs=200, optimizer=None, batch_size=100, N_init=5):
    """ Holdout cross-validation to find the optimal K

    One model is fitted per value in Ks on the training data while the loss
    on the validation data is tracked; the final loss values are reported.

    Input
        X_train     : Data the models are fitted on (2D, features along axis 1)
        X_val       : Held-out data the models are validated on
        Ks          : Array or int of K values for the model
        model_name  : Name of model to test ('TT' or 'CP')
        epochs      : How many epochs to use for fitting of the model
        optimizer   : A tf.keras.optimizers to use for fitting the model
                      (Adam with learning rate 1e-3 if None)
        batch_size  : The desired batch size for the datasets
        N_init      : How many initializations the model should do
                      (forwarded to the model's fit_val)
    Return
        CV_dict with values
            error_train           : Final training loss for each K
            error_val             : Final validation loss for each K
            train_learning_curves : Training learning curve for each K
            val_learning_curves   : Validation learning curve for each K
    Raises
        ValueError if model_name is not 'TT' or 'CP'
    """
    # Fail fast, before any dataset is built or any model is trained.
    if model_name not in ('TT', 'CP'):
        raise ValueError('Provided model_name not valid')

    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    if np.isscalar(Ks):
        # Allow a single K to be passed as a plain number
        Ks = (Ks,)
    # Always forwarded to fit_val (presumably silences its own progress
    # output in favour of the tqdm bar below - confirm against the model).
    mute = True

    M = X_train.shape[1]  # Dimension of data

    # Create TF training and validation datasets
    ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)
    ds_val = d.to_tf_dataset(X_val, batch_size=batch_size)

    # Initialize error arrays and learning-curve containers
    error_train = np.zeros(len(Ks))
    error_val = np.zeros(len(Ks))
    train_learning_curves = []
    val_learning_curves = []

    # NOTE(review): the same optimizer instance is reused for every K;
    # confirm that fit_val copes with an optimizer used on an earlier model.
    for i, K in tqdm(enumerate(Ks), desc='Fitting for K', total=len(Ks),
                     position=0, leave=True):
        # model_name was validated above, so it is either 'TT' or 'CP'
        if model_name == 'TT':
            model = m.TensorTrainGaussian(K, M)
        else:
            model = m.CPGaussian(K, M)
        train_loss, val_loss = model.fit_val(ds_train, ds_val, epochs,
                                             optimizer, mute=mute,
                                             N_init=N_init)

        train_learning_curves.append(train_loss)
        val_learning_curves.append(val_loss)
        # The last entry of each curve is the loss after the final epoch
        error_train[i] = train_loss[-1]
        error_val[i] = val_loss[-1]

    CV_dict = {
        'error_train': error_train,
        'error_val': error_val,
        'train_learning_curves': train_learning_curves,
        'val_learning_curves': val_learning_curves,
    }
    return CV_dict
def CV_1_fold(data, Ks=np.arange(4, 8, 2), model_name='TT',
              CV_splits=5, epochs=200, optimizer=None, batch_size=100):
    """ K-fold cross-validation over a set of K values

    The data is shuffled and split into CV_splits folds. For every fold and
    every K a fresh model is fitted on the training part and evaluated on the
    held-out part; the errors are then averaged over the folds.

    Input
        data        : The data to fit and test on. The method will split this
                      into training and testing sets itself.
        Ks          : Array or int of K values for the model
        model_name  : Name of model to test ('TT', 'CP', 'GMM')
        CV_splits   : Number of cross-validation folds
        epochs      : How many epochs to use for fitting of the model
        optimizer   : A tf.keras.optimizers to use for fitting the model
                      (Adam with learning rate 1e-3 if None; unused for 'GMM')
        batch_size  : The desired batch size for the training data
    Return
        err_tr      : Error on the training set for each K (mean over folds)
        err_tst     : Error on the testing set for each K (mean over folds)
    Raises
        ValueError if model_name is not 'TT', 'CP' or 'GMM'
    """
    # Fail fast, before any splitting or training is started.
    if model_name not in ('TT', 'CP', 'GMM'):
        raise ValueError('Provided model_name not valid')

    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    if np.isscalar(Ks):
        # Allow a single K to be passed as a plain number
        Ks = (Ks,)

    M = data.shape[1]  # Dimension of data

    # Split data and shuffle
    CV = KFold(n_splits=CV_splits, shuffle=True)

    # Initialize error arrays: one row per fold, one column per K
    error_train = np.zeros((CV_splits, len(Ks)))
    error_test = np.zeros((CV_splits, len(Ks)))

    for i, (train_index, test_index) in enumerate(CV.split(data)):
        print(f'Cross-validation fold {i+1}/{CV_splits}')

        # data_split is defined elsewhere; it yields the train/test arrays
        # for this fold (the original comment says it also normalizes them).
        X_train, X_test = data_split(data, train_index, test_index, batch_size)

        # Create TF training dataset
        ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)

        for j, K in enumerate(Ks):
            # Fit model to training data
            if model_name == 'GMM':
                model = GaussianMixture(n_components=K,
                                        covariance_type='full',
                                        n_init=5,
                                        init_params='random')
                model.fit(X_train)
                # Wrapped in a list so that train_loss[-1] works below
                train_loss = [-model.score(X_train)]
                test_loss = model.score_samples(X_test)
            else:
                if model_name == 'TT':
                    model = m.TensorTrainGaussian(K, M)
                    train_loss = model.fit(ds_train, epochs, optimizer,
                                           mute=True)
                else:  # 'CP'
                    model = m.CPGaussian(K, M)
                    train_loss = model.fit(ds_train, epochs, optimizer,
                                           mute=True, mu_init='random')
                test_loss = model(X_test)

            error_train[i, j] = train_loss[-1]
            # test_loss is negated after averaging, so it is presumably a
            # per-sample log-likelihood - confirm against the model classes.
            error_test[i, j] = -tf.reduce_mean(test_loss).numpy()

    # Get average error across splits
    err_tr = np.mean(error_train, axis=0)  # mean training error over the CV folds
    err_tst = np.mean(error_test, axis=0)  # mean test error over the CV folds

    return err_tr, err_tst
# Load a 2D toy data set, inspect it, and fit a CP model to it.
N = 2000  # Number of points requested from the toy data generator
data_names = d.get_toy_names()
name = data_names[7]  # which data set index 7 refers to is defined by d.get_toy_names()
data = d.get_ffjord_data(name, batch_size=N)

# Inspect the data: scatter the first two columns against each other
f, ax = plt.subplots(figsize=(5, 5))
ax.plot(data[:, 0], data[:, 1], '.')
ax.axis('equal')
ax.set_title(name + f' with {N} points')
plt.show()

# Split into batches
batch_size = 200
dataset = d.to_tf_dataset(data, batch_size=batch_size)

#%% Define model and training parameters
K = 8  # Number of components
M = data.shape[1]  # Number of dimensions in data

model = m.CPGaussian(K, M)
# Alternative model (disabled):
# model = m.GMM(K,M)

EPOCHS = 200
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

#%% Train model
# The fourth positional argument is presumably the initialization method
# - confirm against CPGaussian.fit.
losses = model.fit(dataset, EPOCHS, optimizer, 'kmeans')
# Call used with the alternative model above (disabled):
# losses = model.fit(data,10,'kmeans')

# New figure for the plotting code that follows this cell
f, ax = plt.subplots()