예제 #1
0
# Display the figure built by the preceding cell (not visible in this excerpt).
plt.show()

#%% Fit new model      (Set optimal K either directly or from cross-validation)
# Take the K whose entry in error_val is smallest.
idx = np.argmin(error_val)
K_opt = Ks[idx]
# Uncomment the next line to override the selected K by hand.
# K_opt = 30

# Any model_name other than 'CP' falls through to the tensor-train model.
if model_name == 'CP':
    model = m.CPGaussian(K_opt, M)
else:
    model = m.TensorTrainGaussian(K_opt, M)

epochs = 1000

# Split into batches
ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)
# NOTE(review): the *_small datasets are built but not used in the visible
# part of this cell — confirm whether later code needs them.
ds_train_small = d.to_tf_dataset(X_train_small, batch_size=batch_size)
ds_val = d.to_tf_dataset(X_val, batch_size=batch_size)
ds_val_small = d.to_tf_dataset(X_val_small, batch_size=batch_size)

# Train and plot
# fit_val returns two sequences, plotted below as the train and validation curves.
losses_train, losses_val = model.fit_val(ds_train,
                                         ds_val,
                                         epochs,
                                         optimizer,
                                         N_init=N_init)

f, ax = plt.subplots(figsize=(12, 5))
ax.plot(losses_train)
ax.plot(losses_val)
# NOTE(review): both the train and validation curves are drawn, but the title
# only mentions training and no legend is added in the visible code.
ax.set_title('Training loss')
예제 #2
0
    K_CP = K**2
    target = int(K + M * K * K + 6 * 2 * K * K + (K * K * dif_cats) + K * K)
    curr = int(K_CP + 6 * 2 * K_CP + K_CP * dif_cats + K_CP)
    while curr < target:
        K_CP += 1
        curr = int(K_CP + 6 * 2 * K_CP + K_CP * dif_cats + K_CP)
    Ks_CP.append(K_CP)
# One zero-initialised slot per entry of Ks; presumably filled in by later
# code that is not visible in this excerpt.
N_CP = [0] * len(Ks)
N_repeats = 3
EPOCHS = 500
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Train on a random subset of rows drawn without replacement; this requires
# train_data to have at least subset_size rows. The trailing comment shows the
# value to use for the full training set.
subset_size = 1000  #train_data.shape[0]
train_subset = train_data[
    np.random.choice(train_data.shape[0], size=subset_size, replace=False), :]
# Both splits are batched with a fixed batch size of 500.
train_batched = d.to_tf_dataset(train_subset, batch_size=500)
val_batched = d.to_tf_dataset(val_data, batch_size=500)

dists = [
    tfd.Normal,  #Age
    tfd.Categorical,  #Workclass
    tfd.Normal,  #Final Weight
    tfd.Categorical,  #Education
    tfd.Normal,  #Education-Num
    tfd.Categorical,  #Marital-Status
    tfd.Categorical,  #Occupation
    tfd.Categorical,  #Relationship
    tfd.Categorical,  #Race
    tfd.Bernoulli,  #Sex
    tfd.Normal,  #Capital-Gain
    tfd.Normal,  #Capital-Loss
예제 #3
0
# Finish the second subplot (x-label, title, legend, grid) and show the figure.
ax[1].set_xlabel('K')
ax[1].set_title('Selecting K for ' + model_name + ' model')
ax[1].legend(['Train', 'Validation'])
ax[1].grid('on')
plt.show()

#%% Train for optimal K
# NOTE(review): idx is not assigned in the visible part of this script —
# presumably the index of the best K from the selection step above; confirm.
K_opt = Ks[idx]

# Any model_name other than 'CP' falls through to the tensor-train model.
if model_name == 'CP':
    model = m.CPGaussian(K_opt, M)
else:
    model = m.TensorTrainGaussian(K_opt, M)

# Split into batches
ds = d.to_tf_dataset(X_train, batch_size=batch_size)
ds_small = d.to_tf_dataset(X_train_small, batch_size=batch_size)

# The actual fit is currently disabled, so `model` is left as constructed.
# losses = model.fit(ds_small,epochs,optimizer,N_init=N_init)

#%% Get test error
# The whole test-error computation below is disabled. As written it would
# evaluate the model batch by batch and report the negated mean of the outputs.

# ds_test = d.to_tf_dataset(X_test, batch_size=batch_size)
# errors_test = np.zeros(X_test.shape[0],dtype=np.float32)

# for j,x in enumerate(ds_test):
#   errors_test[j*batch_size:j*batch_size+x.shape[0]] = model(x).numpy()

# test_loss = -tf.reduce_mean(errors_test).numpy()
# print(f'Test error : {test_loss}')
예제 #4
0
def CV_holdout(X_train, X_val, Ks=np.arange(4, 8, 2), model_name='TT',
               epochs=200, optimizer=None, batch_size=100, N_init=5):
    """Hold-out validation to find the optimal K.

    For every candidate K a fresh model is fitted on ``X_train`` while its
    loss on ``X_val`` is tracked via ``model.fit_val``.

    Input
        X_train     :   Training data; ``X_train.shape[1]`` is used as the
                        data dimension M.
        X_val       :   Validation data.
        Ks          :   Array or int of K values for the model
        model_name  :   Name of model to test ('TT' or 'CP'). 'GMM' is not
                        implemented by this function.
        epochs      :   How many epochs to use for fitting of the model
        optimizer   :   A tf.keras.optimizers to use for fitting the model.
                        Defaults to Adam with learning rate 1e-3.
        batch_size  :   The desired batch size for both datasets
        N_init      :   How many initializations the model should do

    Return CV_dict with values
        error_train            :   Last training-loss value for each K
        error_val              :   Last validation-loss value for each K
        train_learning_curves  :   Training-loss curve for each K
        val_learning_curves    :   Validation-loss curve for each K

    Raises
        ValueError  :   If ``model_name`` is not 'TT' or 'CP'.
    """
    # Fail fast, before any dataset is built or any model is trained.
    if model_name not in ('TT', 'CP'):
        raise ValueError('Provided model_name not valid')

    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    if np.isscalar(Ks):  # Allow a single K to be passed as a plain number
        Ks = (Ks,)

    M = X_train.shape[1]  # Dimension of data

    # Create the batched TF datasets once; they are reused for every K.
    ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)
    ds_val = d.to_tf_dataset(X_val, batch_size=batch_size)

    # model_name is already validated, so this choice is loop-invariant.
    model_cls = m.TensorTrainGaussian if model_name == 'TT' else m.CPGaussian

    # Per-K results
    error_train = np.zeros(len(Ks))
    error_val = np.zeros(len(Ks))
    train_learning_curves = []
    val_learning_curves = []

    # NOTE(review): the same optimizer instance is shared by all fits —
    # confirm that fit_val tolerates an optimizer reused across models.
    for i, K in tqdm(enumerate(Ks), desc='Fitting for K', total=len(Ks),
                     position=0, leave=True):
        model = model_cls(K, M)
        train_loss, val_loss = model.fit_val(ds_train, ds_val, epochs,
                                             optimizer, mute=True,
                                             N_init=N_init)

        train_learning_curves.append(train_loss)
        val_learning_curves.append(val_loss)
        # The final point of each curve is taken as the error for this K.
        error_train[i] = train_loss[-1]
        error_val[i] = val_loss[-1]

    CV_dict = {
        'error_train': error_train,
        'error_val': error_val,
        'train_learning_curves': train_learning_curves,
        'val_learning_curves': val_learning_curves,
    }

    return CV_dict
예제 #5
0
def CV_1_fold(data, Ks=np.arange(4, 8, 2), model_name='TT',
              CV_splits=5, epochs=200, optimizer=None, batch_size=100):
    """K-fold cross-validation over the candidate K values.

    The data is split into ``CV_splits`` shuffled folds. For each fold and
    each K a fresh model is fitted on the training part and scored on the
    held-out part; the errors are then averaged over the folds.

    Input
        data        :   The data to fit and test on. The method will split this
                        into training and testing sets itself.
        Ks          :   Array or int of K values for the model
        model_name  :   Name of model to test ('TT', 'CP', 'GMM')
        CV_splits   :   Number of cross-validation folds
        epochs      :   How many epochs to use for fitting of the model
        optimizer   :   A tf.keras.optimizers to use for fitting the model.
                        Defaults to Adam with learning rate 1e-3.
        batch_size  :   The desired batch size for the training data

    Return
        err_tr      :   Error on the training set, averaged over the folds
                        (one value per K)
        err_tst     :   Error on the testing set, averaged over the folds
                        (one value per K)

    Raises
        ValueError  :   If ``model_name`` is not 'TT', 'CP' or 'GMM'.
    """
    # Fail fast, before any data is split or any model is trained.
    if model_name not in ('TT', 'CP', 'GMM'):
        raise ValueError('Provided model_name not valid')

    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    if np.isscalar(Ks):  # Allow a single K to be passed as a plain number
        Ks = (Ks,)

    M = data.shape[1]  # Dimension of data

    # Shuffled K-fold splitter
    CV = KFold(n_splits=CV_splits, shuffle=True)

    # One row per fold, one column per K
    error_train = np.zeros((CV_splits, len(Ks)))
    error_test = np.zeros((CV_splits, len(Ks)))

    for i, (train_index, test_index) in enumerate(CV.split(data)):
        print(f'Cross-validation fold {i+1}/{CV_splits}')

        # data_split is defined elsewhere; per the original comment it splits
        # and normalizes the data — confirm against its definition.
        X_train, X_test = data_split(data, train_index, test_index, batch_size)

        # Create TF training dataset
        ds_train = d.to_tf_dataset(X_train, batch_size=batch_size)

        for j, K in enumerate(Ks):
            # Fit a fresh model to the training data of this fold
            if model_name == 'TT':
                model = m.TensorTrainGaussian(K, M)
                train_loss = model.fit(ds_train, epochs, optimizer, mute=True)
                test_loss = model(X_test)
            elif model_name == 'CP':
                model = m.CPGaussian(K, M)
                train_loss = model.fit(ds_train, epochs, optimizer,
                                       mute=True, mu_init='random')
                test_loss = model(X_test)
            else:  # 'GMM' (model_name was validated above)
                model = GaussianMixture(n_components=K,
                                        covariance_type='full',
                                        n_init=5, init_params='random')
                model.fit(X_train)
                # Wrapped in a list so that train_loss[-1] works as for the
                # TF models, whose fit returns a loss curve.
                train_loss = [-model.score(X_train)]
                test_loss = model.score_samples(X_test)

            error_train[i, j] = train_loss[-1]
            # The per-sample scores are averaged and negated to get an error.
            error_test[i, j] = -tf.reduce_mean(test_loss).numpy()

    # Average across folds once, after every fold has been filled in.
    err_tr = np.mean(error_train, axis=0)
    err_tst = np.mean(error_test, axis=0)
    return err_tr, err_tst
예제 #6
0
# Number of points to request from the toy data set.
N = 2000
data_names = d.get_toy_names()
# Pick one toy data set by its position in the list of names.
name = data_names[7]

data = d.get_ffjord_data(name, batch_size=N)

# Inspect the data
# Scatter the first two columns against each other.
f, ax = plt.subplots(figsize=(5, 5))
ax.plot(data[:, 0], data[:, 1], '.')
ax.axis('equal')
ax.set_title(name + f' with {N} points')
plt.show()

# Split into batches
batch_size = 200
dataset = d.to_tf_dataset(data, batch_size=batch_size)

#%% Define model and training parameters
K = 8  # Number of components
M = data.shape[1]  # Number of dimensions in data
model = m.CPGaussian(K, M)
# model = m.GMM(K,M)

EPOCHS = 200
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

#%% Train model
# NOTE(review): 'kmeans' is passed positionally — presumably the
# initialization scheme; confirm against the signature of CPGaussian.fit.
losses = model.fit(dataset, EPOCHS, optimizer, 'kmeans')
# losses = model.fit(data,10,'kmeans')

# New figure; the plotting calls that use it are not visible in this excerpt.
f, ax = plt.subplots()