Example #1
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import data_processing
from embeddings.random_walk import random_walk
from embeddings.word2vec import Skip_Gram  # module path assumed from Example #3
from lasso import Lasso  # assumed location of the project's custom Lasso class

def main():
    dimension = 32
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, dimension, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)

    # encode each subject's matrix, then flatten it into one feature vector
    embedded_train_matrix = np.zeros((len(X_train), 268 * dimension))
    for i in range(len(X_train)):
        embedded_train_matrix[i] = skipgram.encode(X_train[i]).flatten()

    embedded_test_matrix = np.zeros((len(X_test), 268 * dimension))
    for i in range(len(X_test)):
        embedded_test_matrix[i] = skipgram.encode(X_test[i]).flatten()

    lasso = Lasso(100, .01)

    lasso.train_coordinate_descent(embedded_train_matrix, y_train)

    predicted = lasso.predict(embedded_test_matrix)
    print(mean_squared_error(y_test, predicted))
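
Two project-local helpers used above are worth sketching. First, random_walk (imported from embeddings/random_walk.py in Example #3): a minimal sketch, assuming the walk steps to the next node with probability proportional to the absolute edge weight out of the current node; the project's actual implementation may differ.

import numpy as np

def random_walk(matrix, steps=1000, start=None):
    # hypothetical sketch: weighted walk over a square connectivity matrix
    n = matrix.shape[0]
    pos = np.random.randint(n) if start is None else start
    walk = [pos]
    for _ in range(steps - 1):
        weights = np.abs(matrix[pos])            # outgoing edge weights
        probs = weights / weights.sum()          # normalize into probabilities
        pos = int(np.random.choice(n, p=probs))  # draw the next node
        walk.append(pos)
    return walk

Second, Lasso.train_coordinate_descent. The class itself is not shown; a sketch of the underlying algorithm, assuming Lasso(100, .01) means 100 iterations with regularization strength 0.01, is the classic soft-thresholding update:

def soft_threshold(rho, alpha):
    # shrink rho toward zero by alpha; exactly zero inside [-alpha, alpha]
    if rho < -alpha:
        return rho + alpha
    if rho > alpha:
        return rho - alpha
    return 0.0

def lasso_coordinate_descent(X, y, alpha=0.01, n_iters=100):
    # minimize 0.5 * ||y - Xw||^2 + alpha * ||w||_1, one coordinate at a time
    n, d = X.shape
    w = np.zeros(d)
    z = (X ** 2).sum(axis=0)                    # per-feature squared norms
    for _ in range(n_iters):
        for j in range(d):
            residual = y - X @ w + w[j] * X[:, j]  # leave coordinate j out
            rho = X[:, j] @ residual
            w[j] = soft_threshold(rho, alpha) / z[j] if z[j] else 0.0
    return w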
Example #2
def main():
    # imports as in Example #1, plus AutoEncoder, CBOW, and the
    # embedding-visualization helpers
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    #X = data_processing.adjacency_matrix(X)

    avg_matrix = X.mean(axis=0)
    print(avg_matrix.shape)

    model = AutoEncoder(X.shape[-1], 64, activation='relu')
    model.train(X, epochs=200, learning_rate=0.001, loss='mse')
    #generate_embedding_vis(avg_matrix, model.encode(avg_matrix), embedding_name='Neural Autoencoder')

    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, 64, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)
    #generate_embedding_vis(avg_matrix, skipgram.encode(avg_matrix), embedding_name='SkipGram')

    cbow = CBOW(268, 64, 2, 0.1)
    cbow.train_from_feature_seq(seq, epochs=200)
    #generate_embedding_vis(avg_matrix, cbow.encode(avg_matrix), embedding_name='CBOW')

    distances = [[avg_matrix, model.encode(avg_matrix)],
                 [skipgram.encode(avg_matrix), cbow.encode(avg_matrix)]]
    names = [['Original Distances', 'Autoencoder Distances'], ['SkipGram Distances', 'CBOW Distances']]
    generate_embedding_vis_array(distances, names)
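
generate_embedding_vis_array is also project-local. Judging from the 2x2 nested lists of embeddings and names passed above, it likely renders one pairwise-distance matrix per grid cell; a hypothetical sketch in that spirit, assuming Euclidean distances between row vectors (the matshow styling mirrors Example #5):

import numpy as np
import matplotlib.pyplot as plt

def generate_embedding_vis_array(embeddings, names):
    # hypothetical sketch: one distance matrix per grid cell
    rows, cols = len(embeddings), len(embeddings[0])
    fig, axes = plt.subplots(rows, cols)
    for r in range(rows):
        for c in range(cols):
            emb = embeddings[r][c]
            # pairwise Euclidean distances between rows of the embedding
            dists = np.linalg.norm(emb[:, None, :] - emb[None, :, :], axis=-1)
            axes[r, c].matshow(dists, cmap='Blues', vmin=0)
            axes[r, c].set_title(names[r][c])
    plt.show()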
Example #3
def cbow(train, evaluate, embedding_dim, sentence_length, window, epochs, learning_rate):
    import numpy as np
    from embeddings.random_walk import random_walk
    from embeddings.word2vec import CBOW

    # walk the averaged training matrix and collect the visited rows
    Xm = train.mean(axis=0)
    walk = random_walk(Xm, steps=sentence_length)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = Xm[pos]

    model = CBOW(268, embedding_dim, window, learning_rate)
    model.train_from_feature_seq(seq, epochs=epochs)
    return model.encode(evaluate)
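
A usage sketch for the helper above, with parameter values borrowed from the other examples (an 80/20 split, 1000-step walks, window 2, 200 epochs, learning rate 0.1):

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)
test_embedding = cbow(X_train, X_test, embedding_dim=64,
                      sentence_length=1000, window=2,
                      epochs=200, learning_rate=0.1)
print(test_embedding.shape)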
Example #4
def main():
    # imports as in Example #1, plus CBOW and generate_embedding_vis
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis=0)

    # walk the averaged matrix and collect the visited rows
    walk = random_walk(Xm, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = Xm[pos]

    # Skip-Gram
    model = Skip_Gram(268, 64, 2, 0.1)
    model.train_from_feature_seq(seq, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="Skip-Gram")

    # CBOW
    model = CBOW(268, 64, 2, 0.1)
    model.train_from_feature_seq(seq, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="CBOW")
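
train_from_feature_seq itself is not shown in these examples. One plausible reading of how a window-2 walk sequence becomes training pairs (a sketch, not the project's implementation): Skip-Gram learns to predict each context row from the center row, while CBOW predicts the center row from its averaged context.

def context_pairs(seq, window=2):
    # hypothetical sketch: slide a window over the walk's feature rows
    for i in range(len(seq)):
        context = [seq[j]
                   for j in range(max(0, i - window), min(len(seq), i + window + 1))
                   if j != i]
        yield seq[i], context  # (center row, surrounding rows)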
Example #5
def main():
    # imports as in Example #1, plus tensorflow as tf, matplotlib.pyplot as plt,
    # and the project's AutoEncoder, Transformer, MatrixFactorization,
    # TensorFactorization, CBOW, and calculate_distance_matrix helpers
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)

    EMBEDDING_DIM = 16

    #Fully-Connected AutoEncoder
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(EMBEDDING_DIM, activation='tanh'))(e_x)
    e = tf.keras.Model(e_x, e_o)

    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)

    ae_model = AutoEncoder(e, d)
    ae_model.train(X, epochs=50, learning_rate=0.001, loss='mse')

    #Transformer AutoEncoder
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=8, activation='tanh')(et_x)
    et = tf.keras.Model(et_x, et_o)

    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=8, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)

    ae_modelt = AutoEncoder(et, dt)
    ae_modelt.train(X, epochs=100, learning_rate=0.001, loss='mse')

    #Matrix Factorization
    mat_factorization = MatrixFactorization(Xm, EMBEDDING_DIM)
    mat_factorization.fit(200, 0.0001)

    #Tensor Factorization
    tens_factorization = TensorFactorization(X, EMBEDDING_DIM)
    tens_factorization.fit(50)

    # walk the averaged matrix and collect the visited rows
    walk = random_walk(Xm, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = Xm[pos]

    # Skip-Gram
    skipgram = Skip_Gram(268, EMBEDDING_DIM, 3, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)

    # CBOW
    cbow = CBOW(268, EMBEDDING_DIM, 3, 0.1)
    cbow.train_from_feature_seq(seq, epochs=200)

    og_distances = calculate_distance_matrix(X.reshape((len(X), -1)))

    models = {
        'AutoEncoder': ae_model,
        'Transformer': ae_modelt,
        'Matrix Factorization': mat_factorization,
        'Tensor Factorization': tens_factorization,
        'Skip-Gram': skipgram,
        'CBOW': cbow
    }

    model_distances = {}

    for key, mod in models.items():
        x_embed = mod.encode(X)
        model_distances[key] = calculate_distance_matrix(
            x_embed.reshape((len(x_embed), -1)))

    #plot distances
    plt.matshow(og_distances, cmap='Blues', vmin=0)
    plt.title('Original Distances')
    plt.savefig('images/og_distance_matrix.png')

    fig, axes = plt.subplots(2, 3)
    for i, (embedding_name, embedding_distances) in enumerate(model_distances.items()):
        r, c = i // 3, i % 3
        axes[r, c].matshow(embedding_distances, cmap='Blues', vmin=0)
        axes[r, c].set_title(embedding_name)
    fig.savefig('images/embedding_distances_matrix.png')
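
calculate_distance_matrix is project-local as well. A minimal sketch, assuming it returns pairwise Euclidean distances between the flattened sample vectors; the Gram-matrix identity keeps memory manageable for 268*268-dimensional rows:

import numpy as np

def calculate_distance_matrix(X):
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, for all row pairs at once
    sq = np.sum(X ** 2, axis=1)
    d2 = sq[:, None] + sq[None, :] - 2.0 * (X @ X.T)
    return np.sqrt(np.maximum(d2, 0.0))  # clip tiny negatives from round-off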
Example #6
def main():
    # imports as in Example #1, plus Ridge, SVR, and MLPRegressor from
    # scikit-learn, and matplotlib.pyplot as plt
    # dimensions to test
    DIMENSIONS = [64, 32, 16, 8, 4, 2]

    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    # train a CBOW embedding for each dimension
    models = list()
    for dimension in DIMENSIONS:
        print(f"{dimension}-D Embedding Training")
        model = CBOW(268, dimension, 2, 0.1)
        model.train_from_feature_seq(seq, epochs=300)
        models.append((model, dimension))

    # encode train and test data using each embedding, then flatten for prediction
    embedded_train_list = list()
    embedded_test_list = list()
    for model, dim in models:
        embedded_train_matrix = np.zeros((len(X_train), 268 * dim))
        for i in range(len(X_train)):
            embedded_train_matrix[i] = model.encode(X_train[i]).flatten()
        embedded_train_list.append(embedded_train_matrix)

        embedded_test_matrix = np.zeros((len(X_test), 268 * dim))
        for i in range(len(X_test)):
            embedded_test_matrix[i] = model.encode(X_test[i]).flatten()
        embedded_test_list.append(embedded_test_matrix)

    # train prediction models on encoded train data, then test on encoded test data and calculate Mean Squared Error
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    for i in range(len(embedded_train_list)):
        #savemat(f'Data/cbow_{DIMENSIONS[i]}.mat', {'train':embedded_train_list[i] ,'test':embedded_test_list[i]})
        lr = Ridge().fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(100,)).fit(embedded_train_list[i], np.reshape(y_train, -1))
        print(mlp.loss_)
        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(f"{embedded_test_list[i].shape[-1] // 268}-D Predicted")
        lr_error = mean_squared_error(y_test, predictedLR)
        svr_error = mean_squared_error(y_test, predictedSV)
        mlp_error = mean_squared_error(y_test, predictedMLP)
        lr_error_list.append(lr_error)
        svr_error_list.append(svr_error)
        mlp_error_list.append(mlp_error)

    # plot MSE for each embedding dimension and prediction method
    width = 0.25  # three bars per group; keep them from overlapping the next group
    plt.bar(np.arange(len(lr_error_list)), lr_error_list, width, label="Ridge")
    plt.bar(np.arange(len(svr_error_list)) + width, svr_error_list, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width, mlp_error_list, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Embedding dimension")
    plt.title("CBOW Mean Squared Error by Embedding Dimension")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.show()