def main():
    """Train a Skip-Gram embedding on a random walk over the mean train
    connectivity matrix, embed each subject, and evaluate a Lasso
    regression for age prediction (prints the test MSE).
    """
    dimension = 32
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # Average matrix over train data only (avoids leaking test subjects).
    avg_matrix = X_train.mean(axis=0)

    # Generate a random walk and turn it into a sequence of node feature rows.
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, dimension, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)

    # Encode each subject matrix and flatten it to one feature vector.
    # (Idiom fix: call .flatten() on the array instead of the unbound
    # np.ndarray.flatten.)
    embedded_train_matrix = np.zeros((len(X_train), 268 * dimension))
    for i in range(len(X_train)):
        embedded_train_matrix[i] = skipgram.encode(X_train[i]).flatten()

    embedded_test_matrix = np.zeros((len(X_test), 268 * dimension))
    for i in range(len(X_test)):
        embedded_test_matrix[i] = skipgram.encode(X_test[i]).flatten()

    lasso = Lasso(100, .01)
    lasso.train_coordinate_descent(embedded_train_matrix, y_train)
    predicted = lasso.predict(embedded_test_matrix)
    print(mean_squared_error(y_test, predicted))
def main():
    """Train an autoencoder, Skip-Gram, and CBOW on the connectivity data
    and plot a 2x2 grid comparing original vs. embedded distance matrices.
    """
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    mean_matrix = X.mean(axis=0)
    print(mean_matrix.shape)

    # Neural autoencoder trained on the full data set.
    autoencoder = AutoEncoder(X.shape[-1], 64, activation='relu')
    autoencoder.train(X, epochs=200, learning_rate=0.001, loss='mse')

    # Random-walk feature sequence shared by both word2vec-style models.
    walk = random_walk(mean_matrix, steps=1000)
    feature_seq = np.zeros((len(walk), 268))
    for step, node in enumerate(walk):
        feature_seq[step, :] = mean_matrix[node]
    print(feature_seq.shape)

    sg_model = Skip_Gram(268, 64, 2, 0.1)
    sg_model.train_from_feature_seq(feature_seq, epochs=200)

    cbow_model = CBOW(268, 64, 2, 0.1)
    cbow_model.train_from_feature_seq(feature_seq, epochs=200)

    # 2x2 layout: original + autoencoder on top, Skip-Gram + CBOW below.
    distances = [[mean_matrix, autoencoder.encode(mean_matrix)],
                 [sg_model.encode(mean_matrix), cbow_model.encode(mean_matrix)]]
    names = [['Original Distances', 'Autoencoder Distances'],
             ['SkipGram Distances', 'CBOW Distances']]
    generate_embedding_vis_array(distances, names)
def cbow(train, evaluate, embedding_dim, sentence_length, window, epochs, learning_rate):
    """Fit a CBOW embedding on a random walk over the mean of *train*
    and return the encoding of *evaluate*.

    The walk of length *sentence_length* is converted into a sequence of
    268-feature rows taken from the mean matrix before training.
    """
    from embeddings.random_walk import random_walk
    from embeddings.word2vec import CBOW

    mean_matrix = train.mean(axis=0)
    walk = random_walk(mean_matrix, steps=sentence_length)

    # Each walk position contributes the corresponding row of the mean matrix.
    feature_seq = np.zeros((len(walk), 268))
    for step, node in enumerate(walk):
        feature_seq[step, :] = mean_matrix[node]

    model = CBOW(268, embedding_dim, window, learning_rate)
    model.train_from_feature_seq(feature_seq, epochs=epochs)
    return model.encode(evaluate)
def main():
    """Train Skip-Gram and CBOW embeddings on a random walk over the mean
    connectivity matrix and visualize each against the original matrix.
    """
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    mean_matrix = X.mean(axis=0)

    # Build the random-walk feature sequence used by both models.
    walk = random_walk(mean_matrix, steps=1000)
    feature_seq = np.zeros((len(walk), 268))
    for step, node in enumerate(walk):
        feature_seq[step, :] = mean_matrix[node]

    # Skip-Gram
    sg_model = Skip_Gram(268, 64, 2, 0.1)
    sg_model.train_from_feature_seq(feature_seq, epochs=200)
    generate_embedding_vis(mean_matrix, sg_model.encode(mean_matrix),
                           embedding_name="Skip-Gram")

    # CBOW
    cbow_model = CBOW(268, 64, 2, 0.1)
    cbow_model.train_from_feature_seq(feature_seq, epochs=200)
    generate_embedding_vis(mean_matrix, cbow_model.encode(mean_matrix),
                           embedding_name="CBOW")
def main():
    """Train six embedding models (dense AE, transformer AE, matrix/tensor
    factorization, Skip-Gram, CBOW) on the connectivity data and save plots
    of the original vs. embedded pairwise distance matrices.
    """
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)
    EMBEDDING_DIM = 16

    # Fully-connected autoencoder: per-node Dense encoder/decoder.
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(EMBEDDING_DIM, activation='tanh'))(e_x)
    e = tf.keras.Model(e_x, e_o)
    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)
    ae_model = AutoEncoder(e, d)
    ae_model.train(X, epochs=50, learning_rate=0.001, loss='mse')

    # Transformer autoencoder.
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=8, activation='tanh')(et_x)
    et = tf.keras.Model(et_x, et_o)
    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=8, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)
    ae_modelt = AutoEncoder(et, dt)
    ae_modelt.train(X, epochs=100, learning_rate=0.001, loss='mse')

    # Matrix factorization (on the mean matrix).
    mat_factorization = MatrixFactorization(Xm, EMBEDDING_DIM)
    mat_factorization.fit(200, 0.0001)

    # Tensor factorization (on the full data tensor).
    tens_factorization = TensorFactorization(X, EMBEDDING_DIM)
    tens_factorization.fit(50)

    # Random-walk feature sequence for the word2vec-style models.
    walk = random_walk(Xm, steps=1000)
    one_hot = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        one_hot[i, :] = Xm[pos]

    # Skip-Gram
    skipgram = Skip_Gram(268, EMBEDDING_DIM, 3, 0.1)
    skipgram.train_from_feature_seq(one_hot, epochs=200)

    # CBOW
    cbow = CBOW(268, EMBEDDING_DIM, 3, 0.1)
    # BUG FIX: this line previously retrained `skipgram`, so the CBOW model
    # below was encoded (and plotted) without ever being trained.
    cbow.train_from_feature_seq(one_hot, epochs=200)

    og_distances = calculate_distance_matrix(X.reshape((len(X), -1)))
    models = {
        'AutoEncoder': ae_model,
        'Transformer': ae_modelt,
        'Matrix Factorization': mat_factorization,
        'Tensor Factorization': tens_factorization,
        'Skip-Gram': skipgram,
        'CBOW': cbow
    }
    model_distances = {}
    for key, mod in models.items():
        x_embed = mod.encode(X)
        model_distances[key] = calculate_distance_matrix(
            x_embed.reshape((len(x_embed), -1)))

    # Plot distances: original alone, then a 2x3 grid for the six models.
    plt.matshow(og_distances, cmap='Blues', vmin=0)
    plt.title('Original Distances')
    plt.savefig('images/og_distance_matrix.png')
    fig, axes = plt.subplots(2, 3)
    for i, (embedding_name, embedding_distances) in enumerate(model_distances.items()):
        r, c = divmod(i, 3)
        axes[r, c].matshow(embedding_distances, cmap='Blues', vmin=0)
        axes[r, c].set_title(embedding_name)
    fig.savefig('images/embedding_distances_matrix.png')
def main():
    """Train CBOW embeddings at several dimensionalities, predict age with
    Ridge / SVR / MLP on the flattened embeddings, and plot the test MSE
    per embedding dimension as a grouped bar chart.
    """
    # Dimensions to test.
    DIMENSIONS = [64, 32, 16, 8, 4, 2]
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # Average matrix over train data only.
    avg_matrix = X_train.mean(axis=0)

    # Generate the random-walk feature sequence shared by every embedding.
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    # Train one CBOW embedding per dimension.
    # (Renamed from `skipgram`: the models constructed here are CBOW.)
    embedders = list()
    for dimension in DIMENSIONS:
        print(str(dimension) + "-D Embedding Training")
        embedder = CBOW(268, dimension, 2, 0.1)
        embedder.train_from_feature_seq(seq, epochs=300)
        embedders.append((embedder, dimension))

    # Encode train and test data with each embedding, flattened per subject.
    embedded_train_list = list()
    embedded_test_list = list()
    for embedder, dimension in embedders:
        embedded_train_matrix = np.zeros((len(X_train), 268 * dimension))
        for i in range(len(X_train)):
            embedded_train_matrix[i] = embedder.encode(X_train[i]).flatten()
        embedded_train_list.append(embedded_train_matrix)

        embedded_test_matrix = np.zeros((len(X_test), 268 * dimension))
        for i in range(len(X_test)):
            embedded_test_matrix[i] = embedder.encode(X_test[i]).flatten()
        embedded_test_list.append(embedded_test_matrix)

    # Train prediction models on encoded train data, then compute test MSE.
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    for i in range(len(embedded_train_list)):
        #savemat(f'Data/cbow_{DIMENSIONS[i]}.mat', {'train':embedded_train_list[i] ,'test':embedded_test_list[i]})
        lr = Ridge().fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(100,)).fit(
            embedded_train_list[i], np.reshape(y_train, -1))
        print(mlp.loss_)
        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(str(embedded_test_list[i].shape[-1] // 268) + "-D Predicted")
        lr_error_list.append(mean_squared_error(predictedLR, y_test))
        svr_error_list.append(mean_squared_error(predictedSV, y_test))
        mlp_error_list.append(mean_squared_error(predictedMLP, y_test))

    # Plot MSE for the different embedding dims and prediction methods.
    width = 0.35
    plt.bar(np.arange(len(lr_error_list)), lr_error_list, width, label="LinReg")
    plt.bar(np.arange(len(svr_error_list)) + width, svr_error_list, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width, mlp_error_list, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    # BUG FIX: the embeddings above are CBOW, but the title said "SkipGram"
    # (the savemat filename `cbow_*` confirms CBOW is the intended model).
    plt.title("CBOW Mean Squared Error by Embedding Dimension")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.show()