def encode_features(X_train, X_val, encode_type):
    assert encode_type in ["NN-embedding", "one-hot", "label-encode"]
    if encode_type == "NN-embedding":
        print("******* use NN embedding features ...")
        embeddings_path = os.path.join(data_dir, em_name)
        features_em, embeddings_dict, em_size = load_embeddings(embeddings_path)
        X_train = embed_features(X_train, embeddings_dict, features, features_em)  # X shape (num_records, 42)
        X_val = embed_features(X_val, embeddings_dict, features, features_em)  # X shape (num_records, 42)
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)
    elif encode_type == "one-hot":
        print("******* use one-hot encoded features ...")
        enc = OneHotEncoder(sparse=False)  # use sparse_output=False on scikit-learn >= 1.2
        enc.fit(X_train)
        X_train = enc.transform(X_train)  # X shape (num_records, 1182)
        X_val = enc.transform(X_val)  # X shape (num_records, 1182)
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)
    elif encode_type == "label-encode":
        # do nothing, features are already label-encoded
        print("******* use label encoded features ...")
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)
    return X_train, X_val
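# A minimal usage sketch (hypothetical toy data; in the real pipeline X_train
# and X_val are the label-encoded feature matrices built elsewhere in this
# notebook):
#
#   X_tr = np.array([[1, 0, 3], [0, 2, 1], [1, 2, 0]])
#   X_va = np.array([[0, 0, 3]])
#   X_tr, X_va = encode_features(X_tr, X_va, "one-hot")
#   # -> one column per category value observed in X_tr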
def correlate(**kwargs):
    # Get pairwise dissimilarities of learnt representations
    embeddings = load_embeddings(**kwargs)
    learnt_dissimilarity_matrix = embeddings_to_dissimilarity(embeddings)
    learnt_dissimilarities = squareform(learnt_dissimilarity_matrix)

    # Get pairwise dissimilarities of ground truth representations
    level, lg = kwargs["level"], kwargs["lg"]
    assert level == "phoneme", "This function is only for phoneme-level models"
    phonemes_in_corpus = list(learnt_dissimilarity_matrix.index)
    features = load_features(lg).loc[phonemes_in_corpus]
    row_indices, column_indices = np.triu_indices(
        len(learnt_dissimilarity_matrix), k=1)
    rows = [learnt_dissimilarity_matrix.index[i] for i in row_indices]
    columns = [learnt_dissimilarity_matrix.columns[j] for j in column_indices]
    indices = list(zip(rows, columns))
    ground_truth_dissimilarities = np.array(
        [distance_fn(*index) for index in indices])
    r, p = spearmanr(learnt_dissimilarities, ground_truth_dissimilarities)

    # Write results to disk
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    name = kwargs["name"]
    path = f"results/{level}/correlation/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str((r, p)))
    return r, p
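# The pairing above relies on scipy's squareform and np.triu_indices walking
# the strict upper triangle in the same row-major order; a standalone sanity
# check on a toy 3x3 symmetric matrix (not part of the pipeline):
#
#   D = np.array([[0., 1., 2.],
#                 [1., 0., 3.],
#                 [2., 3., 0.]])
#   print(squareform(D))            # [1. 2. 3.]
#   print(np.triu_indices(3, k=1))  # (array([0, 0, 1]), array([1, 2, 2]))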
def original(**kwargs):
    embeddings = load_embeddings(**kwargs)
    lg = kwargs["lg"]
    features = load_features(lg)
    common_phonemes = embeddings.index.intersection(features.index)
    S = features.loc[common_phonemes]
    X = embeddings.loc[common_phonemes]
    assert X.shape[0] == S.shape[0], (X.shape, S.shape, "Unequal number of rows")
    assert X.shape[0] > 1, (X.shape, "Must have more than 1 row")

    X = norm_center_matrix(X)
    S = norm_center_matrix(S)
    X_q, _, _ = decomp_qr.qr(X, overwrite_a=True, mode="economic", pivoting=True)
    S_q, _, _ = decomp_qr.qr(S, overwrite_a=True, mode="economic", pivoting=True)
    C = np.dot(X_q.T, S_q)
    r = linalg.svd(C, full_matrices=False, compute_uv=False)
    d = min(X.shape[1], S.shape[1])
    r = r[:d]
    r = np.minimum(np.maximum(r, 0.0), 1.0)  # remove roundoff errs
    r = r.mean()

    # Write results to disk
    level, lg, name = kwargs["level"], kwargs["lg"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str(r))
    return r
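# Why the QR + SVD step works (a standalone sketch, not the project's code):
# after centering, the singular values of X_q.T @ S_q are the canonical
# correlations between the column spaces of X and S, so feeding the same
# matrix in twice should give all ones (up to roundoff):
#
#   rng = np.random.default_rng(0)
#   M = rng.normal(size=(20, 5))
#   M -= M.mean(axis=0)
#   Q, _, _ = decomp_qr.qr(M, mode="economic", pivoting=True)
#   print(linalg.svd(Q.T @ Q, compute_uv=False))  # ~[1. 1. 1. 1. 1.]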
def correlate(**kwargs):
    embeddings = load_embeddings(**kwargs).T
    lg = kwargs["lg"]
    features = load_features(lg).T
    common_phonemes = embeddings.columns.intersection(features.columns)
    S = features[common_phonemes]
    X = embeddings[common_phonemes]
    correlations = pd.DataFrame(
        {i: X.corrwith(S.iloc[i], axis=1) for i in range(len(S))})
    correlations.columns = S.index

    # Write results to disk
    level, name = kwargs["level"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.csv")
    correlations.to_csv(filename, index=False)
    return correlations
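# The resulting frame has one row per embedding dimension and one column per
# phonological feature (whatever load_features provides), so the feature each
# dimension tracks most strongly can be read off directly:
#
#   correlations.abs().idxmax(axis=1)  # best-matching feature per dimension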
def get_raw_dissimilarities(**kwargs):
    # Get pairwise dissimilarities of learnt representations
    embeddings = load_embeddings(**kwargs)
    learnt_dissimilarity_matrix = embeddings_to_dissimilarity(embeddings)
    learnt_dissimilarities = squareform(learnt_dissimilarity_matrix)

    # Get pairwise dissimilarities of ground truth representations
    level, lg = kwargs["level"], kwargs["lg"]
    assert level == "phoneme", "This function is only for phoneme-level models"
    phonemes_in_corpus = list(learnt_dissimilarity_matrix.index)
    features = load_features(lg).loc[phonemes_in_corpus]
    row_indices, column_indices = np.triu_indices(
        len(learnt_dissimilarity_matrix), k=1)
    rows = [learnt_dissimilarity_matrix.index[i] for i in row_indices]
    columns = [learnt_dissimilarity_matrix.columns[j] for j in column_indices]
    indices = list(zip(rows, columns))
    ground_truth_dissimilarities = np.array(
        [distance_fn(*index) for index in indices])
    return pd.DataFrame(
        {
            "learnt": learnt_dissimilarities,
            "true": ground_truth_dissimilarities,
        },
        index=indices,
    )
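# Usage sketch (hypothetical hyperparameter values; the kwargs keys mirror the
# ones read by the writer functions above):
#
#   df = get_raw_dissimilarities(level="phoneme", lg="en", name="cbow",
#                                size=100, window=2, epoch=10)
#   print(df.corr(method="spearman"))  # learnt-vs-true rank agreement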
def qvec_cca(**kwargs):
    embeddings = load_embeddings(**kwargs).T
    lg = kwargs["lg"]
    features = load_features(lg).T
    common_phonemes = embeddings.columns.intersection(features.columns)
    S = features[common_phonemes]
    X = embeddings[common_phonemes]
    cca = CCA(n_components=1)
    a, b = cca.fit_transform(X.T, S.T)
    a, b = a.reshape(-1), b.reshape(-1)
    r, p = pearsonr(a, b)

    # Write results to disk
    level, lg, name = kwargs["level"], kwargs["lg"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str((r, p)))
    return r, p
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams.update({'font.size': 16})

# directory of the pickle files
data_dir = 'data/'
# directory for saving the images
image_dir = 'results/'

# load embeddings
em_name = 'embeddings_auto.pickle'  # auto calculated (half of the original dim, at most 50)
#em_name = 'embeddings_ref.pickle'  # paper ver
embeddings_path = os.path.join(data_dir, em_name)
features_em, embeddings_dict, em_size = load_embeddings(embeddings_path)

# load LabelEncoders
with open(os.path.join(data_dir, "les.pickle"), 'rb') as f:
    les_dict = pickle.load(f)  # usage: les_dict['DayOfWeek']
print("label encoded features: ", les_dict.keys())


def plot_2D(xx, yy, names, figsize=(8, 8)):
    # plot 2D results
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(xx, yy, 'o', markeredgecolor='k', alpha=0.6, markersize=10)
    for i, txt in enumerate(names):
        ax.annotate(txt, (xx[i], yy[i]), xytext=(8.5, -5),
                    textcoords='offset points')
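# Usage sketch for plot_2D: project one feature's learnt embedding matrix to
# 2D and label each point with its original category name ('State' and the
# embeddings_dict layout are assumptions about the pickled structures):
#
#   from sklearn.decomposition import PCA
#   em_state = embeddings_dict['State']             # (n_categories, em_dim)
#   xy = PCA(n_components=2).fit_transform(em_state)
#   plot_2D(xy[:, 0], xy[:, 1], les_dict['State'].classes_)
#   plt.savefig(os.path.join(image_dir, 'state_embedding_2d.pdf'))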
""" # plot performance results #model_res.index = ["embed", "one-hot", "label"] axes = model_res.plot.bar(subplots=True, figsize=(10, 4), layout=(1, 3), legend=None, rot=30) #axes[0, 0].set_ylim(0, 1200) #axes[0, 1].set_ylim(0, 10) #axes[0, 2].set_ylim(0, 0.31) plt.tight_layout() fig_path = os.path.join(res_dir, "res_"+model_name+".pdf") print("Save performance figure to: {}".format(fig_path)) plt.savefig(fig_path, bbox_inches='tight') # save embedding size embeddings_path = os.path.join(data_dir, em_name) _, _, em_size = load_embeddings(embeddings_path) res_path = os.path.join(res_dir, "em_size.csv") print("Save embedding size to: {}".format(res_path)) em_size.to_csv(res_path) """ # load testing data: features X, target y df_test = pd.read_csv(os.path.join(data_dir, 'feature_test_data.csv')) features = ['Open', 'Store', 'DayOfWeek', 'Promo', 'Year', 'Month', 'Day', 'State'] X = df_train[features] X = np.asarray(X) if embeddings_as_input: print("Using learned embeddings as input") X = embed_features(X, embeddings_path) # X shape (num_records, 42)