Example #1
0
def encode_features(X_train, X_val, encode_type):
    """Encode categorical feature matrices for model input.

    Parameters:
        X_train: training feature matrix (label-encoded categories).
        X_val: validation feature matrix (label-encoded categories).
        encode_type: one of "NN-embedding", "one-hot", "label-encode".

    Returns:
        (X_train, X_val) transformed according to `encode_type`.

    Raises:
        ValueError: if `encode_type` is not a supported option.
    """
    # Validate with an explicit raise instead of `assert`: asserts are
    # stripped under `python -O`, which would silently return the inputs
    # unencoded for an unsupported encode_type.
    if encode_type not in ("NN-embedding", "one-hot", "label-encode"):
        raise ValueError(f"unknown encode_type: {encode_type!r}")

    if encode_type == "NN-embedding":
        print("******* use NN embedding features ...")
        # `data_dir`, `em_name` and `features` are module-level globals.
        embeddings_path = os.path.join(data_dir, em_name)
        features_em, embeddings_dict, em_size = load_embeddings(embeddings_path)
        X_train = embed_features(X_train, embeddings_dict, features, features_em) # X shape (num_records, 42)
        X_val = embed_features(X_val, embeddings_dict, features, features_em) # X shape (num_records, 42)
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)

    elif encode_type == "one-hot":
        print("******* use one-hot encoded features ...")
        # Fit on the training split only so validation shares its categories.
        # NOTE(review): `sparse=False` is deprecated in scikit-learn >= 1.2
        # (renamed `sparse_output`) — confirm the pinned sklearn version.
        enc = OneHotEncoder(sparse=False)
        enc.fit(X_train)
        X_train = enc.transform(X_train) # X shape (num_records, 1182)
        X_val = enc.transform(X_val) # X shape (num_records, 1182)
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)

    else:  # "label-encode"
        # Features are already label-encoded; pass through unchanged.
        print("******* use label encoded features ...")
        print("training X: ", X_train.shape)
        print("validation X: ", X_val.shape)

    return X_train, X_val
Example #2
0
def correlate(**kwargs):
    """Spearman-correlate learnt phoneme dissimilarities with ground truth.

    Builds the upper-triangle pairwise dissimilarities of the learnt
    embeddings and of the ground-truth metric (`distance_fn`), correlates
    them, writes `(r, p)` to
    results/<level>/correlation/<lg>/<name>/<hyperparams>/<epoch>.txt,
    and returns (r, p). Only valid for phoneme-level models.
    """
    # Pairwise dissimilarities of the learnt representations.
    embeddings = load_embeddings(**kwargs)
    learnt_dissimilarity_matrix = embeddings_to_dissimilarity(embeddings)
    learnt_dissimilarities = squareform(learnt_dissimilarity_matrix)
    # Pairwise dissimilarities of the ground-truth representations.
    level, lg = kwargs["level"], kwargs["lg"]
    assert level == "phoneme", "This function is only for phoneme-level models"
    phonemes_in_corpus = list(learnt_dissimilarity_matrix.index)
    # The loaded feature table was previously bound but never used; keep the
    # lookup purely as a sanity check that every corpus phoneme has
    # ground-truth features (raises KeyError otherwise).
    load_features(lg).loc[phonemes_in_corpus]
    # Enumerate the upper-triangle (i < j) phoneme pairs in squareform order.
    row_indices, column_indices = np.triu_indices(
        len(learnt_dissimilarity_matrix), k=1)
    rows = [learnt_dissimilarity_matrix.index[i] for i in row_indices]
    columns = [learnt_dissimilarity_matrix.columns[j] for j in column_indices]
    indices = list(zip(rows, columns))
    ground_truth_dissimilarities = np.array(
        [distance_fn(*index) for index in indices])
    r, p = spearmanr(learnt_dissimilarities, ground_truth_dissimilarities)
    # Write results to disk.
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    name = kwargs["name"]
    path = f"results/{level}/correlation/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str((r, p)))
    return r, p
Example #3
0
def original(**kwargs):
    """Mean canonical correlation between embeddings and phoneme features.

    Uses the QR+SVD formulation of CCA: QR-decompose both normalized,
    centered matrices, then take the singular values of Q_x^T Q_s as the
    canonical correlations. Writes the mean correlation to
    results/<level>/qvec/<lg>/<name>/<hyperparams>/<epoch>.txt and
    returns it.
    """
    embeddings = load_embeddings(**kwargs)
    lg = kwargs["lg"]
    features = load_features(lg)
    # Align both tables on the phonemes they share.
    common_phonemes = embeddings.index.intersection(features.index)
    S = features.loc[common_phonemes]
    X = embeddings.loc[common_phonemes]
    assert X.shape[0] == S.shape[0], (X.shape, S.shape,
                                      "Unequal number of rows")
    assert X.shape[0] > 1, (X.shape, "Must have more than 1 row")
    X = norm_center_matrix(X)
    S = norm_center_matrix(S)
    # Economic, pivoted QR of each matrix. `overwrite_a=True` may destroy
    # the array contents, but only the shapes of X/S are used afterwards.
    X_q, _, _ = decomp_qr.qr(X,
                             overwrite_a=True,
                             mode="economic",
                             pivoting=True)
    S_q, _, _ = decomp_qr.qr(S,
                             overwrite_a=True,
                             mode="economic",
                             pivoting=True)
    # Singular values of Q_x^T Q_s are the canonical correlations; keep at
    # most d = min rank of the two matrices.
    C = np.dot(X_q.T, S_q)
    r = linalg.svd(C, full_matrices=False, compute_uv=False)
    d = min(X.shape[1], S.shape[1])
    r = r[:d]
    r = np.minimum(np.maximum(r, 0.0), 1.0)  # remove roundoff errs
    r = r.mean()
    # Write results to disk. (`lg` was already read above; don't re-fetch it.)
    level, name = kwargs["level"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str(r))
    return r
Example #4
0
def correlate(**kwargs):
    """Correlate each embedding dimension with each ground-truth feature.

    The result has one column per ground-truth feature and one row per
    embedding dimension; it is written to
    results/<level>/qvec/<lg>/<name>/<hyperparams>/<epoch>.csv and returned.
    """
    embeddings = load_embeddings(**kwargs).T
    lg = kwargs["lg"]
    features = load_features(lg).T
    # After transposing, phonemes are the columns; keep only shared ones.
    common_phonemes = embeddings.columns.intersection(features.columns)
    S = features[common_phonemes]
    X = embeddings[common_phonemes]
    # Row-wise correlation of every embedding dimension with every feature.
    per_feature = {}
    for row in range(len(S)):
        per_feature[row] = X.corrwith(S.iloc[row], axis=1)
    correlations = pd.DataFrame(per_feature)
    correlations.columns = S.index
    # Persist the table to disk.
    level, name = kwargs["level"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.csv")
    correlations.to_csv(filename, index=False)
    return correlations
Example #5
0
def get_raw_dissimilarities(**kwargs):
    """Return paired learnt vs. ground-truth phoneme dissimilarities.

    The result is a DataFrame indexed by (phoneme_a, phoneme_b) pairs from
    the upper triangle of the dissimilarity matrix, with columns "learnt"
    and "true". Only valid for phoneme-level models.
    """
    # Pairwise dissimilarities of the learnt representations.
    embeddings = load_embeddings(**kwargs)
    learnt_dissimilarity_matrix = embeddings_to_dissimilarity(embeddings)
    learnt_dissimilarities = squareform(learnt_dissimilarity_matrix)
    # Pairwise dissimilarities of the ground-truth representations.
    level, lg = kwargs["level"], kwargs["lg"]
    assert level == "phoneme", "This function is only for phoneme-level models"
    phonemes_in_corpus = list(learnt_dissimilarity_matrix.index)
    # The loaded feature table was previously bound but never used; keep the
    # lookup purely as a sanity check that every corpus phoneme has
    # ground-truth features (raises KeyError otherwise).
    load_features(lg).loc[phonemes_in_corpus]
    # Enumerate the upper-triangle (i < j) phoneme pairs in squareform order.
    row_indices, column_indices = np.triu_indices(
        len(learnt_dissimilarity_matrix), k=1)
    rows = [learnt_dissimilarity_matrix.index[i] for i in row_indices]
    columns = [learnt_dissimilarity_matrix.columns[j] for j in column_indices]
    indices = list(zip(rows, columns))
    ground_truth_dissimilarities = np.array(
        [distance_fn(*index) for index in indices])
    return pd.DataFrame(
        {
            "learnt": learnt_dissimilarities,
            "true": ground_truth_dissimilarities
        },
        index=indices,
    )
Example #6
0
def qvec_cca(**kwargs):
    """QVEC-CCA: correlate embeddings with ground-truth features via 1-D CCA.

    Projects both matrices onto their first pair of canonical directions
    and returns the Pearson correlation (r, p) of the projections; also
    writes the result to
    results/<level>/qvec/<lg>/<name>/<hyperparams>/<epoch>.txt.
    """
    embeddings = load_embeddings(**kwargs).T
    lg = kwargs["lg"]
    features = load_features(lg).T
    # After transposing, phonemes are the columns; keep only shared ones.
    common_phonemes = embeddings.columns.intersection(features.columns)
    S = features[common_phonemes]
    X = embeddings[common_phonemes]
    cca = CCA(n_components=1)
    # fit_transform expects samples (phonemes) as rows, hence the .T back.
    a, b = cca.fit_transform(X.T, S.T)
    a, b = a.reshape(-1), b.reshape(-1)
    r, p = pearsonr(a, b)
    # Write results to disk. (`lg` was already read above; don't re-fetch it.)
    level, name = kwargs["level"], kwargs["name"]
    if "hidden" in kwargs:
        hyperparams = f"{kwargs['size']}-{kwargs['hidden']}"
    else:
        hyperparams = f"{kwargs['size']}-{kwargs['window']}"
    path = f"results/{level}/qvec/{lg}/{name}/{hyperparams}"
    ensure_dir(path)
    epoch = kwargs["epoch"]
    filename = os.path.join(path, f"{epoch}.txt")
    with open(filename, "w") as file:
        file.write(str((r, p)))
    return r, p
Example #7
0
# Module-level setup for the embedding-visualization script: plot style,
# data/result directories, learnt embeddings, and the fitted LabelEncoders.
# NOTE(review): `plt`, `pickle` and `load_embeddings` are not imported in this
# chunk — presumably imported earlier in the file; verify.
plt.rcParams.update({'font.size': 16})
import numpy as np
import pandas as pd
import os

# directory of the pickle files
data_dir = 'data/'

# directory for saving the images
image_dir = 'results/'

# load embeddings
em_name = 'embeddings_auto.pickle'  # auto calculated (half of the original dim, at most 50)
#em_name = 'embeddings_ref.pickle' # paper ver
embeddings_path = os.path.join(data_dir, em_name)
features_em, embeddings_dict, em_size = load_embeddings(embeddings_path)

# load LabelEncoders
with open(os.path.join(data_dir, "les.pickle"), 'rb') as f:
    les_dict = pickle.load(f)  # usage: les_dict['DayOfWeek']
print("label encoded features: ", les_dict.keys())


def plot_2D(xx, yy, names, figsize=(8, 8)):
    """Scatter-plot 2-D coordinates and attach a text label to every point."""
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(xx, yy, 'o', markeredgecolor='k', alpha=0.6, markersize=10)
    # Annotate each point with its name, offset slightly from the marker.
    for idx, label in enumerate(names):
        ax.annotate(label, (xx[idx], yy[idx]),
                    xytext=(8.5, -5), textcoords='offset points')
Example #8
0
"""

# plot performance results
#model_res.index = ["embed", "one-hot", "label"]
axes = model_res.plot.bar(subplots=True, figsize=(10, 4), layout=(1, 3), legend=None, rot=30)
#axes[0, 0].set_ylim(0, 1200)
#axes[0, 1].set_ylim(0, 10)
#axes[0, 2].set_ylim(0, 0.31)
plt.tight_layout()
fig_path = os.path.join(res_dir, "res_"+model_name+".pdf")
print("Save performance figure to: {}".format(fig_path))
plt.savefig(fig_path, bbox_inches='tight')

# save embedding size
embeddings_path = os.path.join(data_dir, em_name)
_, _, em_size = load_embeddings(embeddings_path)
res_path = os.path.join(res_dir, "em_size.csv")
print("Save embedding size to: {}".format(res_path))
em_size.to_csv(res_path)



"""
# load testing data: features X, target y
df_test = pd.read_csv(os.path.join(data_dir, 'feature_test_data.csv'))
features = ['Open', 'Store', 'DayOfWeek', 'Promo', 'Year', 'Month', 'Day', 'State']
X = df_train[features]
X = np.asarray(X)
if embeddings_as_input:
    print("Using learned embeddings as input")
    X = embed_features(X, embeddings_path) # X shape (num_records, 42)