for l in languages:
            # Read this language's raw float32 embedding file as one flat array.
            X = np.fromfile(embed_data_dir + l + "_embeddings.raw",
                            dtype=np.float32,
                            count=-1)
            print(X.shape)
            # Reshape in place to (n_sentences, dim); assumes the file length
            # is an exact multiple of dim -- TODO confirm.
            X.resize(X.shape[0] // dim, dim)
            print(X.shape)
            # Keep only the [start:end] slice of sentences for the analysis.
            X = X[start:end]
            print(X.shape)
            target_embeddings.append(X)

        # Representational similarity analysis: first-order similarity
        # matrices per language, then a second-order language-by-language
        # distance matrix over those similarity structures.
        x, C = get_dists(target_embeddings,
                         labels=languages,
                         ticklabels=[],
                         distance="cosine",
                         save_dir=result_dir)
        # get_dists / compute_distance_over_dists are project helpers defined
        # elsewhere -- their exact return semantics are not visible here.
        distance_matrix = compute_distance_over_dists(x,
                                                      C,
                                                      languages,
                                                      save_dir=result_dir +
                                                      category)
        print(distance_matrix.shape)

        # Persist the distance matrix so later runs can re-plot without
        # recomputing.
        with open(distance_matrix_name, 'wb') as handle:
            pickle.dump(distance_matrix, handle)

    else:
        # If distance matrix has been calculated and you just want to adjust the plot, you can also load it directly
# Exemplo n.º 2
# 0
            print(langdict.keys())
            # If no word list was supplied, take the first language's
            # vocabulary as the shared word list.
            if len(words) == 0:
                words = list(langdicts[0].keys())

            # Look up this language's vector for every shared word.
            # NOTE(review): lookups use `langdict` (the current language);
            # a word missing from this dict would raise KeyError -- confirm
            # the vocabularies are aligned across languages.
            langvectors = []
            for word in words:
                langvectors.append(langdict[word])
            vectors.append(langvectors)

        ### Representational Similarity Analysis
        # First-order: a cosine-similarity matrix over the shared words for
        # every language.
        x, C = get_dists(vectors,
                         labels=languages,
                         ticklabels=words,
                         distance="cosine",
                         save_dir=save_dir + name)

        # Second-order: distances between the languages' similarity
        # structures (RSA over all languages).
        distance_matrix = compute_distance_over_dists(x,
                                                      C,
                                                      languages,
                                                      save_dir=save_dir + name)

        # Persist the language-by-language distance matrix for later use.
        with open(save_dir + "_distancematrix_" + name + ".pickle",
                  'wb') as handle:
            pickle.dump(distance_matrix, handle)
        words = list(set(list(langdict1.keys())).union(list(langdict2.keys())))
        print(words)

    # Collect one vector per word, preferring langdict1 and falling back to
    # langdict2 for words absent from the first dictionary (EAFP lookup).
    # A word present in neither dict raises KeyError to the caller.
    langvectors = []
    for word in words:
        try:
            langvectors.append(langdict1[word])
        except KeyError:
            # Word only exists in the second language's dictionary.
            langvectors.append(langdict2[word])
    vectors.append(langvectors)

# Cosine-similarity structures for all languages over the shared word list.
similarity_args = dict(labels=languages, ticklabels=words, distance="cosine")
x, C = get_dists(vectors, **similarity_args)

# Per-pair similarity vectors will be accumulated here.
pairs = {}

# Extra analysis for reviewer!
# Check languages with max and min values in examples

# for lang in ["et", "hu", "el", "he", "it", "pt", "es"]:
#     id = languages.index(lang)
#     similarities = np.array(C[id])
#     #Mask the lower part with zeros (including the diagonal)
#     upper_part = np.triu(similarities,1)
#     # Flatten and remove all zeros
#     flattened = np.matrix.flatten(upper_part)
#     similarities = [x for x in flattened if not x==0]
# Exemplo n.º 4
# 0
    print("Read vectors for " + lang)
    # Load the per-word embedding dict for this language (pickle written by an
    # earlier pipeline step).
    with open(data_dir + embeddings_name + "_embeddings." + lang + ".pickle",
              'rb') as handle:
        langdict = pickle.load(handle)
    langdicts.append(langdict)

    # Look up vectors for the selected words.
    # NOTE(review): lookups use the ENGLISH word list (words["en"]) but the
    # plot below is labelled with words[lang] -- assumes the per-language word
    # lists are index-aligned translations; confirm.
    langvectors = []
    for word in words["en"]:
        langvectors.append(langdict[word])
    vectors.append(langvectors)

    # Calculate cosine similarities for the selected words of this language.
    x, C = get_dists([langvectors],
                     labels=[lang],
                     ticklabels=words[lang],
                     distance="cosine",
                     save_dir=save_dir)

    ### PLOTTING
    # Show only the diagonal and upper triangle: np.tri(..., k=-1) is 1 on the
    # strictly-lower part, which the masked array hides.
    data = C[0]
    mask = np.tri(data.shape[0], k=-1)
    data = np.ma.array(data, mask=mask)
    # Plot the heatmap (similarities expected in [0, 1]).
    im = ax.imshow(data, cmap="YlOrRd", vmin=0, vmax=1)

    # Move axis labels/ticks to the top and right edges.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    ax.yaxis.tick_right()
    ax.yaxis.set_label_position('right')
# Load the raw float32 sentence embeddings for every target language and keep
# only the [start:end] slice of sentences.
for lang in target_langs:
    emb = np.fromfile(data_dir + lang + "_embeddings.raw",
                      dtype=np.float32,
                      count=-1)
    # The file is a flat vector; reshape in place to (n_sentences, dim).
    # Assumes the file length is an exact multiple of dim -- TODO confirm.
    emb.resize(emb.shape[0] // dim, dim)
    target_embeddings.append(emb[start:end])

# Read the English sentences (one per line), strip whitespace, and keep only
# the [start:end] slice used by the analysis.
with open(data_dir + "en_sents.txt") as sent_file:
    all_sents = [line.strip() for line in sent_file]
content = all_sents[start:end]

# Cosine-similarity structures per target language, with the English
# sentences as tick labels.
rsa_args = dict(labels=target_langs, ticklabels=content, distance="cosine")
x, C = get_dists(target_embeddings, **rsa_args)

# For every unordered sentence pair, collect the vector of its similarity
# values across all languages. The inner loop starts at sent1 + 1 so each
# pair is visited exactly once (no cross-product + filter).
pairs = {}
n_sents = len(content)
for sent1 in range(n_sents):
    for sent2 in range(sent1 + 1, n_sents):
        similarities = np.array(
            [C[lang][sent1][sent2] for lang in range(len(C))])
        pairs[(content[sent1], content[sent2])] = similarities

# Variance of each pair's similarity values across languages: high variance
# marks sentence pairs the languages disagree about most.
var = {pair: np.var(sims) for pair, sims in pairs.items()}

print("Pairs with variation across languages:")