Example #1
# Assumed imports throughout these examples: gensim, numpy as np,
# numpy.linalg as LA, scipy.sparse (csc_matrix, dok_matrix, eye), time,
# plus repo-local helpers (read_vocab, align, topk_mean, form_matrix,
# save_matrix) and a settings module st.
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file):
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec,
                                                                  binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec,
                                                                  binary=False)

    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    # align() is a repo-local helper whose return value is ignored, so it
    # presumably adjusts the index maps in place
    align(src_word2index)
    align(trg_word2index)
    # debug output: dump every source word index
    for word in src_word2index:
        print(src_word2index[word])

    src_index2word = {src_word2index[word]: word for word in src_word2index}
    trg_index2word = {trg_word2index[word]: word for word in trg_word2index}

    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word in trg_word2index:
        trg_matrix[:, trg_word2index[word]] = trg_vectors[word]

    # unit-normalize every word vector: axis=0 treats each column as one vector
    trg_vec_length = np.reciprocal(LA.norm(trg_matrix, axis=0))
    for i in range(len(trg_vec_length)):
        trg_matrix[:, i] *= trg_vec_length[i]

    print("form the source language matrix")
    for word in src_word2index:
        src_matrix[src_word2index[word], :] = src_vectors[word]

    # unit-normalize every word vector: axis=1 treats each row as one vector
    src_vec_length = np.reciprocal(LA.norm(src_matrix, axis=1))
    for i in range(len(src_vec_length)):
        src_matrix[i, :] *= src_vec_length[i]

    print("finding translation pairs")
    batch = 1000
    output = open(output_file, "w", encoding="utf-8")

    # src_matrix.shape[0] is the source vocabulary size: one slot per source word
    knn_sim_bwd = np.zeros(src_matrix.shape[0])
    for i in range(0, src_matrix.shape[0], batch):
        print("batch", i)
        j = min(i + batch, src_matrix.shape[0])
        knn_sim_bwd[i:j] = topk_mean(src_matrix[i:j].dot(trg_matrix),
                                     k=100,
                                     inplace=True)

    for i in range(0, knn_sim_bwd.shape[0]):
        output.write(src_index2word[i] + "," + str(knn_sim_bwd[i]) + "\n")

    output.close()
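Example #1 (and Example #8 below) relies on a topk_mean helper that is not shown. A minimal sketch, assuming numpy as np and modeled on the helper of the same name in the VecMap codebase (an assumption; the repo's own version may differ): it returns, for every row of m, the mean of the k largest values.

def topk_mean(m, k, inplace=False):
    # mean of the k largest entries in each row of m
    n = m.shape[0]
    ans = np.zeros(n, dtype=m.dtype)
    if k <= 0:
        return ans
    if not inplace:
        m = np.array(m)
    ind0 = np.arange(n)
    ind1 = np.empty(n, dtype=int)
    minimum = m.min()
    for i in range(k):
        m.argmax(axis=1, out=ind1)  # column of the current row-wise maximum
        ans += m[ind0, ind1]
        m[ind0, ind1] = minimum     # mask it out so the next pass finds the runner-up
    return ans / k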
Example #2
def compute_D(pairs_file, src_vocab, trg_vocab, output_file):
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)
    length = len(src_word2index) + len(trg_word2index)
    D = csc_matrix((length, length), dtype="float32")

    with open(pairs_file, "r", encoding="UTF-8-sig") as pairs_in:
        pairs = [line.strip().split() for line in pairs_in]

    D = form_matrix(pairs, src_word2index, trg_word2index, D)
    save_matrix(output_file, D)
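compute_D delegates to the repo-local helpers form_matrix and save_matrix, neither of which is shown. A hedged sketch of what save_matrix could look like if it used scipy's standard serializer (an assumption; the repo may well use its own format):

from scipy.sparse import save_npz

def save_matrix(path, matrix):
    # persist a scipy sparse matrix in compressed .npz form
    save_npz(path, matrix.tocsc())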
Example #3
def cnt2svd(count_file, vocab_file, PPMI):
    with open(count_file, "r", encoding="UTF-8-sig") as src_file:
        text = src_file.readlines()

    word2index = read_vocab(vocab_file)

    print("length of word_dict: " + str(len(word2index)))

    counts = csc_matrix((len(word2index), len(word2index)), dtype="float32")
    tmp_counts = dok_matrix((len(word2index), len(word2index)),
                            dtype="float32")
    times = 0
    for i in range(len(text)):
        word, context, count = text[i].strip().split()
        tmp_counts[word2index[word], word2index[context]] = int(count)
        times += 1
        if times == st.UPDATE_THRESHOLD:
            counts = counts + tmp_counts.tocsc()
            tmp_counts = dok_matrix((len(word2index), len(word2index)),
                                    dtype="float32")
            times = 0
    counts = counts + tmp_counts.tocsc()
    # compute PMI: log(count * total / (row_sum * col_sum))
    sum_r = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]

    sum_total = sum_c.sum()
    sum_r = np.reciprocal(sum_r)
    sum_c = np.reciprocal(sum_c)

    pmi = csc_matrix(counts)

    normalizer = dok_matrix((len(sum_r), len(sum_r)))
    normalizer.setdiag(sum_r)
    pmi = normalizer.tocsc().dot(pmi)

    normalizer = dok_matrix((len(sum_c), len(sum_c)))
    normalizer.setdiag(sum_c)
    pmi = pmi.dot(normalizer.tocsc())

    pmi = pmi * sum_total
    pmi.data = np.log(pmi.data)

    if PPMI:
        # clip negative PMI values to zero; only stored entries can be
        # negative, so operating on .data avoids a costly sparse assignment
        pmi.data[pmi.data < 0] = 0

    I = eye(pmi.shape[0], format="csc")
    print("start svd")
    start = time.time()
    # note: the standard sparsesvd package has signature
    # sparsesvd(csc_matrix, rank); this three-argument call (with the identity
    # matrix I) appears to be a repo-local variant
    ut, s = sparsesvd(pmi, I, st.VECTOR_LENGTH)[:2]

    if PPMI:
        for i in range(len(s)):
            ut[i, :] *= np.sqrt(s[i])
    else:
        for i in range(len(s)):
            ut[i, :] *= s[i]

    print(time.time() - start)
    return ut.T, word2index
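The diagonal-matrix trick above is an elementwise PMI computation in disguise: scaling rows by 1/row_sum, columns by 1/col_sum, and the whole matrix by the grand total turns count(w, c) into count(w, c) * total / (row_sum(w) * col_sum(c)), whose log is the PMI. A dense miniature to check the arithmetic (illustration only, not part of the repo):

import numpy as np

counts = np.array([[4.0, 0.0],
                   [2.0, 2.0]])
total = counts.sum()                      # 8
row = counts.sum(axis=1, keepdims=True)   # [[4], [4]]
col = counts.sum(axis=0, keepdims=True)   # [[6, 2]]
with np.errstate(divide="ignore"):        # log(0) -> -inf, clipped below
    pmi = np.log(counts * total / (row * col))
ppmi = np.maximum(pmi, 0.0)               # clip negatives, as in the PPMI branch
print(ppmi)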
Example #4
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab):
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec,
                                                                  binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec,
                                                                  binary=False)

    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)

    align(src_word2index)
    align(trg_word2index)

    trg_index2word = {trg_word2index[word]: word for word in trg_word2index}

    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))

    print("form the target language matrix")
    for word in trg_word2index:
        trg_matrix[:, trg_word2index[word]] = trg_vectors[word]

    # unit-normalize every word vector: axis=0 treats each column as one vector
    trg_vec_length = np.reciprocal(LA.norm(trg_matrix, axis=0))
    for i in range(len(trg_vec_length)):
        trg_matrix[:, i] *= trg_vec_length[i]

    words = ['abdomen', 'magnesium', 'ammonia', 'yuwen', 'everton']
    for word in words:
        if word not in src_word2index:
            continue
        # columns of trg_matrix are unit-normalized, so dividing the dot
        # products by the source vector's norm yields cosine similarities
        a = LA.norm(src_vectors[word])
        sim_mat = src_vectors[word].dot(trg_matrix)
        indices = np.argpartition(sim_mat, -100)[-100:]
        print(word)
        for idx in indices:
            print(trg_index2word[idx], end=" ")
        print()
        for idx in indices:
            print(sim_mat[idx] / a, end=" ")
        print()
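Note that np.argpartition only guarantees that the returned 100 indices carry the largest similarities; it does not order them, which is why the neighbors above print in arbitrary order. Sorting them best-first takes one extra line (illustration only, reusing Example #4's variables):

        top = indices[np.argsort(-sim_mat[indices])]  # highest similarity first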
Example #5
def compute_X(src_file, trg_file, src_vocab, trg_vocab, output_file):

    print("read word2index")
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)
    total_number = len(src_word2index) + len(trg_word2index)

    print("form matrix X")
    counts = csc_matrix((total_number, total_number), dtype="float32")
    with open(src_file, "r", encoding="UTF-8-sig") as f:
        counts = form_matrix(f, src_word2index, counts)
    with open(trg_file, "r", encoding="UTF-8-sig") as f:
        counts = form_matrix(f, trg_word2index, counts)
    # compute PMI: log(count * total / (row_sum * col_sum))
    print("compute pmi")
    sum_r = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]

    sum_total = sum_c.sum()
    sum_r = np.reciprocal(sum_r)
    sum_c = np.reciprocal(sum_c)

    pmi = csc_matrix(counts)
    print("divided by marginal sum")
    normalizer = dok_matrix((len(sum_r), len(sum_r)))
    normalizer.setdiag(sum_r)
    pmi = normalizer.tocsc().dot(pmi)

    normalizer = dok_matrix((len(sum_c), len(sum_c)))
    normalizer.setdiag(sum_c)
    pmi = pmi.dot(normalizer.tocsc())

    print("multiply total sum")
    pmi = pmi * sum_total
    pmi.data = np.log(pmi.data)

    # clip negative PMI values to zero (PPMI); only stored entries can be
    # negative, so operating on .data avoids a costly sparse assignment
    pmi.data[pmi.data < 0] = 0

    save_matrix(output_file, pmi)
Example #6
def svd2vec(src_vocab, trg_vocab, svd_path, src_vec, trg_vec):
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)

    u = np.loadtxt(svd_path + "-U", dtype="float32")
    s = np.loadtxt(svd_path + "-s", dtype="float32")
    # v = np.loadtxt(svd_path + "-V", dtype="float32")

    for i in range(len(s)):
        u[:, i] *= np.sqrt(s[i])

    top = 40000
    print("output vectors")
    length = min(st.VECTOR_LENGTH, u.shape[1])
    output = open(src_vec, "w", encoding="utf-8")
    output.write(str(top) + " " + str(length) + "\n")
    for word in src_word2index:
        if src_word2index[word] >= top:
            continue
        vector = u[src_word2index[word]]
        output.write(word)
        for i in range(length):
            output.write(" %.8f" % vector[i])
        output.write("\n")
    output.close()

    # target indices are offset by the source vocabulary size
    total_top = top + len(src_word2index)
    output = open(trg_vec, "w", encoding="utf-8")
    output.write(str(top) + " " + str(length) + "\n")
    for word in trg_word2index:
        if trg_word2index[word] >= total_top:
            continue
        vector = u[trg_word2index[word]]
        output.write(word)
        for i in range(length):
            output.write(" %.8f" % vector[i])
        output.write("\n")
    output.close()
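svd2vec writes the standard word2vec text format (a "count dimension" header line followed by one "word v1 ... vn" line per word), so the result can be loaded back with gensim as a sanity check (sketch; src_vec is the path the function just wrote):

vecs = gensim.models.KeyedVectors.load_word2vec_format(src_vec, binary=False)
print(vecs.vector_size, len(vecs.key_to_index))  # key_to_index is gensim >= 4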
Example #7
from corpus2vocab import read_vocab
if __name__ == "__main__":
    vocab_path = "tempdata/vocab/F10-W5.1en"
    input_file = open("../monolingual/vector/F10-W5.1en",
                      "r",
                      encoding="utf-8")
    output_file = open("../monolingual/vector/F10-W5.1en-40k",
                       "w",
                       encoding="utf-8")
    word2index = read_vocab(vocab_path)
    output_file.write("40000 " + input_file.readline().split()[1] + "\n")
    i = 0
    while i < 40000:
        line = input_file.readline()
        if not line:
            break  # stop early if the file has fewer than 40000 vocabulary words
        word = line.split()[0]
        if word in word2index:
            output_file.write(line)
            i += 1
    output_file.close()
    input_file.close()
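Example #7 imports read_vocab from corpus2vocab, which is not shown. A minimal sketch of such a vocabulary reader, assuming one entry per line with the line number as the index (the repo's actual file format may differ):

def read_vocab(vocab_path):
    # map each word to its line number in the vocabulary file
    word2index = {}
    with open(vocab_path, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            word2index[line.strip().split()[0]] = index
    return word2index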
Example #8
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file,
              TOP_TRANS):
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec,
                                                                  binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec,
                                                                  binary=False)

    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    length = len(src_word2index)
    trg_word2index = {
        word: trg_word2index[word] - length
        for word in trg_word2index
    }

    src_index2word = {src_word2index[word]: word for word in src_word2index}
    trg_index2word = {trg_word2index[word]: word for word in trg_word2index}

    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word in trg_word2index:
        trg_matrix[:, trg_word2index[word]] = trg_vectors[word]

    # unit-normalize every word vector: axis=0 treats each column as one vector
    trg_vec_length = np.reciprocal(LA.norm(trg_matrix, axis=0))
    for i in range(len(trg_vec_length)):
        trg_matrix[:, i] *= trg_vec_length[i]

    print("form the source language matrix")
    for word in src_word2index:
        src_matrix[src_word2index[word], :] = src_vectors[word]

    # unit-normalize every word vector: axis=1 treats each row as one vector
    src_vec_length = np.reciprocal(LA.norm(src_matrix, axis=1))
    for i in range(len(src_vec_length)):
        src_matrix[i, :] *= src_vec_length[i]

    print("finding translation pairs")
    batch = 1000
    output = open(output_file, "w", encoding="utf-8")

    # backward kNN penalty for CSLS: for every target word, the mean cosine
    # similarity of its 10 nearest source words
    knn_sim_bwd = np.zeros(trg_matrix.shape[1])
    for i in range(0, trg_matrix.shape[1], batch):
        j = min(i + batch, trg_matrix.shape[1])
        knn_sim_bwd[i:j] = topk_mean(trg_matrix.T[i:j].dot(src_matrix.T),
                                     k=10,
                                     inplace=True)

    for i in range(0, src_matrix.shape[0], batch):
        print("batch", i / batch)
        temp = src_matrix[i:min(i + batch, src_matrix.shape[0]), :]
        similarity = 2 * np.dot(temp, trg_matrix) - knn_sim_bwd
        index = np.argpartition(similarity, -TOP_TRANS, axis=1)[:, -TOP_TRANS:]
        for j in range(0, temp.shape[0]):
            total = 0
            for k in range(TOP_TRANS):
                total += similarity[j, index[j, k]]
            for k in range(TOP_TRANS):
                output.write(src_index2word[i + j] + " " +
                             trg_index2word[index[j, k]] + " " +
                             str(similarity[j, index[j, k]] / total) + "\n")
    output.close()
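The scoring in Example #8 is essentially CSLS (cross-domain similarity local scaling) retrieval: score(x, y) = 2*cos(x, y) - r_T(y), where r_T(y) is knn_sim_bwd, the mean cosine of target word y's 10 nearest source neighbors. Subtracting this hubness penalty demotes target words that are close to everything. Full CSLS also subtracts a source-side term r_S(x), but that term is constant within a row and leaves the per-source ranking unchanged, so dropping it is safe here. For a single source row the computation reduces to (illustration only):

    csls = 2 * src_matrix[0].dot(trg_matrix) - knn_sim_bwd  # one row of the batch above
    best = np.argpartition(csls, -TOP_TRANS)[-TOP_TRANS:]   # indices of the top candidates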
Example #9
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file,
              TOP_TRANS):
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec,
                                                                  binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec,
                                                                  binary=False)

    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    length = len(src_word2index)
    trg_word2index = {
        word: trg_word2index[word] - length
        for word in trg_word2index
    }

    src_index2word = {src_word2index[word]: word for word in src_word2index}
    trg_index2word = {trg_word2index[word]: word for word in trg_word2index}

    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word in trg_word2index:
        trg_matrix[:, trg_word2index[word]] = trg_vectors[word]

    # unit-normalize every word vector: axis=0 treats each column as one vector
    trg_vec_length = np.reciprocal(LA.norm(trg_matrix, axis=0))
    for i in range(len(trg_vec_length)):
        trg_matrix[:, i] *= trg_vec_length[i]

    print("form the source language matrix")
    for word in src_word2index:
        src_matrix[src_word2index[word], :] = src_vectors[word]

    # unit-normalize every word vector: axis=1 treats each row as one vector
    src_vec_length = np.reciprocal(LA.norm(src_matrix, axis=1))
    for i in range(len(src_vec_length)):
        src_matrix[i, :] *= src_vec_length[i]

    print("finding translation pairs")
    batch = 1000
    output = open(output_file, "w", encoding="utf-8")
    for i in range(int(len(src_word2index) / batch)):
        print("batch", i)
        temp = src_matrix[i * batch:i * batch + batch, :]
        result = np.dot(temp, trg_matrix)
        index = np.argpartition(result, -TOP_TRANS, axis=1)[:, -TOP_TRANS:]
        base = i * batch
        for j in range(0, batch):
            sum = 0
            for k in range(TOP_TRANS):
                sum += result[j, index[j, k]]
            for k in range(TOP_TRANS):
                output.write(src_index2word[base + j] + " " +
                             trg_index2word[index[j, k]] + " " +
                             str(result[j, index[j, k]] / sum) + "\n")

    i = int(len(src_word2index) / batch)
    temp = src_matrix[i * batch:len(src_word2index), :]
    result = np.dot(temp, trg_matrix)
    index = np.argpartition(result, -TOP_TRANS, axis=1)[:, -TOP_TRANS:]
    base = i * batch
    for j in range(0, temp.shape[0]):
        sum = 0
        for k in range(TOP_TRANS):
            sum += result[j, index[j, k]]
        for k in range(TOP_TRANS):
            output.write(src_index2word[base + j] + " " +
                         trg_index2word[index[j, k]] + " " +
                         str(result[j, index[j, k]] / sum) + "\n")
    output.close()
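Each line of the resulting pairs file has the form "source_word target_word weight", where the TOP_TRANS weights emitted for one source word are its top similarity scores normalized to sum to 1.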