def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file):
    """Write, for every source word, a top-k mean similarity to the target words.

    Both embedding matrices are length-normalised, so the dot products passed
    to ``topk_mean`` are cosine similarities.  One ``word,value`` line is
    written per source word, in index order.

    Args:
        src_mono_vec: Path of the source embeddings (word2vec text format).
        trg_mono_vec: Path of the target embeddings (word2vec text format).
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        output_file: Path of the UTF-8 text file to write.

    Raises:
        KeyError: If a vocabulary word has no embedding, or if the source
            indices are not exactly ``0 .. len(vocab) - 1`` after ``align``.
    """
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec, binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec, binary=False)
    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    # align() is called for its effect on the mapping only; presumably it
    # re-bases the indices so they can be used as 0-based positions -- TODO confirm.
    align(src_word2index)
    align(trg_word2index)
    src_index2word = {index: word for word, index in src_word2index.items()}

    # Target words are stored as columns, source words as rows, so that
    # src_matrix.dot(trg_matrix) is the full source-by-target similarity table.
    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word, index in trg_word2index.items():
        trg_matrix[:, index] = trg_vectors[word]
    # Scale every column to unit L2 norm (axis=0 treats each column as a vector).
    trg_matrix *= np.reciprocal(LA.norm(trg_matrix, axis=0))

    print("form the source language matrix")
    for word, index in src_word2index.items():
        src_matrix[index, :] = src_vectors[word]
    # Scale every row to unit L2 norm.
    src_matrix *= np.reciprocal(LA.norm(src_matrix, axis=1))[:, np.newaxis]

    print("finding translation pairs")
    batch = 1000
    src_count = src_matrix.shape[0]  # number of source-language words
    knn_sim_bwd = np.zeros(src_count)
    # Work in slices of `batch` rows so only a (batch x targets) block of the
    # similarity table is materialised at any time.
    for i in range(0, src_count, batch):
        print("batch", i)
        j = min(i + batch, src_count)
        # The dot product is a fresh array, so letting topk_mean work in place
        # cannot corrupt src_matrix or trg_matrix.
        knn_sim_bwd[i:j] = topk_mean(src_matrix[i:j].dot(trg_matrix), k=100, inplace=True)

    # The context manager closes the file even if a lookup or write fails.
    with open(output_file, "w", encoding="utf-8") as output:
        for i in range(src_count):
            output.write(src_index2word[i] + "," + str(knn_sim_bwd[i]) + "\n")
def compute_D(pairs_file, src_vocab, trg_vocab, output_file):
    """Build the matrix D from a pairs file and save it.

    Args:
        pairs_file: Text file (UTF-8, optional BOM); every line is split on
            whitespace and the resulting token lists are given to
            ``form_matrix``.
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        output_file: Destination handed to ``save_matrix``.
    """
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)
    # D is square over the combined source + target vocabulary.
    length = len(src_word2index) + len(trg_word2index)
    D = csc_matrix((length, length), dtype="float32")
    # Context manager: the handle is released even if parsing raises.
    with open(pairs_file, "r", encoding="UTF-8-sig") as pairs_input:
        pairs = [line.strip().split() for line in pairs_input]
    D = form_matrix(pairs, src_word2index, trg_word2index, D)
    save_matrix(output_file, D)
def cnt2svd(count_file, vocab_file, PPMI):
    """Turn a co-occurrence count file into SVD-based word vectors.

    Args:
        count_file: Text file (UTF-8, optional BOM) whose lines split on
            whitespace into exactly ``word context count``.
        vocab_file: Path handed to ``read_vocab``; the result maps both words
            and contexts to matrix indices.
        PPMI: If truthy, negative PMI entries are set to 0 and the factors are
            scaled by the square root of the singular values; otherwise the
            factors are scaled by the singular values themselves.

    Returns:
        Tuple ``(ut.T, word2index)``: the transposed, scaled first factor
        returned by ``sparsesvd`` and the vocabulary mapping.
    """
    with open(count_file, "r", encoding="UTF-8-sig") as src_file:
        text = src_file.readlines()
    word2index = read_vocab(vocab_file)
    print("length of word_dict: " + str(len(word2index)))

    def new_buffer():
        # Empty, cheaply writable staging matrix for one slice of the file.
        return dok_matrix((len(word2index), len(word2index)), dtype="float32")

    counts = csc_matrix((len(word2index), len(word2index)), dtype="float32")
    pending = new_buffer()
    buffered = 0
    for line in text:
        word, context, count = line.strip().split()
        pending[word2index[word], word2index[context]] = int(count)
        buffered += 1
        # Fold the staging matrix into the CSC total every UPDATE_THRESHOLD lines.
        if buffered == st.UPDATE_THRESHOLD:
            counts = counts + pending.tocsc()
            pending = new_buffer()
            buffered = 0
    # Fold in whatever is left after the last full slice.
    counts = counts + pending.tocsc()

    # calculate e^pmi: count * total / (row_sum * col_sum)
    row_sums = np.array(counts.sum(axis=1))[:, 0]
    col_sums = np.array(counts.sum(axis=0))[0, :]
    grand_total = col_sums.sum()
    inv_row_sums = np.reciprocal(row_sums)
    inv_col_sums = np.reciprocal(col_sums)

    pmi = csc_matrix(counts)
    # Left-multiplying by a diagonal matrix scales rows ...
    row_scaler = dok_matrix((len(inv_row_sums), len(inv_row_sums)))
    row_scaler.setdiag(inv_row_sums)
    pmi = row_scaler.tocsc().dot(pmi)
    # ... and right-multiplying scales columns.
    col_scaler = dok_matrix((len(inv_col_sums), len(inv_col_sums)))
    col_scaler.setdiag(inv_col_sums)
    pmi = pmi.dot(col_scaler.tocsc())
    pmi = pmi * grand_total
    # The logarithm is applied to the stored entries only.
    pmi.data = np.log(pmi.data)
    if PPMI:
        pmi[pmi < 0] = 0

    identity = eye(pmi.shape[0], format="csc")
    print("start svd")
    started = time.time()
    ut, s = sparsesvd(pmi, identity, st.VECTOR_LENGTH)[:2]
    for row, sigma in enumerate(s):
        if PPMI:
            ut[row, :] *= np.sqrt(sigma)
        else:
            ut[row, :] *= sigma
    print(time.time() - started)
    return ut.T, word2index
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab):
    """Print the 100 most similar target words for a few probe source words.

    For each probe word present in the source vocabulary, prints the word, a
    line with the 100 selected target words, and a line with their cosine
    similarities.  The selection comes from ``np.argpartition`` and is not
    sorted.

    Args:
        src_mono_vec: Path of the source embeddings (word2vec text format).
        trg_mono_vec: Path of the target embeddings (word2vec text format).
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
    """
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec, binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec, binary=False)
    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    # align() is called for its effect on the mapping only; presumably it
    # re-bases the indices to start at 0 -- TODO confirm.
    align(src_word2index)
    align(trg_word2index)
    trg_index2word = {column: token for token, column in trg_word2index.items()}

    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    print("form the target language matrix")
    for token, column in trg_word2index.items():
        trg_matrix[:, column] = trg_vectors[token]
    # Scale every column (one target word each) to unit L2 norm.
    inverse_norms = np.reciprocal(LA.norm(trg_matrix, axis=0))
    for column, inverse_norm in enumerate(inverse_norms):
        trg_matrix[:, column] *= inverse_norm

    probes = ['abdomen', 'magnesium', 'ammonia', 'yuwen', 'everton']
    for probe in probes:
        if probe in src_word2index:
            # The source vector is left unnormalised, so its norm is divided
            # out when printing to turn dot products into cosines.
            probe_norm = LA.norm(src_vectors[probe])
            sim_mat = src_vectors[probe].dot(trg_matrix)
            # Positions of the 100 largest similarities, in arbitrary order.
            indices = np.argpartition(sim_mat, -100)[-100:]
            print(probe)
            print("".join(str(trg_index2word[idx]) + " " for idx in indices))
            print("".join(str(sim_mat[idx] / probe_norm) + " " for idx in indices))
def compute_X(src_file, trg_file, src_vocab, trg_vocab, output_file):
    """Build the joint positive-PMI matrix X for both languages and save it.

    Counts from both files are accumulated by ``form_matrix`` into one square
    matrix over the combined vocabulary, converted to PMI, clipped at 0 and
    handed to ``save_matrix``.

    Args:
        src_file: Source-language count file (UTF-8, optional BOM).
        trg_file: Target-language count file (UTF-8, optional BOM).
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        output_file: Destination handed to ``save_matrix``.
    """
    print("read word2index")
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)
    total_number = len(src_word2index) + len(trg_word2index)

    print("form matrix X")
    counts = csc_matrix((total_number, total_number), dtype="float32")
    # Context managers close the handles once form_matrix returns.
    # NOTE(review): assumes form_matrix has finished reading the handle by the
    # time it returns -- confirm against its implementation.
    with open(src_file, "r", encoding="UTF-8-sig") as src_counts:
        counts = form_matrix(src_counts, src_word2index, counts)
    with open(trg_file, "r", encoding="UTF-8-sig") as trg_counts:
        counts = form_matrix(trg_counts, trg_word2index, counts)

    # calculate e^pmi: count * total / (row_sum * col_sum)
    print("compute pmi")
    sum_r = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]
    sum_total = sum_c.sum()
    sum_r = np.reciprocal(sum_r)
    sum_c = np.reciprocal(sum_c)
    pmi = csc_matrix(counts)

    print("divided by marginal sum")
    # Left-multiplying by a diagonal matrix scales rows ...
    normalizer = dok_matrix((len(sum_r), len(sum_r)))
    normalizer.setdiag(sum_r)
    pmi = normalizer.tocsc().dot(pmi)
    # ... and right-multiplying scales columns.
    normalizer = dok_matrix((len(sum_c), len(sum_c)))
    normalizer.setdiag(sum_c)
    pmi = pmi.dot(normalizer.tocsc())

    print("multiply total sum")
    pmi = pmi * sum_total
    # The logarithm is applied to the stored entries only.
    pmi.data = np.log(pmi.data)
    # Keep only positive associations (PPMI).
    pmi[pmi < 0] = 0
    save_matrix(output_file, pmi)
def _write_vectors(path, word2index, u, index_limit, length):
    """Write the rows of ``u`` for words with index below ``index_limit``.

    The file is in word2vec text format: a ``count dimension`` header, then
    one ``word v1 v2 ...`` line per word with eight decimals per value.
    """
    selected = [(word, index) for word, index in word2index.items() if index < index_limit]
    with open(path, "w", encoding="utf-8") as output:
        # The header carries the number of rows actually written, so the file
        # stays loadable when the vocabulary is smaller than the limit.
        output.write(str(len(selected)) + " " + str(length) + "\n")
        for word, index in selected:
            values = "".join(" %.8f" % value for value in u[index, :length])
            output.write(word + values + "\n")


def svd2vec(src_vocab, trg_vocab, svd_path, src_vec, trg_vec):
    """Export source and target word vectors from a stored SVD.

    Reads ``svd_path + "-U"`` and ``svd_path + "-s"`` with ``np.loadtxt``,
    scales column ``i`` of U by ``sqrt(s[i])`` and writes one word2vec text
    file per language.

    Args:
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        svd_path: Common prefix of the ``-U`` and ``-s`` files.
        src_vec: Output path for the source vectors.
        trg_vec: Output path for the target vectors.
    """
    src_word2index = read_vocab(src_vocab)
    trg_word2index = read_vocab(trg_vocab)
    u = np.loadtxt(svd_path + "-U", dtype="float32")
    s = np.loadtxt(svd_path + "-s", dtype="float32")
    for i, sigma in enumerate(s):
        u[:, i] *= np.sqrt(sigma)

    top = 40000  # per-language cap on exported words, applied to the word index
    print("output vectors")
    length = min(st.VECTOR_LENGTH, u.shape[1])
    _write_vectors(src_vec, src_word2index, u, top, length)
    # The target cut-off is shifted by the source vocabulary size; presumably
    # target indices follow the source ones in the joint matrix -- TODO confirm.
    _write_vectors(trg_vec, trg_word2index, u, top + len(src_word2index), length)
from corpus2vocab import read_vocab

if __name__ == "__main__":
    # Copy the first 40000 vectors whose word is in the vocabulary into a
    # smaller word2vec text file.
    top = 40000
    vocab_path = "tempdata/vocab/F10-W5.1en"
    word2index = read_vocab(vocab_path)

    kept = []
    with open("../monolingual/vector/F10-W5.1en", "r", encoding="utf-8") as input_file:
        # Header line is "<count> <dimension>"; only the dimension is reused.
        dimension = input_file.readline().split()[1]
        # Iterating the file ends cleanly at EOF, so an input with fewer than
        # `top` matching words no longer fails with IndexError.
        for line in input_file:
            fields = line.split()
            if fields and fields[0] in word2index:
                kept.append(line)
                if len(kept) == top:
                    break

    with open("../monolingual/vector/F10-W5.1en-40k", "w", encoding="utf-8") as output_file:
        # The header states the number of vectors actually written.
        output_file.write(str(len(kept)) + " " + dimension + "\n")
        output_file.writelines(kept)
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file, TOP_TRANS):
    """Write the TOP_TRANS best target candidates for every source word.

    Candidates are ranked by ``2 * cos(src, trg) - knn_sim_bwd[trg]``, where
    ``knn_sim_bwd`` is the ``topk_mean(..., k=10)`` of each target word's
    cosine similarities to all source words.  Each output line is
    ``source target weight``; the weight is the candidate's score divided by
    the sum of the TOP_TRANS selected scores for that source word.

    Args:
        src_mono_vec: Path of the source embeddings (word2vec text format).
        trg_mono_vec: Path of the target embeddings (word2vec text format).
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        output_file: Path of the UTF-8 text file to write.
        TOP_TRANS: Number of candidates kept per source word.

    Raises:
        KeyError: If a vocabulary word has no embedding or an index has no
            word in the vocabulary.
    """
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec, binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec, binary=False)
    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    # Shift target indices down by the source vocabulary size so they can be
    # used as 0-based column positions.
    length = len(src_word2index)
    trg_word2index = {word: index - length for word, index in trg_word2index.items()}
    src_index2word = {index: word for word, index in src_word2index.items()}
    trg_index2word = {index: word for word, index in trg_word2index.items()}

    # Target words are columns, source words are rows.
    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word, index in trg_word2index.items():
        trg_matrix[:, index] = trg_vectors[word]
    # Scale every column to unit L2 norm (axis=0 treats each column as a vector).
    trg_matrix *= np.reciprocal(LA.norm(trg_matrix, axis=0))

    print("form the source language matrix")
    for word, index in src_word2index.items():
        src_matrix[index, :] = src_vectors[word]
    # Scale every row to unit L2 norm.
    src_matrix *= np.reciprocal(LA.norm(src_matrix, axis=1))[:, np.newaxis]

    print("finding translation pairs")
    batch = 1000
    src_count = src_matrix.shape[0]
    trg_count = trg_matrix.shape[1]

    # Backward neighbourhood term: one value per target word.
    knn_sim_bwd = np.zeros(trg_count)
    for i in range(0, trg_count, batch):
        j = min(i + batch, trg_count)
        knn_sim_bwd[i:j] = topk_mean(trg_matrix.T[i:j].dot(src_matrix.T), k=10, inplace=True)

    # The context manager closes the file even if a lookup or write fails.
    with open(output_file, "w", encoding="utf-8") as output:
        for i in range(0, src_count, batch):
            print("batch", i / batch)
            temp = src_matrix[i:min(i + batch, src_count), :]
            similarity = 2 * np.dot(temp, trg_matrix) - knn_sim_bwd
            # Column positions of the TOP_TRANS largest scores per row (unsorted).
            index = np.argpartition(similarity, -TOP_TRANS, axis=1)[:, -TOP_TRANS:]
            for j in range(temp.shape[0]):
                scores = similarity[j, index[j]]
                # NOTE(review): these scores can be negative, so the
                # normaliser is not guaranteed to be positive.
                total = 0
                for score in scores:
                    total += score
                for column, score in zip(index[j], scores):
                    output.write(src_index2word[i + j] + " " +
                                 trg_index2word[column] + " " +
                                 str(score / total) + "\n")
def vec2pairs(src_mono_vec, trg_mono_vec, src_vocab, trg_vocab, output_file, TOP_TRANS):
    """Write the TOP_TRANS most similar target words for every source word.

    Candidates are ranked by cosine similarity.  Each output line is
    ``source target weight``; the weight is the candidate's similarity
    divided by the sum of the TOP_TRANS selected similarities for that
    source word.

    Args:
        src_mono_vec: Path of the source embeddings (word2vec text format).
        trg_mono_vec: Path of the target embeddings (word2vec text format).
        src_vocab: Path handed to ``read_vocab`` for the source vocabulary.
        trg_vocab: Path handed to ``read_vocab`` for the target vocabulary.
        output_file: Path of the UTF-8 text file to write.
        TOP_TRANS: Number of candidates kept per source word.

    Raises:
        KeyError: If a vocabulary word has no embedding or an index has no
            word in the vocabulary.
    """
    print("loading source vectors")
    src_vectors = gensim.models.KeyedVectors.load_word2vec_format(src_mono_vec, binary=False)
    print("loading target vectors")
    trg_vectors = gensim.models.KeyedVectors.load_word2vec_format(trg_mono_vec, binary=False)
    print("load src_word2index")
    src_word2index = read_vocab(src_vocab)
    print("load trg_word2index")
    trg_word2index = read_vocab(trg_vocab)
    # Shift target indices down by the source vocabulary size so they can be
    # used as 0-based column positions.
    length = len(src_word2index)
    trg_word2index = {word: index - length for word, index in trg_word2index.items()}
    src_index2word = {index: word for word, index in src_word2index.items()}
    trg_index2word = {index: word for word, index in trg_word2index.items()}

    # Target words are columns, source words are rows.
    trg_matrix = np.zeros((trg_vectors.vector_size, len(trg_word2index)))
    src_matrix = np.zeros((len(src_word2index), src_vectors.vector_size))

    print("form the target language matrix")
    for word, index in trg_word2index.items():
        trg_matrix[:, index] = trg_vectors[word]
    # Scale every column to unit L2 norm (axis=0 treats each column as a vector).
    trg_matrix *= np.reciprocal(LA.norm(trg_matrix, axis=0))

    print("form the source language matrix")
    for word, index in src_word2index.items():
        src_matrix[index, :] = src_vectors[word]
    # Scale every row to unit L2 norm.
    src_matrix *= np.reciprocal(LA.norm(src_matrix, axis=1))[:, np.newaxis]

    print("finding translation pairs")
    batch = 1000
    src_count = len(src_word2index)
    # The context manager closes the file even if a lookup or write fails.
    with open(output_file, "w", encoding="utf-8") as output:
        # One stepped loop covers the full batches and the final partial one;
        # slicing past the end simply yields the shorter remainder.
        for base in range(0, src_count, batch):
            print("batch", base // batch)
            temp = src_matrix[base:base + batch, :]
            result = np.dot(temp, trg_matrix)
            # Column positions of the TOP_TRANS largest similarities per row (unsorted).
            index = np.argpartition(result, -TOP_TRANS, axis=1)[:, -TOP_TRANS:]
            for j in range(temp.shape[0]):
                scores = result[j, index[j]]
                total = 0
                for score in scores:
                    total += score
                for column, score in zip(index[j], scores):
                    output.write(src_index2word[base + j] + " " +
                                 trg_index2word[column] + " " +
                                 str(score / total) + "\n")