def get_id_vector(word2vec_file_name=None):
    """Load word embeddings from a text file into a dict keyed by word id.

    Every line is split on single spaces; the first field is dropped and the
    remaining fields are parsed as floats. The line at 0-based position ``i``
    is stored under key ``i - 1``, so the first line of the file (presumably
    a header -- confirm against the file format) lands on key ``-1`` and is
    then overwritten by the EOS vector below.

    Args:
        word2vec_file_name: path of the embedding file; falls back to the
            module-level ``WORD2VEC_FILE_NAME`` when ``None``.

    Returns:
        dict mapping word id (int) to its embedding (list of float), plus
        the two special entries ``-1`` (EOS) and ``-2`` (UNK).
    """
    if word2vec_file_name is None:
        word2vec_file_name = WORD2VEC_FILE_NAME
    logging.info("read word embedding from {}".format(word2vec_file_name))

    # ``encoding`` is only passed when control.high_version(flag=True) says
    # so, which keeps the plain ``open`` call for the other interpreter.
    open_kwargs = {"encoding": "utf-8"} if control.high_version(flag=True) else {}

    # Read inside ``with`` so the file handle is closed deterministically.
    with open(word2vec_file_name, 'r', **open_kwargs) as embedding_file:
        id2vector = {index - 1: list(map(float, line.split(' ')[1:]))
                     for index, line in enumerate(embedding_file)}

    # EOS
    id2vector[-1] = [1.0] * 256
    # UNK
    id2vector[-2] = [0.0] * 256
    logging.info("exit get_id_vector")
    return id2vector
def read_from_input(input, word_vec=None, word_weights=None):
    """Split the first three columns of ``input`` into int32 id arrays.

    Each cell of ``input[:, 0]``, ``input[:, 1]`` and ``input[:, 2]`` is
    turned into text, split on single spaces and cast to ``np.int32``.
    When ``word_vec`` is given, every id array is additionally passed
    through ``get_embedding`` and wrapped in ``np.array``.

    Args:
        input: object supporting ``input[:, k]`` indexing (presumably a 2-D
            numpy array of space-separated id strings -- confirm with the
            caller).
        word_vec: optional embedding table forwarded to ``get_embedding``.
        word_weights: optional weights forwarded to ``get_embedding``.

    Returns:
        A 3-tuple with one array per input column.
    """
    if control.high_version():
        def to_tokens(cell):
            # Cells are decoded from UTF-8 bytes on this interpreter.
            return str(cell, encoding="utf-8").split(' ')
    else:
        def to_tokens(cell):
            return str(cell).split(' ')

    columns = []
    for column_index in range(3):
        tokens = [to_tokens(cell) for cell in input[:, column_index]]
        columns.append(np.array(tokens).astype(np.int32))

    if word_vec is not None:
        columns = [np.array(get_embedding(ids, word_vec, word_weights))
                   for ids in columns]

    first, second, third = columns
    return first, second, third
def get_word_vec(word2vec_file_name):
    """Load word vectors from a text file into a 2-D float32 numpy array.

    The first line of the file is skipped. For each remaining line the first
    space-separated field is dropped and the rest become one float32 row.
    Two extra rows are appended after the file rows: an all-ones row followed
    by an all-zeros row.

    :param word2vec_file_name: path of the embedding file
    :return: array of shape ``(number_of_rows, 128)`` whose i-th row is the
        vector read from the (i + 1)-th data line
    """
    logging.info("get_word_vecs {}".format(word2vec_file_name))

    # ``encoding`` is only passed when control.high_version(flag=True) says
    # so, which keeps the plain ``open`` call for the other interpreter.
    open_kwargs = {"encoding": "utf-8"} if control.high_version(flag=True) else {}

    # Read inside ``with`` so the file handle is closed deterministically.
    with open(word2vec_file_name, 'r', **open_kwargs) as embedding_file:
        id2vector = [np.array(line.split(' ')[1:], dtype=np.float32)
                     for line in islice(embedding_file, 1, None)]

    # NOTE: random-normal initialisation of these two rows was used
    # previously and is disabled; they are now constant ones / zeros.
    id2vector.append(np.ones([128], dtype=np.float32))
    id2vector.append(np.zeros([128], dtype=np.float32))

    # The reshape pins the vector width to 128; a file with another width
    # fails here.
    word_vec = np.array(id2vector, dtype=np.float32).reshape([len(id2vector), 128])
    logging.info("exit get_word_vecs {}".format(word_vec.shape))
    return word_vec
def get_weights():
    """Build a mapping from word id to term weight.

    ``WEIGHT_FILE`` is read as ``term/weight`` lines and ``WORD_ID_FILE`` as
    ``term id`` lines (space separated). Terms present in both files are
    joined so that the result is keyed by the integer id.

    Lines of ``WEIGHT_FILE`` that have no ``/`` or whose weight field is
    rejected by ``is_float`` are skipped, as are lines of ``WORD_ID_FILE``
    with fewer than two space-separated fields.

    :return: dict mapping word id (int) to weight (float), or ``None`` when
        ``WEIGHT_FILE`` does not exist
    """
    if not os.path.isfile(WEIGHT_FILE):
        logging.error("{} doesn't exist".format(WEIGHT_FILE))
        return None

    # ``encoding`` is only passed when control.high_version() says so, which
    # keeps the plain ``open`` call for the other interpreter.
    open_kwargs = {"encoding": "utf-8"} if control.high_version() else {}

    term_weights = {}
    with open(WEIGHT_FILE, 'r', **open_kwargs) as weight_file:
        for line in weight_file:
            fields = line.split('/')
            if len(fields) < 2:
                # e.g. a blank trailing line; indexing fields[1] would raise
                continue
            raw_weight = fields[1].replace('\n', '')
            if is_float(raw_weight):
                term_weights[fields[0]] = float(raw_weight)

    term_id = {}
    with open(WORD_ID_FILE, 'r', **open_kwargs) as id_file:
        for line in id_file:
            fields = line.split(' ')
            if len(fields) >= 2:
                term_id[fields[0]] = int(fields[1].replace('\n', ''))

    # Keep only the terms that have an id.
    return {term_id[term]: value
            for (term, value) in term_weights.items()
            if term in term_id}
def write_loss(file_name, loss):
    """Append ``str(loss)`` followed by a newline to ``file_name``.

    The parent directory is created (including intermediate directories)
    when it does not exist yet. A ``file_name`` without a directory part is
    written relative to the current working directory.

    Args:
        file_name: path of the loss log file; created if missing.
        loss: value to record; converted with ``str``.

    Returns:
        None
    """
    path = os.path.split(file_name)[0]
    # An empty ``path`` means "current directory": nothing to create, and
    # os.mkdir('') would raise.
    if path and not os.path.exists(path):
        os.makedirs(path)

    # No shell ``touch`` is needed: opening in append mode creates the file,
    # and it avoids building a shell command from ``file_name``.
    # ``encoding`` is only passed when control.high_version() says so.
    open_kwargs = {"encoding": "utf-8"} if control.high_version() else {}
    with open(file_name, 'a', **open_kwargs) as f:
        f.write(str(loss))
        f.write('\n')
# -*- coding: utf-8 -*-
import os
import sys

import matplotlib.pyplot as plt

import control

VERSION = sys.version.split(" ")[0]
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))

if __name__ == '__main__':
    # Read the recorded loss values and plot them against their position in
    # the file.
    file_name = ROOT_PATH + "/loss_dssm_l1/loss.txt"

    # NOTE(review): the two branches parse different line formats -- one
    # float per line versus comma-separated "epoch ... loss ... accuracy"
    # lines. Confirm which format loss.txt actually uses.
    if control.high_version():
        with open(file_name, 'r', encoding="utf-8") as fr:
            x = [float(line.replace("\n", "")) for line in fr]
    else:
        with open(file_name, 'r') as fr:
            x = [float(line.split(",")[2].split(" ")[2])
                 for line in fr
                 if all(key in line for key in ("epoch", "loss", "accuracy"))]

    fig, ax = plt.subplots()
    line1, = ax.plot(range(len(x)), x, '-', linewidth=2, label='loss')
    ax.legend(loc='upper right')
    plt.show()