import codecs
import glob
from math import sqrt

import numpy as np

# Project-local modules; the CooccurMatrix import path is an assumption.
import cleansing
import vocabulary
from cooccur_matrix import CooccurMatrix


def learn_triplets_cooccur_mat(file_in, co_mat_file):
    # Load the learned similarity (co-occurrence) matrix and its vocabulary.
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix

    # Collect every NP1 word from the triplets file that is covered by the
    # similarity vocabulary. Lines starting with '<' are markup and skipped.
    np1_all = vocabulary.Vocabulary()
    with open(file_in, 'r') as f:
        for line in f:
            if line[0] != '<':
                line = line[:-2].lower()  # drop the trailing two chars (assumed '\r\n')
                triplets = line.split('|')
                np1 = cleansing.clean(triplets[0].split())
                # Delete words not in the similarity vocabulary.
                np1_new = [w for w in np1 if np_voc.contain(w)]
                for w in np1_new:
                    np1_all.add(w)

    # Build the sub-matrix of similarities restricted to the collected words.
    num_np1 = np1_all.size()
    similarity_mat_np1 = np.zeros([num_np1, num_np1])
    for i in range(num_np1):
        for j in range(num_np1):
            similarity_mat_np1[i, j] = np1_matrix[
                np_voc.get_word_index(np1_all.get_word(i)),
                np_voc.get_word_index(np1_all.get_word(j))]

    return CooccurMatrix(similarity_mat_np1, np1_all)
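# A minimal usage sketch for learn_triplets_cooccur_mat, assuming a triplets
# file of '|'-separated lines whose first field is the NP1 phrase, and a
# previously saved co-occurrence matrix; both file names below are
# hypothetical.
#
#   sub_mat = learn_triplets_cooccur_mat('../data/story1.txt',
#                                        '../mat/np1_co_mat')
#   print(sub_mat.vocabulary.size())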
def learn_story_distances(triplets_file_path, co_mat_file,
                          use_similarity=True, min_similarity=None,
                          output_file=False):
    # Load learned similarity matrix.
    learned_co_mat = CooccurMatrix()
    learned_co_mat.load(co_mat_file)
    np_voc = learned_co_mat.vocabulary
    np1_matrix = learned_co_mat.matrix
    if min_similarity is not None:
        # Zero out entries below the threshold (the original hard-coded 0.8
        # here, ignoring the min_similarity argument).
        np1_matrix = (np1_matrix >= min_similarity) * np1_matrix

    files = glob.glob(triplets_file_path)
    files.sort()
    print(len(files))  # number of story files found
    file_num = len(files)

    # Calculate a word histogram for each story.
    # learn_story_histogram is assumed to be defined elsewhere in this module.
    np_num = np_voc.size()
    hist = np.zeros([file_num, np_num])
    count = 0
    for file_in in files:
        (h, wordlist) = learn_story_histogram(file_in, np_voc)
        hist[count, :] = h
        count += 1

    # Calculate the pairwise distance between stories. With the similarity
    # matrix S this is the quadratic form sqrt(d S d^T) on the histogram
    # difference d; without it, plain Euclidean distance.
    dist = np.zeros([file_num, file_num])
    for i in range(file_num):
        for j in range(file_num):
            dif = hist[i, :] - hist[j, :]
            if use_similarity:
                sq = np.dot(np.dot(dif, np1_matrix), dif.T)
                if sq < 0:  # guard against small negatives from rounding
                    sq = 0
                dist[i, j] = sqrt(sq)
            else:
                dist[i, j] = sqrt(np.dot(dif, dif.T))

    # Story labels: file name without the directory or the extension.
    labels = [filename.split('/')[-1][:-4] for filename in files]

    if output_file:
        np.savetxt('../mat/histogram.txt', hist)
        np.savetxt('../mat/distance.txt', dist)
        with codecs.open('../mat/filename.txt', 'w',
                         encoding='ISO-8859-1') as f:
            for l in labels:
                f.write(l + '\n')

    return (dist, labels)
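# A minimal usage sketch for learn_story_distances, assuming the per-story
# triplets files match a glob pattern; the paths and the 0.8 threshold are
# illustrative only. Setting min_similarity prunes weak similarity entries
# before the distance computation.
#
#   (dist, labels) = learn_story_distances('../data/*.txt',
#                                          '../mat/np1_co_mat',
#                                          use_similarity=True,
#                                          min_similarity=0.8)
#   print(dist.shape, labels[:3])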