def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    """Cluster the top-scoring directions and write the results to disk.

    Loads directions, their names and their scores, keeps the `amt_of_dirs`
    highest-scoring ones, clusters them (mean-shift or k-means), then writes
    two files: the per-cluster term names (dict_fn) and the per-cluster mean
    direction vectors (cluster_directions_fn).

    Parameters:
        directions_fn: path to a 2d array of direction vectors.
        scores_fn: path to a 1d float array, one score per direction.
        names_fn: path to a 1d string array, one name per direction.
        filename: base name used to build the two output paths.
        amt_of_dirs: number of top-scoring directions to keep.
        data_type: data-set folder name under ../data/.
        cluster_amt: number of clusters for the k-means branch.
        rewrite_files: when False, skip the task if outputs already exist.
        algorithm: "meanshift" selects meanShift(); anything else uses kMeans().

    Returns None; output is written as a side effect.
    """
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"
    # BUG FIX: previously only dict_fn was checked, so the task could be
    # skipped even when the cluster-directions output file was missing.
    all_fns = [dict_fn, cluster_directions_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)
    # np.asarray guards the fancy indexing below: dt.import*Array may return
    # plain Python lists, which cannot be indexed with an index array.
    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))
    # argsort is ascending; flipud turns it into descending score order,
    # so the slice keeps the amt_of_dirs highest-scoring directions.
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    # Group names and direction vectors by cluster label.
    unique, counts = np.unique(labels, return_counts=True)
    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    # Each cluster's direction is the mean of its member directions.
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))
    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")
    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
def nameClustersMeanDirection(cluster_directions):
    """Pick a single name for each cluster via GloVe word vectors.

    For every cluster ({centre term: [member terms]}) this looks up 50-d GloVe
    vectors for the centre and up to 10 members, averages them, and returns the
    vocabulary word whose vector is most similar to that mean (one word per
    cluster, in dict iteration order). Falls back to the centre term itself
    when no member vectors are found.

    NOTE(review): assumes cluster_directions is an (Ordered)dict of
    str -> list[str] — confirm against callers.
    """
    # Import the word vectors from Wikipedia
    file = open("../data/wikipedia/word_vectors/glove.6B.50d.txt", encoding="utf8")
    lines = file.readlines()
    word_vectors = []
    word_vector_names = []
    # Each GloVe line is "<word> <f1> ... <f50>": first token is the word,
    # the remainder are parsed as floats.
    for l in lines:
        l = l.split()
        word_vector_names.append(l[0])
        del l[0]
        for i in range(len(l)):
            l[i] = float(l[i])
        word_vectors.append(l)
    words = []
    # Mark members that duplicate the centre term, or that are a
    # singular/plural variant of another member, with the sentinel "DELETE".
    # The [:-1] comparisons treat a trailing character (e.g. plural "s") as a
    # variant of the same term.
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            # NOTE(review): this also compares value[v] against every other
            # member, so both halves of a variant pair can be marked.
            for val in reversed(value):
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    # Second pass actually removes the marked entries (reverse iteration so
    # deletion does not shift unprocessed indices).
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]
    for key, value in cluster_directions.items():
        cluster_word_vectors = []
        # Linear scan for the centre term's vector; "Failed" is only printed
        # when the scan reaches the last vocabulary entry without a match.
        for w in range(len(word_vector_names)):
            if word_vector_names[w].strip() == key.strip():
                cluster_word_vectors.append(word_vectors[w])
                print("Success", key)
                break
            if w == len(word_vector_names) - 1:
                print("Failed", key)
        # Same lookup for members; the v > 9 break limits lookups to the
        # first 10 members of each cluster.
        for v in range(len(value)):
            for w in range(len(word_vector_names)):
                if v > 9:
                    break
                if word_vector_names[w].strip() == value[v].strip():
                    cluster_word_vectors.append(word_vectors[w])
                    print("Success", value[v])
                    break
                if w == len(word_vector_names) - 1:
                    print("Failed", value[v])
        if len(cluster_word_vectors) > 0:
            # Name the cluster with the vocabulary word most similar to the
            # mean of the found vectors.
            mean_vector = dt.mean_of_array(cluster_word_vectors)
            print(mean_vector)
            print(cluster_word_vectors[0])
            h_sim = 0
            closest_word = ""
            for v in range(len(word_vectors)):
                sim = st.getSimilarity(word_vectors[v], mean_vector)
                if sim > h_sim:
                    print("New highest sim", word_vector_names[v])
                    h_sim = sim
                    closest_word = word_vector_names[v]
            print("Closest Word", closest_word)
            words.append(closest_word)
        else:
            # No vectors found at all: keep the original centre term.
            words.append(key)
    return words
def createTermClusters(hv_directions, lv_directions, hv_names, lv_names, amt_of_clusters, dont_cluster):
    """Greedily pick `amt_of_clusters` mutually dissimilar high-value directions
    as cluster centres, then (unless dont_cluster != 0) assign every remaining
    direction to its most similar centre and average each group.

    Parameters:
        hv_directions / hv_names: high-value candidate directions and names.
        lv_directions / lv_names: low-value directions and names (mutated:
            leftover high-value directions are prepended when clustering).
        amt_of_clusters: number of cluster centres to select.
        dont_cluster: 0 = full clustering; any other value skips assignment
            and returns the centres themselves as the cluster directions.

    Returns (cluster_directions, centre_names, {centre_name: [member names]},
    centre_directions).
    """
    least_similar_clusters = []
    least_similar_cluster_ids = []
    least_similar_cluster_names = []
    print("Overall amount of HV directions: ", len(hv_directions))
    # Create high-valued clusters
    # Seed with the first high-value direction, then repeatedly add the term
    # least similar to those already chosen (greedy farthest-point selection).
    least_similar_cluster_ids.append(0)
    least_similar_clusters.append(hv_directions[0])
    least_similar_cluster_names.append(hv_names[0])
    print("Least Similar Term", hv_names[0])
    hv_to_delete = [0]
    for i in range(len(hv_directions)):
        if i >= amt_of_clusters - 1:
            break
        else:
            ti = st.getNextClusterTerm(least_similar_clusters, hv_directions, least_similar_cluster_ids, 1)
            least_similar_cluster_ids.append(ti)
            least_similar_clusters.append(hv_directions[ti])
            least_similar_cluster_names.append(hv_names[ti])
            hv_to_delete.append(ti)
            print(str(i + 1) + "/" + str(amt_of_clusters), "Least Similar Term", hv_names[ti])
    # Add remaining high value directions to the low value direction list
    if dont_cluster == 0:
        hv_directions = np.asarray(hv_directions)
        hv_names = np.asarray(hv_names)
        # Remove the chosen centres, then prepend the leftovers so they are
        # assigned to clusters like any other low-value direction.
        hv_directions = np.delete(hv_directions, hv_to_delete, 0)
        hv_names = np.delete(hv_names, hv_to_delete, 0)
        for i in range(len(hv_directions)):
            lv_directions.insert(0, hv_directions[i])
            lv_names.insert(0, hv_names[i])
        # Initialize dictionaries for printing / visualizing
        cluster_name_dict = OrderedDict()
        for c in least_similar_cluster_names:
            cluster_name_dict[c] = []
        # For every low value direction, find the high value direction its most
        # similar to and append it to the directions. Each group starts with
        # its centre direction.
        every_cluster_direction = []
        for i in least_similar_clusters:
            every_cluster_direction.append([i])
        # Finding the most similar directions to each cluster_centre
        # Creating a dictionary of {cluster_centre: [cluster_direction(1), ..., cluster_direction(n)]} pairs
        for d in range(len(lv_directions)):
            i = st.getXMostSimilarIndex(lv_directions[d], least_similar_clusters, [], 1)[0]
            every_cluster_direction[i].append(lv_directions[d])
            print(str(d + 1) + "/" + str(len(lv_directions)), "Most Similar to", lv_names[d],
                  "Is", least_similar_cluster_names[i])
            cluster_name_dict[least_similar_cluster_names[i]].append(lv_names[d])
        # Mean of all directions = cluster direction
        cluster_directions = []
        for l in range(len(least_similar_clusters)):
            cluster_directions.append(dt.mean_of_array(every_cluster_direction[l]))
    else:
        # No clustering requested: empty membership lists, centres double as
        # the cluster directions.
        cluster_name_dict = OrderedDict()
        for c in least_similar_cluster_names:
            cluster_name_dict[c] = []
        every_cluster_direction = []
        for i in least_similar_clusters:
            every_cluster_direction.append([i])
        cluster_directions = least_similar_clusters
    return cluster_directions, least_similar_cluster_names, cluster_name_dict, least_similar_clusters
def nameClustersRemoveOutliers(cluster_directions):
    """Name each cluster by the GloVe word closest to the mean of its members'
    word vectors, after discarding outlier members.

    Like nameClustersMeanDirection, but first finds the cluster's medoid (the
    member vector with the lowest total cosine distance to the others) and
    drops members whose cosine distance to the medoid exceeds 0.8 before
    averaging. Falls back to the medoid's word when only one vector survives,
    and to the centre term when no member vectors are found.
    """
    # Import the word vectors from Wikipedia
    file = open("../data/wikipedia/word_vectors/glove.6B.50d.txt", encoding="utf8")
    lines = file.readlines()
    wv = []
    wvn = []
    # Create an array of word vectors from the text file
    # (line format: "<word> <f1> ... <f50>")
    for l in lines:
        l = l.split()
        wvn.append(l[0])
        del l[0]
        for i in range(len(l)):
            l[i] = float(l[i])
        wv.append(l)
    words = []
    # Mark members duplicating the centre term or another member (modulo a
    # trailing character, e.g. plural "s") with the sentinel "DELETE" ...
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            for val in reversed(value):
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    # ... then remove them (reverse iteration keeps indices valid).
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]
    # For every cluster (key: cluster center, value: similar terms)
    for key, value in cluster_directions.items():
        # If the center/values in the vector have a corresponding word vector, add the vectors to an array
        cluster_word_vectors = []
        cluster_word_vector_names = []
        for w in range(len(wvn)):
            if wvn[w].strip() == key.strip():
                cluster_word_vectors.append(wv[w])
                cluster_word_vector_names.append(wvn[w])
                print("Success", key)
                break
            if w == len(wvn) - 1:
                print("Failed", key)
        # Only the first 10 members are looked up (the v > 9 break).
        for v in range(len(value)):
            for w in range(len(wvn)):
                if v > 9:
                    break
                if wvn[w].strip() == value[v].strip():
                    cluster_word_vectors.append(wv[w])
                    cluster_word_vector_names.append(wvn[w])
                    print("Success", value[v])
                    break
                if w == len(wvn) - 1:
                    print("Failed", value[v])
        # If we found word vectors
        if len(cluster_word_vectors) > 1:
            # Get the angular distance between every word vector, and find the minimum angular distance point
            min_ang_dist = 214700000  # sentinel: larger than any attainable total distance
            min_index = None
            ang_dists = np.zeros([len(cluster_word_vectors), len(cluster_word_vectors)])
            for i in range(len(cluster_word_vectors)):
                total_dist = 0
                for j in range(len(cluster_word_vectors)):
                    dist = spatial.distance.cosine(cluster_word_vectors[i], cluster_word_vectors[j])
                    if ang_dists[i][j] == 0:
                        ang_dists[i][j] = dist
                    total_dist += dist
                if total_dist < min_ang_dist:
                    min_ang_dist = total_dist
                    min_index = i
                    print("New min word:", cluster_word_vector_names[min_index])
            medoid_wv = []
            medoid_wvn = []
            # Delete outliers: keep members within cosine distance 0.8 of the medoid.
            for i in range(len(cluster_word_vectors)):
                threshold = 0.8
                dist = spatial.distance.cosine(cluster_word_vectors[min_index], cluster_word_vectors[i])
                if dist < threshold:
                    medoid_wv.append(cluster_word_vectors[i])
                    medoid_wvn.append(cluster_word_vector_names[i])
                else:
                    print("Deleted outlier", cluster_word_vector_names[i])
            if len(medoid_wv) > 1:
                # Get the mean direction of non-outlier directions
                mean_vector = dt.mean_of_array(medoid_wv)
                # Find the most similar vector to that mean
                h_sim = 0
                closest_word = ""
                for v in range(len(wv)):
                    sim = st.getSimilarity(wv[v], mean_vector)
                    if sim > h_sim:
                        print("New highest sim", wvn[v])
                        h_sim = sim
                        closest_word = wvn[v]
                print("Closest Word", closest_word)
                words.append(closest_word)
            else:
                # Only the medoid survived: use its own word.
                words.append(medoid_wvn[0])
        else:
            # Zero or one vector found: keep the original centre term.
            words.append(key)
    return words
def nameClustersRemoveOutliersWeight(cluster_directions, weights_fn, is_gini):
    """Weighted variant of nameClustersRemoveOutliers.

    Same medoid/outlier pipeline, but before averaging, each surviving member
    vector is scaled by a per-phrase weight loaded from weights_fn (matched via
    the movies 200-phrase bag-of-words name list). When is_gini is truthy the
    weights are inverted (1 - w), since lower Gini means higher importance.

    NOTE(review): assumes phrases[w] carries a 6-character prefix before the
    word (the [6:] slice) — confirm against the names file format.
    """
    # Import the word vectors from Wikipedia
    weights = dt.import1dArray(weights_fn)
    weights = [float(w) for w in weights]
    phrases = dt.import1dArray("../data/movies/bow/names/200.txt")
    wv, wvn = dt.getWordVectors()
    words = []
    # Mark members duplicating the centre term or another member (modulo a
    # trailing character, e.g. plural "s") with the sentinel "DELETE" ...
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if key == value[v] or key == value[v][:-1] or key[:-1] == value[v]:
                print("deleted", value[v], key)
                value[v] = "DELETE"
            for val in reversed(value):
                if val == value[v][:-1] or val[:-1] == value[v]:
                    print("deleted", value[v], val)
                    value[v] = "DELETE"
    # ... then remove them (reverse iteration keeps indices valid).
    for key, value in cluster_directions.items():
        for v in range(len(value) - 1, -1, -1):
            if value[v] == "DELETE":
                del value[v]
    # For every cluster (key: cluster center, value: similar terms)
    for key, value in cluster_directions.items():
        # If the center/values in the vector have a corresponding word vector, add the vectors to an array
        cluster_word_vectors = []
        cluster_word_vector_names = []
        for w in range(len(wvn)):
            if wvn[w].strip() == key.strip():
                cluster_word_vectors.append(wv[w])
                cluster_word_vector_names.append(wvn[w])
                print("Success", key)
                break
            if w == len(wvn) - 1:
                print("Failed", key)
        # Only the first 10 members are looked up (the v > 9 break).
        for v in range(len(value)):
            for w in range(len(wvn)):
                if v > 9:
                    break
                if wvn[w].strip() == value[v].strip():
                    cluster_word_vectors.append(wv[w])
                    cluster_word_vector_names.append(wvn[w])
                    print("Success", value[v])
                    break
                if w == len(wvn) - 1:
                    print("Failed", value[v])
        # If we found word vectors
        if len(cluster_word_vectors) > 1:
            # Get the angular distance between every word vector, and find the minimum angular distance point
            min_ang_dist = 214700000  # sentinel: larger than any attainable total distance
            min_index = None
            ang_dists = np.zeros([len(cluster_word_vectors), len(cluster_word_vectors)])
            for i in range(len(cluster_word_vectors)):
                total_dist = 0
                for j in range(len(cluster_word_vectors)):
                    dist = spatial.distance.cosine(cluster_word_vectors[i], cluster_word_vectors[j])
                    if ang_dists[i][j] == 0:
                        ang_dists[i][j] = dist
                    total_dist += dist
                if total_dist < min_ang_dist:
                    min_ang_dist = total_dist
                    min_index = i
                    print("New min word:", cluster_word_vector_names[min_index])
            medoid_wv = []
            medoid_wvn = []
            # Delete outliers: keep members within cosine distance 0.8 of the medoid.
            for i in range(len(cluster_word_vectors)):
                threshold = 0.8
                dist = spatial.distance.cosine(cluster_word_vectors[min_index], cluster_word_vectors[i])
                if dist < threshold:
                    medoid_wv.append(cluster_word_vectors[i])
                    medoid_wvn.append(cluster_word_vector_names[i])
                else:
                    print("Deleted outlier", cluster_word_vector_names[i])
            if len(medoid_wv) > 1:
                # Map each surviving word back to its phrase index to fetch
                # its weight.
                # NOTE(review): if a word is missing from `phrases`, si (and
                # hence a_weights) ends up shorter than medoid_wv, and
                # a_weights[m] below would misalign or raise IndexError —
                # verify every medoid word appears in the phrase list.
                si = []
                for wvna in medoid_wvn:
                    for w in range(len(phrases)):
                        if phrases[w][6:] == wvna:
                            si.append(w)
                a_weights = []
                for s in si:
                    a_weights.append(weights[s])
                # Gini-style weights: smaller is better, so invert.
                if is_gini:
                    for s in range(len(a_weights)):
                        a_weights[s] = 1.0 - a_weights[s]
                # Scale each member vector by its weight (in place).
                for m in range(len(medoid_wv)):
                    for a in range(len(medoid_wv[m])):
                        medoid_wv[m][a] = medoid_wv[m][a] * a_weights[m]
                # Get the mean direction of non-outlier directions
                mean_vector = dt.mean_of_array(medoid_wv)
                # Find the most similar vector to that mean
                h_sim = 0
                closest_word = ""
                for v in range(len(wv)):
                    sim = st.getSimilarity(wv[v], mean_vector)
                    if sim > h_sim:
                        print("New highest sim", wvn[v])
                        h_sim = sim
                        closest_word = wvn[v]
                print("Closest Word", closest_word)
                words.append(closest_word)
            else:
                # Only the medoid survived: use its own word.
                words.append(medoid_wvn[0])
        else:
            # Zero or one vector found: keep the original centre term.
            words.append(key)
    return words